diff options
Diffstat (limited to 'lib/librte_eal')
68 files changed, 3862 insertions, 662 deletions
diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile index d27da3d1..bfeddaad 100644 --- a/lib/librte_eal/bsdapp/eal/Makefile +++ b/lib/librte_eal/bsdapp/eal/Makefile @@ -22,7 +22,7 @@ LDLIBS += -lrte_kvargs EXPORT_MAP := ../../rte_eal_version.map -LIBABIVER := 8 +LIBABIVER := 9 # specific to bsdapp exec-env SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) := eal.c @@ -62,10 +62,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_uuid.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += hotplug_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_option.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c @@ -77,11 +79,6 @@ SRCS-y += rte_cycles.c CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) -CFLAGS_eal.o := -D_GNU_SOURCE -#CFLAGS_eal_thread.o := -D_GNU_SOURCE -CFLAGS_eal_log.o := -D_GNU_SOURCE -CFLAGS_eal_common_log.o := -D_GNU_SOURCE - # workaround for a gcc bug with noreturn attribute # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c index d7ae9d68..508cbc46 100644 --- a/lib/librte_eal/bsdapp/eal/eal.c +++ b/lib/librte_eal/bsdapp/eal/eal.c @@ -42,6 +42,7 @@ #include <rte_devargs.h> #include <rte_version.h> #include <rte_vfio.h> +#include <rte_option.h> #include <rte_atomic.h> #include <malloc_heap.h> @@ -141,7 +142,7 @@ eal_create_runtime_dir(void) } const char * -eal_get_runtime_dir(void) +rte_eal_get_runtime_dir(void) { return runtime_dir; } @@ -414,12 +415,20 @@ eal_parse_args(int argc, char **argv) argvopt = argv; optind = 1; optreset = 1; + opterr = 0; while ((opt = getopt_long(argc, argvopt, eal_short_options, eal_long_options, &option_index)) != EOF) { - /* getopt is not happy, stop right now */ + /* + * getopt didn't recognise the option, lets parse the + * registered options to see if the flag is valid + */ if (opt == '?') { + ret = rte_option_parse(argv[optind-1]); + if (ret == 0) + continue; + eal_usage(prgname); ret = -1; goto out; @@ -502,6 +511,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg) { int *socket_id = arg; + if (msl->external) + return 0; + if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0) return 1; @@ -607,7 +619,7 @@ rte_eal_init(int argc, char **argv) internal_config.legacy_mem = true; if (eal_plugins_init() < 0) { - rte_eal_init_alert("Cannot init plugins\n"); + rte_eal_init_alert("Cannot init plugins"); rte_errno = EINVAL; rte_atomic32_clear(&run_once); return -1; @@ -622,7 +634,7 @@ rte_eal_init(int argc, char **argv) rte_config_init(); if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); return -1; } @@ -630,7 +642,7 @@ rte_eal_init(int argc, char **argv) * bus through mp channel in the secondary process before the bus scan. */ if (rte_mp_channel_init() < 0) { - rte_eal_init_alert("failed to init mp channel\n"); + rte_eal_init_alert("failed to init mp channel"); if (rte_eal_process_type() == RTE_PROC_PRIMARY) { rte_errno = EFAULT; return -1; @@ -638,14 +650,21 @@ rte_eal_init(int argc, char **argv) } if (rte_bus_scan()) { - rte_eal_init_alert("Cannot scan the buses for devices\n"); + rte_eal_init_alert("Cannot scan the buses for devices"); rte_errno = ENODEV; rte_atomic32_clear(&run_once); return -1; } - /* autodetect the iova mapping mode (default is iova_pa) */ - rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class(); + /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ + rte_eal_get_configuration()->iova_mode = + rte_bus_get_iommu_class(); + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; + } if (internal_config.no_hugetlbfs == 0) { /* rte_config isn't initialized yet */ @@ -685,37 +704,37 @@ rte_eal_init(int argc, char **argv) * initialize memzones first. */ if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone\n"); + rte_eal_init_alert("Cannot init memzone"); rte_errno = ENODEV; return -1; } if (rte_eal_memory_init() < 0) { - rte_eal_init_alert("Cannot init memory\n"); + rte_eal_init_alert("Cannot init memory"); rte_errno = ENOMEM; return -1; } if (rte_eal_malloc_heap_init() < 0) { - rte_eal_init_alert("Cannot init malloc heap\n"); + rte_eal_init_alert("Cannot init malloc heap"); rte_errno = ENODEV; return -1; } if (rte_eal_tailqs_init() < 0) { - rte_eal_init_alert("Cannot init tail queues for objects\n"); + rte_eal_init_alert("Cannot init tail queues for objects"); rte_errno = EFAULT; return -1; } if (rte_eal_alarm_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); /* rte_eal_alarm_init sets rte_errno on failure. */ return -1; } if (rte_eal_timer_init() < 0) { - rte_eal_init_alert("Cannot init HPET or TSC timers\n"); + rte_eal_init_alert("Cannot init HPET or TSC timers"); rte_errno = ENOTSUP; return -1; } @@ -765,14 +784,14 @@ rte_eal_init(int argc, char **argv) /* initialize services so vdevs register service during bus_probe. */ ret = rte_service_init(); if (ret) { - rte_eal_init_alert("rte_service_init() failed\n"); + rte_eal_init_alert("rte_service_init() failed"); rte_errno = ENOEXEC; return -1; } /* Probe all the buses and devices/drivers on them */ if (rte_bus_probe()) { - rte_eal_init_alert("Cannot probe devices\n"); + rte_eal_init_alert("Cannot probe devices"); rte_errno = ENOTSUP; return -1; } @@ -788,6 +807,9 @@ rte_eal_init(int argc, char **argv) rte_eal_mcfg_complete(); + /* Call each registered callback, if enabled */ + rte_option_init(); + return fctret; } diff --git a/lib/librte_eal/bsdapp/eal/eal_dev.c b/lib/librte_eal/bsdapp/eal/eal_dev.c index 1c6c51bd..255d611e 100644 --- a/lib/librte_eal/bsdapp/eal/eal_dev.c +++ b/lib/librte_eal/bsdapp/eal/eal_dev.c @@ -19,3 +19,17 @@ rte_dev_event_monitor_stop(void) RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); return -1; } + +int __rte_experimental +rte_dev_hotplug_handle_enable(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int __rte_experimental +rte_dev_hotplug_handle_disable(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c index f7f07abd..a5847f0b 100644 --- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c +++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c @@ -4,6 +4,7 @@ #include <inttypes.h> +#include <rte_errno.h> #include <rte_log.h> #include <rte_memory.h> @@ -48,6 +49,26 @@ eal_memalloc_sync_with_primary(void) } int +eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused, + int fd __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused, + int seg_idx __rte_unused, size_t *offset __rte_unused) +{ + return -ENOTSUP; +} + +int eal_memalloc_init(void) { return 0; diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c index 16d2bc7c..4b092e1f 100644 --- a/lib/librte_eal/bsdapp/eal/eal_memory.c +++ b/lib/librte_eal/bsdapp/eal/eal_memory.c @@ -79,6 +79,7 @@ rte_eal_hugepage_init(void) } msl->base_va = addr; msl->page_sz = page_sz; + msl->len = internal_config.memory; msl->socket_id = 0; /* populate memsegs. each memseg is 1 page long */ @@ -235,12 +236,15 @@ struct attach_walk_args { int seg_idx; }; static int -attach_segment(const struct rte_memseg_list *msl __rte_unused, - const struct rte_memseg *ms, void *arg) +attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) { struct attach_walk_args *wa = arg; void *addr; + if (msl->external) + return 0; + addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, wa->fd_hugepage, wa->seg_idx * EAL_PAGE_SIZE); @@ -370,6 +374,7 @@ alloc_va_space(struct rte_memseg_list *msl) return -1; } msl->base_va = addr; + msl->len = mem_sz; return 0; } diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile index cca68826..87d8c455 100644 --- a/lib/librte_eal/common/Makefile +++ b/lib/librte_eal/common/Makefile @@ -12,6 +12,7 @@ INC += rte_tailq.h rte_interrupts.h rte_alarm.h INC += rte_string_fns.h rte_version.h INC += rte_eal_memconfig.h rte_malloc_heap.h INC += rte_hexdump.h rte_devargs.h rte_bus.h rte_dev.h rte_class.h +INC += rte_option.h INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h INC += rte_malloc.h rte_keepalive.h rte_time.h INC += rte_service.h rte_service_component.h diff --git a/lib/librte_eal/common/arch/arm/meson.build b/lib/librte_eal/common/arch/arm/meson.build index c6bd9227..79731e1a 100644 --- a/lib/librte_eal/common/arch/arm/meson.build +++ b/lib/librte_eal/common/arch/arm/meson.build @@ -2,4 +2,4 @@ # Copyright(c) 2017 Intel Corporation. eal_common_arch_sources = files('rte_cpuflags.c', - 'rte_cycles.c') + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/arch/ppc_64/meson.build b/lib/librte_eal/common/arch/ppc_64/meson.build new file mode 100644 index 00000000..40b3dc53 --- /dev/null +++ b/lib/librte_eal/common/arch/ppc_64/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi <bluca@debian.org> + +eal_common_arch_sources = files('rte_cpuflags.c', + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/arch/x86/meson.build b/lib/librte_eal/common/arch/x86/meson.build index 4e0f7790..14bf204c 100644 --- a/lib/librte_eal/common/arch/x86/meson.build +++ b/lib/librte_eal/common/arch/x86/meson.build @@ -2,4 +2,4 @@ # Copyright(c) 2017 Intel Corporation eal_common_arch_sources = files('rte_spinlock.c', 'rte_cpuflags.c', - 'rte_cycles.c') + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c index 0943851c..c8f1901f 100644 --- a/lib/librte_eal/common/eal_common_bus.c +++ b/lib/librte_eal/common/eal_common_bus.c @@ -37,10 +37,11 @@ #include <rte_bus.h> #include <rte_debug.h> #include <rte_string_fns.h> +#include <rte_errno.h> #include "eal_private.h" -struct rte_bus_list rte_bus_list = +static struct rte_bus_list rte_bus_list = TAILQ_HEAD_INITIALIZER(rte_bus_list); void @@ -242,3 +243,45 @@ rte_bus_get_iommu_class(void) } return mode; } + +static int +bus_handle_sigbus(const struct rte_bus *bus, + const void *failure_addr) +{ + int ret; + + if (!bus->sigbus_handler) + return -1; + + ret = bus->sigbus_handler(failure_addr); + + /* find bus but handle failed, keep the errno be set. */ + if (ret < 0 && rte_errno == 0) + rte_errno = ENOTSUP; + + return ret > 0; +} + +int +rte_bus_sigbus_handler(const void *failure_addr) +{ + struct rte_bus *bus; + + int ret = 0; + int old_errno = rte_errno; + + rte_errno = 0; + + bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr); + /* can not find bus. */ + if (!bus) + return 1; + /* find bus but handle failed, pass on the new errno. */ + else if (rte_errno != 0) + return -1; + + /* restore the old errno. */ + rte_errno = old_errno; + + return ret; +} diff --git a/lib/librte_eal/common/eal_common_class.c b/lib/librte_eal/common/eal_common_class.c index 404a9065..d922266d 100644 --- a/lib/librte_eal/common/eal_common_class.c +++ b/lib/librte_eal/common/eal_common_class.c @@ -9,7 +9,7 @@ #include <rte_class.h> #include <rte_debug.h> -struct rte_class_list rte_class_list = +static struct rte_class_list rte_class_list = TAILQ_HEAD_INITIALIZER(rte_class_list); __rte_experimental void diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c index 678dbcac..62e9ed47 100644 --- a/lib/librte_eal/common/eal_common_dev.c +++ b/lib/librte_eal/common/eal_common_dev.c @@ -19,8 +19,10 @@ #include <rte_log.h> #include <rte_spinlock.h> #include <rte_malloc.h> +#include <rte_string_fns.h> #include "eal_private.h" +#include "hotplug_mp.h" /** * The device event callback description. @@ -74,119 +76,110 @@ static int cmp_dev_name(const struct rte_device *dev, const void *_name) return strcmp(dev->name, name); } -int rte_eal_dev_attach(const char *name, const char *devargs) +int __rte_experimental +rte_dev_is_probed(const struct rte_device *dev) { - struct rte_bus *bus; + /* The field driver should be set only when the probe is successful. */ + return dev->driver != NULL; +} - if (name == NULL || devargs == NULL) { - RTE_LOG(ERR, EAL, "Invalid device or arguments provided\n"); +/* helper function to build devargs, caller should free the memory */ +static int +build_devargs(const char *busname, const char *devname, + const char *drvargs, char **devargs) +{ + int length; + + length = snprintf(NULL, 0, "%s:%s,%s", busname, devname, drvargs); + if (length < 0) return -EINVAL; - } - bus = rte_bus_find_by_device_name(name); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Unable to find a bus for the device '%s'\n", - name); + *devargs = malloc(length + 1); + if (*devargs == NULL) + return -ENOMEM; + + length = snprintf(*devargs, length + 1, "%s:%s,%s", + busname, devname, drvargs); + if (length < 0) { + free(*devargs); return -EINVAL; } - if (strcmp(bus->name, "pci") == 0 || strcmp(bus->name, "vdev") == 0) - return rte_eal_hotplug_add(bus->name, name, devargs); - - RTE_LOG(ERR, EAL, - "Device attach is only supported for PCI and vdev devices.\n"); - return -ENOTSUP; + return 0; } -int rte_eal_dev_detach(struct rte_device *dev) +int +rte_eal_hotplug_add(const char *busname, const char *devname, + const char *drvargs) { - struct rte_bus *bus; - int ret; - if (dev == NULL) { - RTE_LOG(ERR, EAL, "Invalid device provided.\n"); - return -EINVAL; - } + char *devargs; + int ret; - bus = rte_bus_find_by_device(dev); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n", - dev->name); - return -EINVAL; - } + ret = build_devargs(busname, devname, drvargs, &devargs); + if (ret != 0) + return ret; - if (bus->unplug == NULL) { - RTE_LOG(ERR, EAL, "Bus function not supported\n"); - return -ENOTSUP; - } + ret = rte_dev_probe(devargs); + free(devargs); - ret = bus->unplug(dev); - if (ret) - RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", - dev->name); return ret; } -int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, - const char *devargs) +/* probe device at local process. */ +int +local_dev_probe(const char *devargs, struct rte_device **new_dev) { - struct rte_bus *bus; struct rte_device *dev; struct rte_devargs *da; int ret; - bus = rte_bus_find_by_name(busname); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); - return -ENOENT; - } - - if (bus->plug == NULL) { - RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", - bus->name); - return -ENOTSUP; - } - + *new_dev = NULL; da = calloc(1, sizeof(*da)); if (da == NULL) return -ENOMEM; - ret = rte_devargs_parsef(da, "%s:%s,%s", - busname, devname, devargs); + ret = rte_devargs_parse(da, devargs); if (ret) goto err_devarg; + if (da->bus->plug == NULL) { + RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", + da->bus->name); + ret = -ENOTSUP; + goto err_devarg; + } + ret = rte_devargs_insert(da); if (ret) goto err_devarg; - ret = bus->scan(); + ret = da->bus->scan(); if (ret) goto err_devarg; - dev = bus->find_device(NULL, cmp_dev_name, devname); + dev = da->bus->find_device(NULL, cmp_dev_name, da->name); if (dev == NULL) { RTE_LOG(ERR, EAL, "Cannot find device (%s)\n", - devname); + da->name); ret = -ENODEV; goto err_devarg; } - if (dev->driver != NULL) { - RTE_LOG(ERR, EAL, "Device is already plugged\n"); - return -EEXIST; - } - - ret = bus->plug(dev); + ret = dev->bus->plug(dev); if (ret) { + if (rte_dev_is_probed(dev)) /* if already succeeded earlier */ + return ret; /* no rollback */ RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n", dev->name); goto err_devarg; } + + *new_dev = dev; return 0; err_devarg: - if (rte_devargs_remove(busname, devname)) { + if (rte_devargs_remove(da) != 0) { free(da->args); free(da); } @@ -194,40 +187,235 @@ err_devarg: } int __rte_experimental -rte_eal_hotplug_remove(const char *busname, const char *devname) +rte_dev_probe(const char *devargs) { - struct rte_bus *bus; + struct eal_dev_mp_req req; struct rte_device *dev; int ret; + memset(&req, 0, sizeof(req)); + req.t = EAL_DEV_REQ_TYPE_ATTACH; + strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN); + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + /** + * If in secondary process, just send IPC request to + * primary process. + */ + ret = eal_dev_hotplug_request_to_primary(&req); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug request to primary\n"); + return -ENOMSG; + } + if (req.result != 0) + RTE_LOG(ERR, EAL, + "Failed to hotplug add device\n"); + return req.result; + } + + /* attach a shared device from primary start from here: */ + + /* primary attach the new device itself. */ + ret = local_dev_probe(devargs, &dev); + + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to attach device on primary process\n"); + + /** + * it is possible that secondary process failed to attached a + * device that primary process have during initialization, + * so for -EEXIST case, we still need to sync with secondary + * process. + */ + if (ret != -EEXIST) + return ret; + } + + /* primary send attach sync request to secondary. */ + ret = eal_dev_hotplug_request_to_secondary(&req); + + /* if any communication error, we need to rollback. */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug add request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + /** + * if any secondary failed to attach, we need to consider if rollback + * is necessary. + */ + if (req.result != 0) { + RTE_LOG(ERR, EAL, + "Failed to attach device on secondary process\n"); + ret = req.result; + + /* for -EEXIST, we don't need to rollback. */ + if (ret == -EEXIST) + return ret; + goto rollback; + } + + return 0; + +rollback: + req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + + /* primary send rollback request to secondary. */ + if (eal_dev_hotplug_request_to_secondary(&req) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device attach on secondary." + "Devices in secondary may not sync with primary\n"); + + /* primary rollback itself. */ + if (local_dev_remove(dev) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device attach on primary." + "Devices in secondary may not sync with primary\n"); + + return ret; +} + +int +rte_eal_hotplug_remove(const char *busname, const char *devname) +{ + struct rte_device *dev; + struct rte_bus *bus; + bus = rte_bus_find_by_name(busname); if (bus == NULL) { RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); return -ENOENT; } - if (bus->unplug == NULL) { - RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", - bus->name); - return -ENOTSUP; - } - dev = bus->find_device(NULL, cmp_dev_name, devname); if (dev == NULL) { RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", devname); return -EINVAL; } - if (dev->driver == NULL) { - RTE_LOG(ERR, EAL, "Device is already unplugged\n"); - return -ENOENT; + return rte_dev_remove(dev); +} + +/* remove device at local process. */ +int +local_dev_remove(struct rte_device *dev) +{ + int ret; + + if (dev->bus->unplug == NULL) { + RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", + dev->bus->name); + return -ENOTSUP; } - ret = bus->unplug(dev); - if (ret) + ret = dev->bus->unplug(dev); + if (ret) { RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", dev->name); - rte_devargs_remove(busname, devname); + return ret; + } + + return 0; +} + +int __rte_experimental +rte_dev_remove(struct rte_device *dev) +{ + struct eal_dev_mp_req req; + char *devargs; + int ret; + + if (!rte_dev_is_probed(dev)) { + RTE_LOG(ERR, EAL, "Device is not probed\n"); + return -ENOENT; + } + + ret = build_devargs(dev->bus->name, dev->name, "", &devargs); + if (ret != 0) + return ret; + + memset(&req, 0, sizeof(req)); + req.t = EAL_DEV_REQ_TYPE_DETACH; + strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN); + free(devargs); + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + /** + * If in secondary process, just send IPC request to + * primary process. + */ + ret = eal_dev_hotplug_request_to_primary(&req); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug request to primary\n"); + return -ENOMSG; + } + if (req.result != 0) + RTE_LOG(ERR, EAL, + "Failed to hotplug remove device\n"); + return req.result; + } + + /* detach a device from primary start from here: */ + + /* primary send detach sync request to secondary */ + ret = eal_dev_hotplug_request_to_secondary(&req); + + /** + * if communication error, we need to rollback, because it is possible + * part of the secondary processes still detached it successfully. + */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send device detach request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + /** + * if any secondary failed to detach, we need to consider if rollback + * is necessary. + */ + if (req.result != 0) { + RTE_LOG(ERR, EAL, + "Failed to detach device on secondary process\n"); + ret = req.result; + /** + * if -ENOENT, we don't need to rollback, since devices is + * already detached on secondary process. + */ + if (ret != -ENOENT) + goto rollback; + } + + /* primary detach the device itself. */ + ret = local_dev_remove(dev); + + /* if primary failed, still need to consider if rollback is necessary */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to detach device on primary process\n"); + /* if -ENOENT, we don't need to rollback */ + if (ret == -ENOENT) + return ret; + goto rollback; + } + + return 0; + +rollback: + req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + + /* primary send rollback request to secondary. */ + if (eal_dev_hotplug_request_to_secondary(&req) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device detach on secondary." + "Devices in secondary may not sync with primary\n"); + return ret; } @@ -342,8 +530,9 @@ rte_dev_event_callback_unregister(const char *device_name, return ret; } -void -dev_callback_process(char *device_name, enum rte_dev_event_type event) +void __rte_experimental +rte_dev_event_callback_process(const char *device_name, + enum rte_dev_event_type event) { struct dev_event_callback *cb_lst; diff --git a/lib/librte_eal/common/eal_common_devargs.c b/lib/librte_eal/common/eal_common_devargs.c index dac2402a..b7b9cb69 100644 --- a/lib/librte_eal/common/eal_common_devargs.c +++ b/lib/librte_eal/common/eal_common_devargs.c @@ -4,9 +4,6 @@ /* This file manages the list of devices and their arguments, as given * by the user at startup - * - * Code here should not call rte_log since the EAL environment - * may not be initialized. */ #include <stdio.h> @@ -28,39 +25,9 @@ TAILQ_HEAD(rte_devargs_list, rte_devargs); /** Global list of user devices */ -struct rte_devargs_list devargs_list = +static struct rte_devargs_list devargs_list = TAILQ_HEAD_INITIALIZER(devargs_list); -int -rte_eal_parse_devargs_str(const char *devargs_str, - char **drvname, char **drvargs) -{ - char *sep; - - if ((devargs_str) == NULL || (drvname) == NULL || (drvargs == NULL)) - return -1; - - *drvname = strdup(devargs_str); - if (*drvname == NULL) - return -1; - - /* set the first ',' to '\0' to split name and arguments */ - sep = strchr(*drvname, ','); - if (sep != NULL) { - sep[0] = '\0'; - *drvargs = strdup(sep + 1); - } else { - *drvargs = strdup(""); - } - - if (*drvargs == NULL) { - free(*drvname); - *drvname = NULL; - return -1; - } - return 0; -} - static size_t devargs_layer_count(const char *s) { @@ -270,6 +237,7 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) va_list ap; size_t len; char *dev; + int ret; if (da == NULL) return -EINVAL; @@ -288,7 +256,10 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) vsnprintf(dev, len + 1, format, ap); va_end(ap); - return rte_devargs_parse(da, dev); + ret = rte_devargs_parse(da, dev); + + free(dev); + return ret; } int __rte_experimental @@ -296,7 +267,7 @@ rte_devargs_insert(struct rte_devargs *da) { int ret; - ret = rte_devargs_remove(da->bus->name, da->name); + ret = rte_devargs_remove(da); if (ret < 0) return ret; TAILQ_INSERT_TAIL(&devargs_list, da, next); @@ -342,14 +313,17 @@ fail: } int __rte_experimental -rte_devargs_remove(const char *busname, const char *devname) +rte_devargs_remove(struct rte_devargs *devargs) { struct rte_devargs *d; void *tmp; + if (devargs == NULL || devargs->bus == NULL) + return -1; + TAILQ_FOREACH_SAFE(d, &devargs_list, next, tmp) { - if (strcmp(d->bus->name, busname) == 0 && - strcmp(d->name, devname) == 0) { + if (strcmp(d->bus->name, devargs->bus->name) == 0 && + strcmp(d->name, devargs->name) == 0) { TAILQ_REMOVE(&devargs_list, d, next); free(d->args); free(d); diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c index 43caf3ce..ea0735cb 100644 --- a/lib/librte_eal/common/eal_common_fbarray.c +++ b/lib/librte_eal/common/eal_common_fbarray.c @@ -2,6 +2,7 @@ * Copyright(c) 2017-2018 Intel Corporation */ +#include <fcntl.h> #include <inttypes.h> #include <limits.h> #include <sys/mman.h> @@ -878,6 +879,10 @@ rte_fbarray_destroy(struct rte_fbarray *arr) if (ret) return ret; + /* with no shconf, there were never any files to begin with */ + if (internal_config.no_shconf) + return 0; + /* try deleting the file */ eal_get_fbarray_path(path, sizeof(path), arr->name); diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index fbfb1b05..12dcedf5 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -2,6 +2,7 @@ * Copyright(c) 2010-2014 Intel Corporation */ +#include <fcntl.h> #include <errno.h> #include <stdio.h> #include <stdint.h> @@ -37,6 +38,23 @@ static void *next_baseaddr; static uint64_t system_page_sz; +#ifdef RTE_ARCH_64 +/* + * Linux kernel uses a really high address as starting address for serving + * mmaps calls. If there exists addressing limitations and IOVA mode is VA, + * this starting address is likely too high for those devices. However, it + * is possible to use a lower address in the process virtual address space + * as with 64 bits there is a lot of available space. + * + * Current known limitations are 39 or 40 bits. Setting the starting address + * at 4GB implies there are 508GB or 1020GB for mapping the available + * hugepages. This is likely enough for most systems, although a device with + * addressing limitations should call rte_eal_check_dma_mask for ensuring all + * memory is within supported range. + */ +static uint64_t baseaddr = 0x100000000; +#endif + void * eal_get_virtual_area(void *requested_addr, size_t *size, size_t page_sz, int flags, int mmap_flags) @@ -60,6 +78,11 @@ eal_get_virtual_area(void *requested_addr, size_t *size, rte_eal_process_type() == RTE_PROC_PRIMARY) next_baseaddr = (void *) internal_config.base_virtaddr; +#ifdef RTE_ARCH_64 + if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) baseaddr; +#endif if (requested_addr == NULL && next_baseaddr != NULL) { requested_addr = next_baseaddr; requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); @@ -91,7 +114,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size, mmap_flags, -1, 0); if (mapped_addr == MAP_FAILED && allow_shrink) *size -= page_sz; - } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0); + + if (mapped_addr != MAP_FAILED && addr_is_hint && + mapped_addr != requested_addr) { + /* hint was not used. Try with another offset */ + munmap(mapped_addr, map_sz); + mapped_addr = MAP_FAILED; + next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz); + requested_addr = next_baseaddr; + } + } while ((allow_shrink || addr_is_hint) && + mapped_addr == MAP_FAILED && *size > 0); /* align resulting address - if map failed, we will ignore the value * anyway, so no need to add additional checks. @@ -171,7 +204,7 @@ virt2memseg(const void *addr, const struct rte_memseg_list *msl) /* a memseg list was specified, check if it's the right one */ start = msl->base_va; - end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len); + end = RTE_PTR_ADD(start, msl->len); if (addr < start || addr >= end) return NULL; @@ -194,8 +227,7 @@ virt2memseg_list(const void *addr) msl = &mcfg->memsegs[msl_idx]; start = msl->base_va; - end = RTE_PTR_ADD(start, - (size_t)msl->page_sz * msl->memseg_arr.len); + end = RTE_PTR_ADD(start, msl->len); if (addr >= start && addr < end) break; } @@ -273,6 +305,9 @@ physmem_size(const struct rte_memseg_list *msl, void *arg) { uint64_t *total_len = arg; + if (msl->external) + return 0; + *total_len += msl->memseg_arr.count * msl->page_sz; return 0; @@ -294,7 +329,7 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int msl_idx, ms_idx; + int msl_idx, ms_idx, fd; FILE *f = arg; msl_idx = msl - mcfg->memsegs; @@ -305,10 +340,11 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, if (ms_idx < 0) return -1; + fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx); fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, " "virt:%p, socket_id:%"PRId32", " "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " - "nrank:%"PRIx32"\n", + "nrank:%"PRIx32" fd:%i\n", msl_idx, ms_idx, ms->iova, ms->len, @@ -316,7 +352,8 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, ms->socket_id, ms->hugepage_sz, ms->nchannel, - ms->nrank); + ms->nrank, + fd); return 0; } @@ -383,6 +420,66 @@ rte_dump_physmem_layout(FILE *f) rte_memseg_walk(dump_memseg, f); } +static int +check_iova(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + uint64_t *mask = arg; + rte_iova_t iova; + + /* higher address within segment */ + iova = (ms->iova + ms->len) - 1; + if (!(iova & *mask)) + return 0; + + RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n", + ms->iova, ms->len); + + RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask); + return 1; +} + +#if defined(RTE_ARCH_64) +#define MAX_DMA_MASK_BITS 63 +#else +#define MAX_DMA_MASK_BITS 31 +#endif + +/* check memseg iovas are within the required range based on dma mask */ +int __rte_experimental +rte_eal_check_dma_mask(uint8_t maskbits) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint64_t mask; + + /* sanity check */ + if (maskbits > MAX_DMA_MASK_BITS) { + RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n", + maskbits, MAX_DMA_MASK_BITS); + return -1; + } + + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + if (rte_memseg_walk(check_iova, &mask)) + /* + * Dma mask precludes hugepage usage. + * This device can not be used and we do not need to keep + * the dma mask. + */ + return 1; + + /* + * we need to keep the more restricted maskbit for checking + * potential dynamic memory allocation in the future. + */ + mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits : + RTE_MIN(mcfg->dma_maskbits, maskbits); + + return 0; +} + /* return the number of memory channels */ unsigned rte_memory_get_nchannel(void) { @@ -548,6 +645,105 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) return ret; } +int __rte_experimental +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int __rte_experimental +rte_memseg_get_fd(const struct rte_memseg *ms) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_get_fd_thread_unsafe(ms); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL || offset == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int __rte_experimental +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + /* init memory subsystem */ int rte_eal_memory_init(void) diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c index 7300fe05..b7081afb 100644 --- a/lib/librte_eal/common/eal_common_memzone.c +++ b/lib/librte_eal/common/eal_common_memzone.c @@ -120,13 +120,15 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, return NULL; } - if ((socket_id != SOCKET_ID_ANY) && - (socket_id >= RTE_MAX_NUMA_NODES || socket_id < 0)) { + if ((socket_id != SOCKET_ID_ANY) && socket_id < 0) { rte_errno = EINVAL; return NULL; } - if (!rte_eal_has_hugepages()) + /* only set socket to SOCKET_ID_ANY if we aren't allocating for an + * external heap. + */ + if (!rte_eal_has_hugepages() && socket_id < RTE_MAX_NUMA_NODES) socket_id = SOCKET_ID_ANY; contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0; diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c index dd5f9740..b82f3ddd 100644 --- a/lib/librte_eal/common/eal_common_options.c +++ b/lib/librte_eal/common/eal_common_options.c @@ -58,6 +58,7 @@ eal_long_options[] = { {OPT_HELP, 0, NULL, OPT_HELP_NUM }, {OPT_HUGE_DIR, 1, NULL, OPT_HUGE_DIR_NUM }, {OPT_HUGE_UNLINK, 0, NULL, OPT_HUGE_UNLINK_NUM }, + {OPT_IOVA_MODE, 1, NULL, OPT_IOVA_MODE_NUM }, {OPT_LCORES, 1, NULL, OPT_LCORES_NUM }, {OPT_LOG_LEVEL, 1, NULL, OPT_LOG_LEVEL_NUM }, {OPT_MASTER_LCORE, 1, NULL, OPT_MASTER_LCORE_NUM }, @@ -205,6 +206,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg) #endif internal_cfg->vmware_tsc_map = 0; internal_cfg->create_uio_dev = 0; + internal_cfg->iova_mode = RTE_IOVA_DC; internal_cfg->user_mbuf_pool_ops_name = NULL; internal_cfg->init_complete = 0; } @@ -1075,6 +1077,25 @@ eal_parse_proc_type(const char *arg) return RTE_PROC_INVALID; } +static int +eal_parse_iova_mode(const char *name) +{ + int mode; + + if (name == NULL) + return -1; + + if (!strcmp("pa", name)) + mode = RTE_IOVA_PA; + else if (!strcmp("va", name)) + mode = RTE_IOVA_VA; + else + return -1; + + internal_config.iova_mode = mode; + return 0; +} + int eal_parse_common_option(int opt, const char *optarg, struct internal_config *conf) @@ -1281,6 +1302,13 @@ eal_parse_common_option(int opt, const char *optarg, case OPT_SINGLE_FILE_SEGMENTS_NUM: conf->single_file_segments = 1; break; + case OPT_IOVA_MODE_NUM: + if (eal_parse_iova_mode(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_IOVA_MODE "\n"); + return -1; + } + break; /* don't know what to do, leave this to caller */ default: @@ -1384,10 +1412,16 @@ eal_check_common_options(struct internal_config *internal_cfg) " is only supported in non-legacy memory mode\n"); } if (internal_cfg->single_file_segments && - internal_cfg->hugepage_unlink) { + internal_cfg->hugepage_unlink && + !internal_cfg->in_memory) { RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is " - "not compatible with neither --"OPT_IN_MEMORY" nor " - "--"OPT_HUGE_UNLINK"\n"); + "not compatible with --"OPT_HUGE_UNLINK"\n"); + return -1; + } + if (internal_cfg->legacy_mem && + internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible " + "with --"OPT_IN_MEMORY"\n"); return -1; } @@ -1428,6 +1462,8 @@ eal_common_usage(void) " --"OPT_VDEV" Add a virtual device.\n" " The argument format is <driver><id>[,key=val,...]\n" " (ex: --vdev=net_pcap0,iface=eth2).\n" + " --"OPT_IOVA_MODE" Set IOVA mode. 'pa' for IOVA_PA\n" + " 'va' for IOVA_VA\n" " -d LIB.so|DIR Add a driver or driver directory\n" " (can be used multiple times)\n" " --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n" diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c index 9fcb9121..97663d3b 100644 --- a/lib/librte_eal/common/eal_common_proc.c +++ b/lib/librte_eal/common/eal_common_proc.c @@ -939,13 +939,17 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, if (check_input(req) == false) return -1; + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + if (internal_config.no_shconf) { RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); return 0; } if (gettimeofday(&now, NULL) < 0) { - RTE_LOG(ERR, EAL, "Faile to get current time\n"); + RTE_LOG(ERR, EAL, "Failed to get current time\n"); rte_errno = errno; return -1; } @@ -954,10 +958,6 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, end.tv_sec = now.tv_sec + ts->tv_sec + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; - reply->nb_sent = 0; - reply->nb_received = 0; - reply->msgs = NULL; - /* for secondary process, send request to the primary process only */ if (rte_eal_process_type() == RTE_PROC_SECONDARY) { pthread_mutex_lock(&pending_requests.lock); diff --git a/lib/librte_eal/common/eal_common_string_fns.c b/lib/librte_eal/common/eal_common_string_fns.c index 6ac5f828..60c5dd66 100644 --- a/lib/librte_eal/common/eal_common_string_fns.c +++ b/lib/librte_eal/common/eal_common_string_fns.c @@ -38,3 +38,29 @@ einval_error: errno = EINVAL; return -1; } + +/* Copy src string into dst. + * + * Return negative value and NUL-terminate if dst is too short, + * Otherwise return number of bytes copied. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize) +{ + size_t nleft = dsize; + size_t res = 0; + + /* Copy as many bytes as will fit. */ + while (nleft != 0) { + dst[res] = src[res]; + if (src[res] == '\0') + return res; + res++; + nleft--; + } + + /* Not enough room in dst, set NUL and return error. */ + if (res != 0) + dst[res - 1] = '\0'; + return -E2BIG; +} diff --git a/lib/librte_eal/common/eal_common_timer.c b/lib/librte_eal/common/eal_common_timer.c index 2e2b770f..dcf26bfe 100644 --- a/lib/librte_eal/common/eal_common_timer.c +++ b/lib/librte_eal/common/eal_common_timer.c @@ -7,9 +7,11 @@ #include <unistd.h> #include <inttypes.h> #include <sys/types.h> +#include <time.h> #include <errno.h> #include <rte_common.h> +#include <rte_compat.h> #include <rte_log.h> #include <rte_cycles.h> #include <rte_pause.h> @@ -31,6 +33,28 @@ rte_delay_us_block(unsigned int us) rte_pause(); } +void __rte_experimental +rte_delay_us_sleep(unsigned int us) +{ + struct timespec wait[2]; + int ind = 0; + + wait[0].tv_sec = 0; + if (us >= US_PER_S) { + wait[0].tv_sec = us / US_PER_S; + us -= wait[0].tv_sec * US_PER_S; + } + wait[0].tv_nsec = 1000 * us; + + while (nanosleep(&wait[ind], &wait[1 - ind]) && errno == EINTR) { + /* + * Sleep was interrupted. Flip the index, so the 'remainder' + * will become the 'request' for a next call. + */ + ind = 1 - ind; + } +} + uint64_t rte_get_tsc_hz(void) { diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h index de05febf..b3e8ae5e 100644 --- a/lib/librte_eal/common/eal_filesystem.h +++ b/lib/librte_eal/common/eal_filesystem.h @@ -27,7 +27,7 @@ eal_create_runtime_dir(void); /* returns runtime dir */ const char * -eal_get_runtime_dir(void); +rte_eal_get_runtime_dir(void); #define RUNTIME_CONFIG_FNAME "config" static inline const char * @@ -35,7 +35,7 @@ eal_runtime_config_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), RUNTIME_CONFIG_FNAME); return buffer; } @@ -47,7 +47,7 @@ eal_mp_socket_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), MP_SOCKET_FNAME); return buffer; } @@ -55,7 +55,8 @@ eal_mp_socket_path(void) #define FBARRAY_NAME_FMT "%s/fbarray_%s" static inline const char * eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) { - snprintf(buffer, buflen, FBARRAY_NAME_FMT, eal_get_runtime_dir(), name); + snprintf(buffer, buflen, FBARRAY_NAME_FMT, rte_eal_get_runtime_dir(), + name); return buffer; } @@ -66,7 +67,7 @@ eal_hugepage_info_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), HUGEPAGE_INFO_FNAME); return buffer; } @@ -78,7 +79,7 @@ eal_hugepage_data_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), HUGEPAGE_DATA_FNAME); return buffer; } @@ -99,7 +100,7 @@ eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id static inline const char * eal_get_hugefile_lock_path(char *buffer, size_t buflen, int f_id) { - snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, eal_get_runtime_dir(), + snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, rte_eal_get_runtime_dir(), f_id); buffer[buflen - 1] = '\0'; return buffer; diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h index 00ee6e06..737f17e3 100644 --- a/lib/librte_eal/common/eal_internal_cfg.h +++ b/lib/librte_eal/common/eal_internal_cfg.h @@ -70,6 +70,7 @@ struct internal_config { /**< user defined mbuf pool ops name */ unsigned num_hugepage_sizes; /**< how many sizes on this system */ struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES]; + enum rte_iova_mode iova_mode ; /**< Set IOVA mode on this system */ volatile unsigned int init_complete; /**< indicates whether EAL has completed initialization */ }; diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h index 36bb1a02..af917c2f 100644 --- a/lib/librte_eal/common/eal_memalloc.h +++ b/lib/librte_eal/common/eal_memalloc.h @@ -76,6 +76,17 @@ eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id); int eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len); +/* returns fd or -errno */ +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx); + +/* returns 0 or -errno */ +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd); + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset); + int eal_memalloc_init(void); diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h index 96e16678..5271f944 100644 --- a/lib/librte_eal/common/eal_options.h +++ b/lib/librte_eal/common/eal_options.h @@ -63,6 +63,8 @@ enum { OPT_LEGACY_MEM_NUM, #define OPT_SINGLE_FILE_SEGMENTS "single-file-segments" OPT_SINGLE_FILE_SEGMENTS_NUM, +#define OPT_IOVA_MODE "iova-mode" + OPT_IOVA_MODE_NUM, OPT_LONG_MAX_NUM }; diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h index 4f809a83..442c6dc4 100644 --- a/lib/librte_eal/common/eal_private.h +++ b/lib/librte_eal/common/eal_private.h @@ -259,18 +259,6 @@ struct rte_bus *rte_bus_find_by_device_name(const char *str); int rte_mp_channel_init(void); /** - * Internal Executes all the user application registered callbacks for - * the specific device. It is for DPDK internal user only. User - * application should not call it directly. - * - * @param device_name - * The device name. - * @param event - * the device event type. - */ -void dev_callback_process(char *device_name, enum rte_dev_event_type event); - -/** * @internal * Parse a device string and store its information in an * rte_devargs structure. @@ -304,4 +292,82 @@ int rte_devargs_layers_parse(struct rte_devargs *devargs, const char *devstr); +/* + * probe a device at local process. + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @param new_dev + * new device be probed as output. + * @return + * 0 on success, negative on error. + */ +int local_dev_probe(const char *devargs, struct rte_device **new_dev); + +/** + * Hotplug remove a given device from a specific bus at local process. + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. + */ +int local_dev_remove(struct rte_device *dev); + +/** + * Iterate over all buses to find the corresponding bus to handle the sigbus + * error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 success to handle the sigbus. + * -1 failed to handle the sigbus + * 1 no bus can handler the sigbus + */ +int rte_bus_sigbus_handler(const void *failure_addr); + +/** + * @internal + * Register the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_register(void); + +/** + * @internal + * Unregister the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_unregister(void); + +/** + * Check if the option is registered. + * + * @param option + * The option to be parsed. + * + * @return + * 0 on success + * @return + * -1 on fail + */ +int +rte_option_parse(const char *opt); + +/** + * Iterate through the registered options and execute the associated + * callback if enabled. + */ +void +rte_option_init(void); + #endif /* _EAL_PRIVATE_H_ */ diff --git a/lib/librte_eal/common/hotplug_mp.c b/lib/librte_eal/common/hotplug_mp.c new file mode 100644 index 00000000..84f59d95 --- /dev/null +++ b/lib/librte_eal/common/hotplug_mp.c @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ +#include <string.h> + +#include <rte_eal.h> +#include <rte_alarm.h> +#include <rte_string_fns.h> +#include <rte_devargs.h> + +#include "hotplug_mp.h" +#include "eal_private.h" + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +struct mp_reply_bundle { + struct rte_mp_msg msg; + void *peer; +}; + +static int cmp_dev_name(const struct rte_device *dev, const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +/** + * Secondary to primary request. + * start from function eal_dev_hotplug_request_to_primary. + * + * device attach on secondary: + * a) secondary send sync request to the primary. + * b) primary receive the request and attach the new device if + * failed goto i). + * c) primary forward attach sync request to all secondary. + * d) secondary receive the request and attach the device and send a reply. + * e) primary check the reply if all success goes to j). + * f) primary send attach rollback sync request to all secondary. + * g) secondary receive the request and detach the device and send a reply. + * h) primary receive the reply and detach device as rollback action. + * i) send attach fail to secondary as a reply of step a), goto k). + * j) send attach success to secondary as a reply of step a). + * k) secondary receive reply and return. + * + * device detach on secondary: + * a) secondary send sync request to the primary. + * b) primary send detach sync request to all secondary. + * c) secondary detach the device and send a reply. + * d) primary check the reply if all success goes to g). + * e) primary send detach rollback sync request to all secondary. + * f) secondary receive the request and attach back device. goto h). + * g) primary detach the device if success goto i), else goto e). + * h) primary send detach fail to secondary as a reply of step a), goto j). + * i) primary send detach success to secondary as a reply of step a). + * j) secondary receive reply and return. + */ + +static int +send_response_to_secondary(const struct eal_dev_mp_req *req, + int result, + const void *peer) +{ + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + int ret; + + memset(&mp_resp, 0, sizeof(mp_resp)); + mp_resp.len_param = sizeof(*resp); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + memcpy(resp, req, sizeof(*req)); + resp->result = result; + + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + return ret; +} + +static void +__handle_secondary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + const struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req tmp_req; + struct rte_devargs *da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + tmp_req = *req; + + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + ret = local_dev_probe(req->devargs, &dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug add device on primary\n"); + if (ret != -EEXIST) + goto finish; + } + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + if (tmp_req.result != 0) { + ret = tmp_req.result; + RTE_LOG(ERR, EAL, "Failed to hotplug add device on secondary\n"); + if (ret != -EEXIST) + goto rollback; + } + } else if (req->t == EAL_DEV_REQ_TYPE_DETACH) { + da = calloc(1, sizeof(*da)); + if (da == NULL) { + ret = -ENOMEM; + goto finish; + } + + ret = rte_devargs_parse(da, req->devargs); + if (ret != 0) + goto finish; + + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + bus = rte_bus_find_by_name(da->bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name); + ret = -ENOENT; + goto finish; + } + + dev = bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name); + ret = -ENOENT; + goto finish; + } + + if (tmp_req.result != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on secondary\n"); + ret = tmp_req.result; + if (ret != -ENOENT) + goto rollback; + } + + ret = local_dev_remove(dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on primary\n"); + if (ret != -ENOENT) + goto rollback; + } + } else { + RTE_LOG(ERR, EAL, "unsupported secondary to primary request\n"); + ret = -ENOTSUP; + } + goto finish; + +rollback: + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + tmp_req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + local_dev_remove(dev); + } else { + tmp_req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + } + +finish: + ret = send_response_to_secondary(&tmp_req, ret, bundle->peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_secondary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct mp_reply_bundle *bundle; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + int ret = 0; + + bundle = malloc(sizeof(*bundle)); + if (bundle == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + return send_response_to_secondary(req, -ENOMEM, peer); + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = strdup(peer); + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. + */ + ret = rte_eal_alarm_set(1, __handle_secondary_request, bundle); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to add mp task\n"); + return send_response_to_secondary(req, ret, peer); + } + return 0; +} + +static void __handle_primary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct rte_devargs *da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + + switch (req->t) { + case EAL_DEV_REQ_TYPE_ATTACH: + case EAL_DEV_REQ_TYPE_DETACH_ROLLBACK: + ret = local_dev_probe(req->devargs, &dev); + break; + case EAL_DEV_REQ_TYPE_DETACH: + case EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK: + da = calloc(1, sizeof(*da)); + if (da == NULL) { + ret = -ENOMEM; + goto quit; + } + + ret = rte_devargs_parse(da, req->devargs); + if (ret != 0) + goto quit; + + bus = rte_bus_find_by_name(da->bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name); + ret = -ENOENT; + goto quit; + } + + dev = bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name); + ret = -ENOENT; + goto quit; + } + + ret = local_dev_remove(dev); +quit: + break; + default: + ret = -EINVAL; + } + + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + resp->result = ret; + if (rte_mp_reply(&mp_resp, bundle->peer) < 0) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_primary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct rte_mp_msg mp_resp; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct mp_reply_bundle *bundle; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + + bundle = calloc(1, sizeof(*bundle)); + if (bundle == NULL) { + resp->result = -ENOMEM; + ret = rte_mp_reply(&mp_resp, peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = (void *)strdup(peer); + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. + */ + ret = rte_eal_alarm_set(1, __handle_primary_request, bundle); + if (ret != 0) { + resp->result = ret; + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + } + return 0; +} + +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + struct eal_dev_mp_req *resp; + int ret; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret || mp_reply.nb_received != 1) { + RTE_LOG(ERR, EAL, "cannot send request to primary"); + if (!ret) + return -1; + return ret; + } + + resp = (struct eal_dev_mp_req *)mp_reply.msgs[0].param; + req->result = resp->result; + + return ret; +} + +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + int ret; + int i; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret != 0) { + RTE_LOG(ERR, EAL, "rte_mp_request_sync failed\n"); + return ret; + } + + if (mp_reply.nb_sent != mp_reply.nb_received) { + RTE_LOG(ERR, EAL, "not all secondary reply\n"); + return -1; + } + + req->result = 0; + for (i = 0; i < mp_reply.nb_received; i++) { + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_reply.msgs[i].param; + if (resp->result != 0) { + req->result = resp->result; + if (req->t == EAL_DEV_REQ_TYPE_ATTACH && + req->result != -EEXIST) + break; + if (req->t == EAL_DEV_REQ_TYPE_DETACH && + req->result != -ENOENT) + break; + } + } + + return 0; +} + +int rte_mp_dev_hotplug_init(void) +{ + int ret; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_secondary_request); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } else { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_primary_request); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } + + return 0; +} diff --git a/lib/librte_eal/common/hotplug_mp.h b/lib/librte_eal/common/hotplug_mp.h new file mode 100644 index 00000000..597fde3d --- /dev/null +++ b/lib/librte_eal/common/hotplug_mp.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _HOTPLUG_MP_H_ +#define _HOTPLUG_MP_H_ + +#include "rte_dev.h" +#include "rte_bus.h" + +#define EAL_DEV_MP_ACTION_REQUEST "eal_dev_mp_request" +#define EAL_DEV_MP_ACTION_RESPONSE "eal_dev_mp_response" + +#define EAL_DEV_MP_DEV_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN +#define EAL_DEV_MP_BUS_NAME_MAX_LEN 32 +#define EAL_DEV_MP_DEV_ARGS_MAX_LEN 128 + +enum eal_dev_req_type { + EAL_DEV_REQ_TYPE_ATTACH, + EAL_DEV_REQ_TYPE_DETACH, + EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK, + EAL_DEV_REQ_TYPE_DETACH_ROLLBACK, +}; + +struct eal_dev_mp_req { + enum eal_dev_req_type t; + char devargs[EAL_DEV_MP_DEV_ARGS_MAX_LEN]; + int result; +}; + +/** + * This is a synchronous wrapper for secondary process send + * request to primary process, this is invoked when an attach + * or detach request is issued from primary process. + */ +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req); + +/** + * this is a synchronous wrapper for primary process send + * request to secondary process, this is invoked when an attach + * or detach request issued from secondary process. + */ +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req); + + +#endif /* _HOTPLUG_MP_H_ */ diff --git a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h index c4f974fe..859b0974 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h @@ -29,8 +29,8 @@ extern "C" { #ifndef RTE_ARM_EAL_RDTSC_USE_PMU /** - * This call is easily portable to any ARM architecture, however, - * it may be damn slow and inprecise for some tasks. + * This call is easily portable to any architecture, however, + * it may require a system call and inprecise for some tasks. */ static inline uint64_t __rte_rdtsc_syscall(void) diff --git a/lib/librte_eal/common/include/arch/ppc_64/meson.build b/lib/librte_eal/common/include/arch/ppc_64/meson.build new file mode 100644 index 00000000..00f96117 --- /dev/null +++ b/lib/librte_eal/common/include/arch/ppc_64/meson.build @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi <bluca@debian.org> + +install_headers( + 'rte_atomic.h', + 'rte_byteorder.h', + 'rte_cpuflags.h', + 'rte_cycles.h', + 'rte_io.h', + 'rte_memcpy.h', + 'rte_pause.h', + 'rte_prefetch.h', + 'rte_rwlock.h', + 'rte_spinlock.h', + 'rte_vect.h', + subdir: get_option('include_subdir_arch')) diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h index 8bd83576..16e47ce2 100644 --- a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h +++ b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h @@ -9,10 +9,17 @@ extern "C" { #endif +#include "rte_atomic.h" + #include "generic/rte_pause.h" static inline void rte_pause(void) { + /* Set hardware multi-threading low priority */ + asm volatile("or 1,1,1"); + /* Set hardware multi-threading medium priority */ + asm volatile("or 2,2,2"); + rte_compiler_barrier(); } #ifdef __cplusplus diff --git a/lib/librte_eal/common/include/generic/rte_cycles.h b/lib/librte_eal/common/include/generic/rte_cycles.h index 0ff1af50..ac379e87 100644 --- a/lib/librte_eal/common/include/generic/rte_cycles.h +++ b/lib/librte_eal/common/include/generic/rte_cycles.h @@ -13,6 +13,7 @@ */ #include <stdint.h> +#include <rte_compat.h> #include <rte_debug.h> #include <rte_atomic.h> @@ -158,6 +159,16 @@ rte_delay_ms(unsigned ms) void rte_delay_us_block(unsigned int us); /** + * Delay function that uses system sleep. + * Does not block the CPU core. + * + * @param us + * Number of microseconds to wait. + */ +void __rte_experimental +rte_delay_us_sleep(unsigned int us); + +/** * Replace rte_delay_us with user defined function. * * @param userfunc diff --git a/lib/librte_eal/common/include/rte_bitmap.h b/lib/librte_eal/common/include/rte_bitmap.h index d9facc64..7a36ce73 100644 --- a/lib/librte_eal/common/include/rte_bitmap.h +++ b/lib/librte_eal/common/include/rte_bitmap.h @@ -88,7 +88,7 @@ __rte_bitmap_index1_inc(struct rte_bitmap *bmp) static inline uint64_t __rte_bitmap_mask1_get(struct rte_bitmap *bmp) { - return (~1lu) << bmp->offset1; + return (~1llu) << bmp->offset1; } static inline void @@ -317,7 +317,7 @@ rte_bitmap_get(struct rte_bitmap *bmp, uint32_t pos) index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; slab2 = bmp->array2 + index2; - return (*slab2) & (1lu << offset2); + return (*slab2) & (1llu << offset2); } /** @@ -342,8 +342,8 @@ rte_bitmap_set(struct rte_bitmap *bmp, uint32_t pos) slab2 = bmp->array2 + index2; slab1 = bmp->array1 + index1; - *slab2 |= 1lu << offset2; - *slab1 |= 1lu << offset1; + *slab2 |= 1llu << offset2; + *slab1 |= 1llu << offset1; } /** @@ -370,7 +370,7 @@ rte_bitmap_set_slab(struct rte_bitmap *bmp, uint32_t pos, uint64_t slab) slab1 = bmp->array1 + index1; *slab2 |= slab; - *slab1 |= 1lu << offset1; + *slab1 |= 1llu << offset1; } static inline uint64_t @@ -408,7 +408,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos) slab2 = bmp->array2 + index2; /* Return if array2 slab is not all-zeros */ - *slab2 &= ~(1lu << offset2); + *slab2 &= ~(1llu << offset2); if (*slab2){ return; } @@ -424,7 +424,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos) index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; slab1 = bmp->array1 + index1; - *slab1 &= ~(1lu << offset1); + *slab1 &= ~(1llu << offset1); return; } diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h index b7b5b084..6be4b5ca 100644 --- a/lib/librte_eal/common/include/rte_bus.h +++ b/lib/librte_eal/common/include/rte_bus.h @@ -168,6 +168,35 @@ typedef int (*rte_bus_unplug_t)(struct rte_device *dev); typedef int (*rte_bus_parse_t)(const char *name, void *addr); /** + * Implement a specific hot-unplug handler, which is responsible for + * handle the failure when device be hot-unplugged. When the event of + * hot-unplug be detected, it could call this function to handle + * the hot-unplug failure and avoid app crash. + * @param dev + * Pointer of the device structure. + * + * @return + * 0 on success. + * !0 on error. + */ +typedef int (*rte_bus_hot_unplug_handler_t)(struct rte_device *dev); + +/** + * Implement a specific sigbus handler, which is responsible for handling + * the sigbus error which is either original memory error, or specific memory + * error that caused of device be hot-unplugged. When sigbus error be captured, + * it could call this function to handle sigbus error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 for success handle the sigbus for hot-unplug. + * 1 for not process it, because it is a generic sigbus error. + * -1 for failed to handle the sigbus for hot-unplug. + */ +typedef int (*rte_bus_sigbus_handler_t)(const void *failure_addr); + +/** * Bus scan policies */ enum rte_bus_scan_mode { @@ -212,6 +241,11 @@ struct rte_bus { struct rte_bus_conf conf; /**< Bus configuration */ rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */ rte_dev_iterate_t dev_iterate; /**< Device iterator. */ + rte_bus_hot_unplug_handler_t hot_unplug_handler; + /**< handle hot-unplug failure on the bus */ + rte_bus_sigbus_handler_t sigbus_handler; + /**< handle sigbus error on the bus */ + }; /** diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h index 069c13ec..cba7bbc1 100644 --- a/lib/librte_eal/common/include/rte_common.h +++ b/lib/librte_eal/common/include/rte_common.h @@ -68,6 +68,11 @@ typedef uint16_t unaligned_uint16_t; /******* Macro to mark functions and fields scheduled for removal *****/ #define __rte_deprecated __attribute__((__deprecated__)) +/** + * Mark a function or variable to a weak reference. + */ +#define __rte_weak __attribute__((__weak__)) + /*********** Macros to eliminate unused variable warnings ********/ /** @@ -164,6 +169,12 @@ static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) */ #define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2)) +/** + * Workaround to cast a const field of a structure to non-const type. + */ +#define RTE_CAST_FIELD(var, field, type) \ + (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field))) + /*********** Macros/static functions for doing alignment ********/ diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h index b80a8059..cd6c187c 100644 --- a/lib/librte_eal/common/include/rte_dev.h +++ b/lib/librte_eal/common/include/rte_dev.h @@ -39,7 +39,7 @@ struct rte_dev_event { char *devname; /**< device name */ }; -typedef void (*rte_dev_event_cb_fn)(char *device_name, +typedef void (*rte_dev_event_cb_fn)(const char *device_name, enum rte_dev_event_type event, void *cb_arg); @@ -156,63 +156,67 @@ struct rte_driver { struct rte_device { TAILQ_ENTRY(rte_device) next; /**< Next device */ const char *name; /**< Device name */ - const struct rte_driver *driver;/**< Associated driver */ + const struct rte_driver *driver; /**< Driver assigned after probing */ + const struct rte_bus *bus; /**< Bus handle assigned on scan */ int numa_node; /**< NUMA node connection */ - struct rte_devargs *devargs; /**< Device user arguments */ + struct rte_devargs *devargs; /**< Arguments for latest probing */ }; /** - * Attach a device to a registered driver. + * @warning + * @b EXPERIMENTAL: this API may change without prior notice * - * @param name - * The device name, that refers to a pci device (or some private - * way of designating a vdev device). Based on this device name, eal - * will identify a driver capable of handling it and pass it to the - * driver probing function. - * @param devargs - * Device arguments to be passed to the driver. - * @return - * 0 on success, negative on error. - */ -__rte_deprecated -int rte_eal_dev_attach(const char *name, const char *devargs); - -/** - * Detach a device from its driver. + * Query status of a device. * * @param dev - * A pointer to a rte_device structure. + * Generic device pointer. * @return - * 0 on success, negative on error. + * (int)true if already probed successfully, 0 otherwise. */ -__rte_deprecated -int rte_eal_dev_detach(struct rte_device *dev); +__rte_experimental +int rte_dev_is_probed(const struct rte_device *dev); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Hotplug add a given device to a specific bus. * + * In multi-process, it will request other processes to add the same device. + * A failure, in any process, will rollback the action + * * @param busname * The bus name the device is added to. * @param devname * The device name. Based on this device name, eal will identify a driver * capable of handling it and pass it to the driver probing function. - * @param devargs + * @param drvargs * Device arguments to be passed to the driver. * @return * 0 on success, negative on error. */ -int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, - const char *devargs); +int rte_eal_hotplug_add(const char *busname, const char *devname, + const char *drvargs); /** * @warning * @b EXPERIMENTAL: this API may change without prior notice * + * Add matching devices. + * + * In multi-process, it will request other processes to add the same device. + * A failure, in any process, will rollback the action + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @return + * 0 on success, negative on error. + */ +int __rte_experimental rte_dev_probe(const char *devargs); + +/** * Hotplug remove a given device from a specific bus. * + * In multi-process, it will request other processes to remove the same device. + * A failure, in any process, will rollback the action + * * @param busname * The bus name the device is removed from. * @param devname @@ -220,8 +224,23 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn * @return * 0 on success, negative on error. */ -int __rte_experimental rte_eal_hotplug_remove(const char *busname, - const char *devname); +int rte_eal_hotplug_remove(const char *busname, const char *devname); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Remove one device. + * + * In multi-process, it will request other processes to remove the same device. + * A failure, in any process, will rollback the action + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. + */ +int __rte_experimental rte_dev_remove(struct rte_device *dev); /** * Device comparison function. @@ -438,6 +457,22 @@ rte_dev_event_callback_unregister(const char *device_name, * @warning * @b EXPERIMENTAL: this API may change without prior notice * + * Executes all the user application registered callbacks for + * the specific device. + * + * @param device_name + * The device name. + * @param event + * the device event type. + */ +void __rte_experimental +rte_dev_event_callback_process(const char *device_name, + enum rte_dev_event_type event); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * * Start the device event monitoring. * * @return @@ -460,4 +495,30 @@ rte_dev_event_monitor_start(void); int __rte_experimental rte_dev_event_monitor_stop(void); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Enable hotplug handling for devices. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_hotplug_handle_enable(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Disable hotplug handling for devices. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_hotplug_handle_disable(void); + #endif /* _RTE_DEV_H_ */ diff --git a/lib/librte_eal/common/include/rte_devargs.h b/lib/librte_eal/common/include/rte_devargs.h index 097a4ce7..b1f121f8 100644 --- a/lib/librte_eal/common/include/rte_devargs.h +++ b/lib/librte_eal/common/include/rte_devargs.h @@ -67,36 +67,6 @@ struct rte_devargs { }; /** - * @deprecated - * Parse a devargs string. - * - * For PCI devices, the format of arguments string is "PCI_ADDR" or - * "PCI_ADDR,key=val,key2=val2,...". Examples: "08:00.1", "0000:5:00.0", - * "04:00.0,arg=val". - * - * For virtual devices, the format of arguments string is "DRIVER_NAME*" - * or "DRIVER_NAME*,key=val,key2=val2,...". Examples: "net_ring", - * "net_ring0", "net_pmdAnything,arg=0:arg2=1". - * - * The function parses the arguments string to get driver name and driver - * arguments. - * - * @param devargs_str - * The arguments as given by the user. - * @param drvname - * The pointer to the string to store parsed driver name. - * @param drvargs - * The pointer to the string to store parsed driver arguments. - * - * @return - * - 0 on success - * - A negative value on error - */ -__rte_deprecated -int rte_eal_parse_devargs_str(const char *devargs_str, - char **drvname, char **drvargs); - -/** * Parse a device string. * * Verify that a bus is capable of handling the device passed @@ -202,32 +172,12 @@ __rte_experimental int rte_devargs_add(enum rte_devtype devtype, const char *devargs_str); /** - * @deprecated - * Add a device to the user device list - * See rte_devargs_parse() for details. - * - * @param devtype - * The type of the device. - * @param devargs_str - * The arguments as given by the user. - * - * @return - * - 0 on success - * - A negative value on error - */ -__rte_deprecated -int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); - -/** * Remove a device from the user device list. * Its resources are freed. * If the devargs cannot be found, nothing happens. * - * @param busname - * bus name of the devargs to remove. - * - * @param devname - * device name of the devargs to remove. + * @param devargs + * The instance or a copy of devargs to remove. * * @return * 0 on success. @@ -235,8 +185,7 @@ int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); * >0 if the devargs was not within the user device list. */ __rte_experimental -int rte_devargs_remove(const char *busname, - const char *devname); +int rte_devargs_remove(struct rte_devargs *devargs); /** * Count the number of user devices of a specified type @@ -252,20 +201,6 @@ unsigned int rte_devargs_type_count(enum rte_devtype devtype); /** - * @deprecated - * Count the number of user devices of a specified type - * - * @param devtype - * The type of the devices to counted. - * - * @return - * The number of devices. - */ -__rte_deprecated -unsigned int -rte_eal_devargs_type_count(enum rte_devtype devtype); - -/** * This function dumps the list of user device and their arguments. * * @param f @@ -275,16 +210,6 @@ __rte_experimental void rte_devargs_dump(FILE *f); /** - * @deprecated - * This function dumps the list of user device and their arguments. - * - * @param f - * A pointer to a file for output - */ -__rte_deprecated -void rte_eal_devargs_dump(FILE *f); - -/** * Find next rte_devargs matching the provided bus name. * * @param busname diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h index e114dcbd..a0cedd57 100644 --- a/lib/librte_eal/common/include/rte_eal.h +++ b/lib/librte_eal/common/include/rte_eal.h @@ -316,7 +316,7 @@ rte_mp_sendmsg(struct rte_mp_msg *msg); * * @param reply * The reply argument will be for storing all the replied messages; - * the caller is responsible for free reply->replies. + * the caller is responsible for free reply->msgs. * * @param ts * The ts argument specifies how long we can wait for the peer(s) to reply. @@ -378,6 +378,15 @@ int __rte_experimental rte_mp_reply(struct rte_mp_msg *msg, const char *peer); /** + * Register all mp action callbacks for hotplug. + * + * @return + * 0 on success, negative on error. + */ +int __rte_experimental +rte_mp_dev_hotplug_init(void); + +/** * Usage function typedef used by the application usage function. * * Use this function typedef to define and call rte_set_application_usage_hook() @@ -498,6 +507,15 @@ enum rte_iova_mode rte_eal_iova_mode(void); const char * rte_eal_mbuf_user_pool_ops(void); +/** + * Get the runtime directory of DPDK + * + * @return + * The runtime directory path of DPDK + */ +const char * +rte_eal_get_runtime_dir(void); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h index 6eb49327..9d302f41 100644 --- a/lib/librte_eal/common/include/rte_eal_interrupts.h +++ b/lib/librte_eal/common/include/rte_eal_interrupts.h @@ -35,6 +35,7 @@ enum rte_intr_handle_type { RTE_INTR_HANDLE_EXT, /**< external handler */ RTE_INTR_HANDLE_VDEV, /**< virtual device */ RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */ + RTE_INTR_HANDLE_VFIO_REQ, /**< VFIO request handle */ RTE_INTR_HANDLE_MAX /**< count of elements */ }; diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h index aff0688d..84aabe36 100644 --- a/lib/librte_eal/common/include/rte_eal_memconfig.h +++ b/lib/librte_eal/common/include/rte_eal_memconfig.h @@ -30,9 +30,11 @@ struct rte_memseg_list { uint64_t addr_64; /**< Makes sure addr is always 64-bits */ }; - int socket_id; /**< Socket ID for all memsegs in this list. */ uint64_t page_sz; /**< Page size for all memsegs in this list. */ + int socket_id; /**< Socket ID for all memsegs in this list. */ volatile uint32_t version; /**< version number for multiprocess sync. */ + size_t len; /**< Length of memory area covered by this memseg list. */ + unsigned int external; /**< 1 if this list points to external memory */ struct rte_fbarray memseg_arr; }; @@ -70,13 +72,23 @@ struct rte_mem_config { struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */ - /* Heaps of Malloc per socket */ - struct malloc_heap malloc_heaps[RTE_MAX_NUMA_NODES]; + /* Heaps of Malloc */ + struct malloc_heap malloc_heaps[RTE_MAX_HEAPS]; + + /* next socket ID for external malloc heap */ + int next_socket_id; /* address of mem_config in primary process. used to map shared config into * exact same address the primary process maps it. */ uint64_t mem_cfg_addr; + + /* legacy mem and single file segments options are shared */ + uint32_t legacy_mem; + uint32_t single_file_segments; + + /* keeps the more restricted dma mask */ + uint8_t dma_maskbits; } __attribute__((__packed__)); diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h index a9fb7e45..7249e6aa 100644 --- a/lib/librte_eal/common/include/rte_malloc.h +++ b/lib/librte_eal/common/include/rte_malloc.h @@ -264,6 +264,198 @@ rte_malloc_get_socket_stats(int socket, struct rte_malloc_socket_stats *socket_stats); /** + * Add memory chunk to a heap with specified name. + * + * @note Multiple memory chunks can be added to the same heap + * + * @note Before accessing this memory in other processes, it needs to be + * attached in each of those processes by calling + * ``rte_malloc_heap_memory_attach`` in each other process. + * + * @note Memory must be previously allocated for DPDK to be able to use it as a + * malloc heap. Failing to do so will result in undefined behavior, up to and + * including segmentation faults. + * + * @note Calling this function will erase any contents already present at the + * supplied memory address. + * + * @param heap_name + * Name of the heap to add memory chunk to + * @param va_addr + * Start of virtual area to add to the heap + * @param len + * Length of virtual area to add to the heap + * @param iova_addrs + * Array of page IOVA addresses corresponding to each page in this memory + * area. Can be NULL, in which case page IOVA addresses will be set to + * RTE_BAD_IOVA. + * @param n_pages + * Number of elements in the iova_addrs array. Ignored if ``iova_addrs`` + * is NULL. + * @param page_sz + * Page size of the underlying memory + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to add memory to a reserved heap + * ENOSPC - no more space in internal config to store a new memory chunk + */ +int __rte_experimental +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz); + +/** + * Remove memory chunk from heap with specified name. + * + * @note Memory chunk being removed must be the same as one that was added; + * partially removing memory chunks is not supported + * + * @note Memory area must not contain any allocated elements to allow its + * removal from the heap + * + * @note All other processes must detach from the memory chunk prior to it being + * removed from the heap. + * + * @param heap_name + * Name of the heap to remove memory from + * @param va_addr + * Virtual address to remove from the heap + * @param len + * Length of virtual area to remove from the heap + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to remove memory from a reserved heap + * ENOENT - heap or memory chunk was not found + * EBUSY - memory chunk still contains data + */ +int __rte_experimental +rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len); + +/** + * Attach to an already existing chunk of external memory in another process. + * + * @note This function must be called before any attempt is made to use an + * already existing external memory chunk. This function does *not* need to + * be called if a call to ``rte_malloc_heap_memory_add`` was made in the + * current process. + * + * @param heap_name + * Heap name to which this chunk of memory belongs + * @param va_addr + * Start address of memory chunk to attach to + * @param len + * Length of memory chunk to attach to + * @return + * 0 on successful attach + * -1 on unsuccessful attach, with rte_errno set to indicate cause for error: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to attach memory to a reserved heap + * ENOENT - heap or memory chunk was not found + */ +int __rte_experimental +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len); + +/** + * Detach from a chunk of external memory in secondary process. + * + * @note This function must be called in before any attempt is made to remove + * external memory from the heap in another process. This function does *not* + * need to be called if a call to ``rte_malloc_heap_memory_remove`` will be + * called in current process. + * + * @param heap_name + * Heap name to which this chunk of memory belongs + * @param va_addr + * Start address of memory chunk to attach to + * @param len + * Length of memory chunk to attach to + * @return + * 0 on successful detach + * -1 on unsuccessful detach, with rte_errno set to indicate cause for error: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to detach memory from a reserved heap + * ENOENT - heap or memory chunk was not found + */ +int __rte_experimental +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len); + +/** + * Creates a new empty malloc heap with a specified name. + * + * @note Heaps created via this call will automatically get assigned a unique + * socket ID, which can be found using ``rte_malloc_heap_get_socket()`` + * + * @param heap_name + * Name of the heap to create. + * + * @return + * - 0 on successful creation + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``heap_name`` was NULL, empty or too long + * EEXIST - heap by name of ``heap_name`` already exists + * ENOSPC - no more space in internal config to store a new heap + */ +int __rte_experimental +rte_malloc_heap_create(const char *heap_name); + +/** + * Destroys a previously created malloc heap with specified name. + * + * @note This function will return a failure result if not all memory allocated + * from the heap has been freed back to the heap + * + * @note This function will return a failure result if not all memory segments + * were removed from the heap prior to its destruction + * + * @param heap_name + * Name of the heap to create. + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``heap_name`` was NULL, empty or too long + * ENOENT - heap by the name of ``heap_name`` was not found + * EPERM - attempting to destroy reserved heap + * EBUSY - heap still contains data + */ +int __rte_experimental +rte_malloc_heap_destroy(const char *heap_name); + +/** + * Find socket ID corresponding to a named heap. + * + * @param name + * Heap name to find socket ID for + * @return + * Socket ID in case of success (a non-negative number) + * -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``name`` was NULL + * ENOENT - heap identified by the name ``name`` was not found + */ +int __rte_experimental +rte_malloc_heap_get_socket(const char *name); + +/** + * Check if a given socket ID refers to externally allocated memory. + * + * @note Passing SOCKET_ID_ANY will return 0. + * + * @param socket_id + * Socket ID to check + * @return + * 1 if socket ID refers to externally allocated memory + * 0 if socket ID refers to internal DPDK memory + * -1 if socket ID is invalid + */ +int __rte_experimental +rte_malloc_heap_socket_is_external(int socket_id); + +/** * Dump statistics. * * Dump for the specified type to a file. If the type argument is diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h index d43fa909..4a7e0eb1 100644 --- a/lib/librte_eal/common/include/rte_malloc_heap.h +++ b/lib/librte_eal/common/include/rte_malloc_heap.h @@ -12,6 +12,7 @@ /* Number of free lists per heap, grouped by size. */ #define RTE_HEAP_NUM_FREELISTS 13 +#define RTE_HEAP_NAME_MAX_LEN 32 /* dummy definition, for pointers */ struct malloc_elem; @@ -26,7 +27,9 @@ struct malloc_heap { struct malloc_elem *volatile last; unsigned alloc_count; + unsigned int socket_id; size_t total_size; + char name[RTE_HEAP_NAME_MAX_LEN]; } __rte_cache_aligned; #endif /* _RTE_MALLOC_HEAP_H_ */ diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index c4b7f4cf..ce937058 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -215,6 +215,9 @@ typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl, * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * * @param func * Iterator function * @param arg @@ -233,6 +236,9 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg); * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * * @param func * Iterator function * @param arg @@ -251,6 +257,9 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg); * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * * @param func * Iterator function * @param arg @@ -318,6 +327,103 @@ int __rte_experimental rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg); /** + * Return file descriptor associated with a particular memseg (if available). + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @note This returns an internal file descriptor. Performing any operations on + * this file descriptor is inherently dangerous, so it should be treated + * as read-only for all intents and purposes. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd(const struct rte_memseg *ms); + +/** + * Return file descriptor associated with a particular memseg (if available). + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @note This returns an internal file descriptor. Performing any operations on + * this file descriptor is inherently dangerous, so it should be treated + * as read-only for all intents and purposes. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms); + +/** + * Get offset into segment file descriptor associated with a particular memseg + * (if available). + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * @param offset + * A pointer to offset value where the result will be stored. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - EINVAL - ``offset`` pointer was NULL + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset); + +/** + * Get offset into segment file descriptor associated with a particular memseg + * (if available). + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * @param offset + * A pointer to offset value where the result will be stored. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - EINVAL - ``offset`` pointer was NULL + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset); + +/** * Dump the physical memory layout to a file. * * @note This function read-locks the memory hotplug subsystem, and thus cannot @@ -357,6 +463,9 @@ unsigned rte_memory_get_nchannel(void); */ unsigned rte_memory_get_nrank(void); +/* check memsegs iovas are within a range based on dma mask */ +int __rte_experimental rte_eal_check_dma_mask(uint8_t maskbits); + /** * Drivers based on uio will not load unless physical * addresses are obtainable. It is only possible to get diff --git a/lib/librte_eal/common/include/rte_option.h b/lib/librte_eal/common/include/rte_option.h new file mode 100644 index 00000000..8957b970 --- /dev/null +++ b/lib/librte_eal/common/include/rte_option.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef __INCLUDE_RTE_OPTION_H__ +#define __INCLUDE_RTE_OPTION_H__ + +/** + * @file + * + * This API offers the ability to register options to the EAL command line and + * map those options to functions that will be executed at the end of EAL + * initialization. These options will be available as part of the EAL command + * line of applications and are dynamically managed. + * + * This is used primarily by DPDK libraries offering command line options. + * Currently, this API is limited to registering options without argument. + * + * The register API can be used to resolve circular dependency issues + * between EAL and the library. The library uses EAL, but is also initialized + * by EAL. Hence, EAL depends on the init function of the library. The API + * introduced in rte_option allows us to register the library init with EAL + * (passing a function pointer) and avoid the circular dependency. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int (*rte_option_cb)(void); + +/* + * Structure describing the EAL command line option being registered. + */ +struct rte_option { + TAILQ_ENTRY(rte_option) next; /**< Next entry in the list. */ + char *opt_str; /**< The option name. */ + rte_option_cb cb; /**< Function called when option is used. */ + int enabled; /**< Set when the option is used. */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register an option to the EAL command line. + * When recognized, the associated function will be executed at the end of EAL + * initialization. + * + * The associated structure must be available the whole time this option is + * registered (i.e. not stack memory). + * + * @param opt + * Structure describing the option to parse. + */ +void __rte_experimental +rte_option_register(struct rte_option *opt); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/librte_eal/common/include/rte_string_fns.h b/lib/librte_eal/common/include/rte_string_fns.h index 97597a14..9a2a1ff9 100644 --- a/lib/librte_eal/common/include/rte_string_fns.h +++ b/lib/librte_eal/common/include/rte_string_fns.h @@ -16,6 +16,7 @@ extern "C" { #endif #include <stdio.h> +#include <string.h> /** * Takes string "string" parameter and splits it at character "delim" @@ -60,12 +61,10 @@ rte_strlcpy(char *dst, const char *src, size_t size) /* pull in a strlcpy function */ #ifdef RTE_EXEC_ENV_BSDAPP -#include <string.h> #ifndef __BSD_VISIBLE /* non-standard functions are hidden */ #define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) #endif - #else /* non-BSD platforms */ #ifdef RTE_USE_LIBBSD #include <bsd/string.h> @@ -76,6 +75,29 @@ rte_strlcpy(char *dst, const char *src, size_t size) #endif /* RTE_USE_LIBBSD */ #endif /* BSDAPP */ +/** + * Copy string src to buffer dst of size dsize. + * At most dsize-1 chars will be copied. + * Always NUL-terminates, unless (dsize == 0). + * Returns number of bytes copied (terminating NUL-byte excluded) on success ; + * negative errno on error. + * + * @param dst + * The destination string. + * + * @param src + * The input string to be copied. + * + * @param dsize + * Length in bytes of the destination buffer. + * + * @return + * The number of bytes copied on success + * -E2BIG if the destination buffer is too small. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h index 7c6714a2..412ed2db 100644 --- a/lib/librte_eal/common/include/rte_version.h +++ b/lib/librte_eal/common/include/rte_version.h @@ -32,7 +32,7 @@ extern "C" { /** * Minor version/month number i.e. the mm in yy.mm.z */ -#define RTE_VER_MONTH 8 +#define RTE_VER_MONTH 11 /** * Patch level number i.e. the z in yy.mm.z @@ -42,14 +42,14 @@ extern "C" { /** * Extra string to be appended to version number */ -#define RTE_VER_SUFFIX "" +#define RTE_VER_SUFFIX "-rc" /** * Patch release number * 0-15 = release candidates * 16 = release */ -#define RTE_VER_RELEASE 16 +#define RTE_VER_RELEASE 1 /** * Macro to compute a version number usable for comparisons diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h index 5ca13fcc..cae96fab 100644 --- a/lib/librte_eal/common/include/rte_vfio.h +++ b/lib/librte_eal/common/include/rte_vfio.h @@ -14,6 +14,8 @@ extern "C" { #endif +#include <stdint.h> + /* * determine if VFIO is present on the system */ @@ -22,6 +24,9 @@ extern "C" { #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) #define VFIO_PRESENT #endif /* kernel version >= 3.6.0 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) +#define HAVE_VFIO_DEV_REQ_INTERFACE +#endif /* kernel version >= 4.0.0 */ #endif /* RTE_EAL_VFIO */ #ifdef VFIO_PRESENT @@ -44,6 +49,30 @@ extern "C" { #define RTE_VFIO_NOIOMMU 8 #endif +/* + * capabilities are only supported on kernel 4.6+. there were also some API + * changes as well, so add a macro to get cap offset. + */ +#ifdef VFIO_REGION_INFO_FLAG_CAPS +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS +#define VFIO_CAP_OFFSET(x) (x->cap_offset) +#else +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3) +#define VFIO_CAP_OFFSET(x) (x->resv) +struct vfio_info_cap_header { + uint16_t id; + uint16_t version; + uint32_t next; +}; +#endif + +/* kernels 4.16+ can map BAR containing MSI-X table */ +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#else +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3 +#endif + #else /* not VFIO_PRESENT */ /* we don't need an actual definition, only pointer is used */ @@ -227,7 +256,7 @@ rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num); /** - * Open VFIO container fd or get an existing one + * Open a new VFIO container fd * * This function is only relevant to linux and will return * an error on BSD. diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c index e0a8ed15..1a74660d 100644 --- a/lib/librte_eal/common/malloc_elem.c +++ b/lib/librte_eal/common/malloc_elem.c @@ -39,10 +39,14 @@ malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align) contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align); /* if we're in IOVA as VA mode, or if we're in legacy mode with - * hugepages, all elements are IOVA-contiguous. + * hugepages, all elements are IOVA-contiguous. however, we can only + * make these assumptions about internal memory - externally allocated + * segments have to be checked. */ - if (rte_eal_iova_mode() == RTE_IOVA_VA || - (internal_config.legacy_mem && rte_eal_has_hugepages())) + if (!elem->msl->external && + (rte_eal_iova_mode() == RTE_IOVA_VA || + (internal_config.legacy_mem && + rte_eal_has_hugepages()))) return RTE_PTR_DIFF(data_end, contig_seg_start); cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz); diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c index 12aaf2d7..1973b6e6 100644 --- a/lib/librte_eal/common/malloc_heap.c +++ b/lib/librte_eal/common/malloc_heap.c @@ -29,6 +29,10 @@ #include "malloc_heap.h" #include "malloc_mp.h" +/* start external socket ID's at a very high number */ +#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */ +#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES)) + static unsigned check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) { @@ -66,6 +70,21 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) return check_flag & flags; } +int +malloc_socket_to_heap_id(unsigned int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (heap->socket_id == socket_id) + return i; + } + return -1; +} + /* * Expand the heap with a memory area. */ @@ -93,9 +112,17 @@ malloc_add_seg(const struct rte_memseg_list *msl, struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_memseg_list *found_msl; struct malloc_heap *heap; - int msl_idx; + int msl_idx, heap_idx; + + if (msl->external) + return 0; - heap = &mcfg->malloc_heaps[msl->socket_id]; + heap_idx = malloc_socket_to_heap_id(msl->socket_id); + if (heap_idx < 0) { + RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n"); + return -1; + } + heap = &mcfg->malloc_heaps[heap_idx]; /* msl is const, so find it */ msl_idx = msl - mcfg->memsegs; @@ -165,7 +192,9 @@ find_biggest_element(struct malloc_heap *heap, size_t *size, for (elem = LIST_FIRST(&heap->free_head[idx]); !!elem; elem = LIST_NEXT(elem, free_list)) { size_t cur_size; - if (!check_hugepage_sz(flags, elem->msl->page_sz)) + if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 && + !check_hugepage_sz(flags, + elem->msl->page_sz)) continue; if (contig) { cur_size = @@ -259,11 +288,13 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, int socket, unsigned int flags, size_t align, size_t bound, bool contig, struct rte_memseg **ms, int n_segs) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_memseg_list *msl; struct malloc_elem *elem = NULL; size_t alloc_sz; int allocd_pages; void *ret, *map_addr; + uint64_t mask; alloc_sz = (size_t)pg_sz * n_segs; @@ -291,6 +322,16 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, goto fail; } + if (mcfg->dma_maskbits) { + mask = ~((1ULL << mcfg->dma_maskbits) - 1); + if (rte_eal_check_dma_mask(mask)) { + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to DMA mask\n", + __func__); + goto fail; + } + } + /* add newly minted memsegs to malloc heap */ elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz); @@ -326,11 +367,9 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz, /* we can't know in advance how many pages we'll need, so we malloc */ ms = malloc(sizeof(*ms) * n_segs); - - memset(ms, 0, sizeof(*ms) * n_segs); - if (ms == NULL) return -1; + memset(ms, 0, sizeof(*ms) * n_segs); elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align, bound, contig, ms, n_segs); @@ -560,12 +599,14 @@ alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket, /* this will try lower page sizes first */ static void * -heap_alloc_on_socket(const char *type, size_t size, int socket, - unsigned int flags, size_t align, size_t bound, bool contig) +malloc_heap_alloc_on_heap_id(const char *type, size_t size, + unsigned int heap_id, unsigned int flags, size_t align, + size_t bound, bool contig) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + int socket_id; void *ret; rte_spinlock_lock(&(heap->lock)); @@ -583,12 +624,28 @@ heap_alloc_on_socket(const char *type, size_t size, int socket, * we may still be able to allocate memory from appropriate page sizes, * we just need to request more memory first. */ + + socket_id = rte_socket_id_by_idx(heap_id); + /* + * if socket ID is negative, we cannot find a socket ID for this heap - + * which means it's an external heap. those can have unexpected page + * sizes, so if the user asked to allocate from there - assume user + * knows what they're doing, and allow allocating from there with any + * page size flags. + */ + if (socket_id < 0) + size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY; + ret = heap_alloc(heap, type, size, size_flags, align, bound, contig); if (ret != NULL) goto alloc_unlock; - if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound, - contig)) { + /* if socket ID is invalid, this is an external heap */ + if (socket_id < 0) + goto alloc_unlock; + + if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align, + bound, contig)) { ret = heap_alloc(heap, type, size, flags, align, bound, contig); /* this should have succeeded */ @@ -604,14 +661,14 @@ void * malloc_heap_alloc(const char *type, size_t size, int socket_arg, unsigned int flags, size_t align, size_t bound, bool contig) { - int socket, i, cur_socket; + int socket, heap_id, i; void *ret; /* return NULL if size is 0 or alignment is not power-of-2 */ if (size == 0 || (align && !rte_is_power_of_2(align))) return NULL; - if (!rte_eal_has_hugepages()) + if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES) socket_arg = SOCKET_ID_ANY; if (socket_arg == SOCKET_ID_ANY) @@ -619,22 +676,25 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg, else socket = socket_arg; - /* Check socket parameter */ - if (socket >= RTE_MAX_NUMA_NODES) + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) return NULL; - ret = heap_alloc_on_socket(type, size, socket, flags, align, bound, - contig); + ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align, + bound, contig); if (ret != NULL || socket_arg != SOCKET_ID_ANY) return ret; - /* try other heaps */ + /* try other heaps. we are only iterating through native DPDK sockets, + * so external heaps won't be included. + */ for (i = 0; i < (int) rte_socket_count(); i++) { - cur_socket = rte_socket_id_by_idx(i); - if (cur_socket == socket) + if (i == heap_id) continue; - ret = heap_alloc_on_socket(type, size, cur_socket, flags, - align, bound, contig); + ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align, + bound, contig); if (ret != NULL) return ret; } @@ -642,11 +702,11 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg, } static void * -heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags, - size_t align, bool contig) +heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id, + unsigned int flags, size_t align, bool contig) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; void *ret; rte_spinlock_lock(&(heap->lock)); @@ -664,7 +724,7 @@ void * malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, size_t align, bool contig) { - int socket, i, cur_socket; + int socket, i, cur_socket, heap_id; void *ret; /* return NULL if align is not power-of-2 */ @@ -679,11 +739,13 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, else socket = socket_arg; - /* Check socket parameter */ - if (socket >= RTE_MAX_NUMA_NODES) + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) return NULL; - ret = heap_alloc_biggest_on_socket(type, socket, flags, align, + ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align, contig); if (ret != NULL || socket_arg != SOCKET_ID_ANY) return ret; @@ -693,8 +755,8 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, cur_socket = rte_socket_id_by_idx(i); if (cur_socket == socket) continue; - ret = heap_alloc_biggest_on_socket(type, cur_socket, flags, - align, contig); + ret = heap_alloc_biggest_on_heap_id(type, i, flags, align, + contig); if (ret != NULL) return ret; } @@ -756,8 +818,10 @@ malloc_heap_free(struct malloc_elem *elem) /* anything after this is a bonus */ ret = 0; - /* ...of which we can't avail if we are in legacy mode */ - if (internal_config.legacy_mem) + /* ...of which we can't avail if we are in legacy mode, or if this is an + * externally allocated segment. + */ + if (internal_config.legacy_mem || (msl->external > 0)) goto free_unlock; /* check if we can free any memory back to the system */ @@ -914,7 +978,7 @@ malloc_heap_resize(struct malloc_elem *elem, size_t size) } /* - * Function to retrieve data for heap on given socket + * Function to retrieve data for a given heap */ int malloc_heap_get_stats(struct malloc_heap *heap, @@ -952,7 +1016,7 @@ malloc_heap_get_stats(struct malloc_heap *heap, } /* - * Function to retrieve data for heap on given socket + * Function to retrieve data for a given heap */ void malloc_heap_dump(struct malloc_heap *heap, FILE *f) @@ -973,10 +1037,216 @@ malloc_heap_dump(struct malloc_heap *heap, FILE *f) rte_spinlock_unlock(&heap->lock); } +static int +destroy_seg(struct malloc_elem *elem, size_t len) +{ + struct malloc_heap *heap = elem->heap; + struct rte_memseg_list *msl; + + msl = elem->msl; + + /* notify all subscribers that a memory area is going to be removed */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len); + + /* this element can be removed */ + malloc_elem_free_list_remove(elem); + malloc_elem_hide_region(elem, elem, len); + + heap->total_size -= len; + + memset(elem, 0, sizeof(*elem)); + + /* destroy the fbarray backing this memory */ + if (rte_fbarray_destroy(&msl->memseg_arr) < 0) + return -1; + + /* reset the memseg list */ + memset(msl, 0, sizeof(*msl)); + + return 0; +} + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + char fbarray_name[RTE_FBARRAY_NAME_LEN]; + struct rte_memseg_list *msl = NULL; + struct rte_fbarray *arr; + size_t seg_len = n_pages * page_sz; + unsigned int i; + + /* first, find a free memseg list */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *tmp = &mcfg->memsegs[i]; + if (tmp->base_va == NULL) { + msl = tmp; + break; + } + } + if (msl == NULL) { + RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n"); + rte_errno = ENOSPC; + return -1; + } + + snprintf(fbarray_name, sizeof(fbarray_name) - 1, "%s_%p", + heap->name, va_addr); + + /* create the backing fbarray */ + if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages, + sizeof(struct rte_memseg)) < 0) { + RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n"); + return -1; + } + arr = &msl->memseg_arr; + + /* fbarray created, fill it up */ + for (i = 0; i < n_pages; i++) { + struct rte_memseg *ms; + + rte_fbarray_set_used(arr, i); + ms = rte_fbarray_get(arr, i); + ms->addr = RTE_PTR_ADD(va_addr, i * page_sz); + ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i]; + ms->hugepage_sz = page_sz; + ms->len = page_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->socket_id = heap->socket_id; + } + + /* set up the memseg list */ + msl->base_va = va_addr; + msl->page_sz = page_sz; + msl->socket_id = heap->socket_id; + msl->len = seg_len; + msl->version = 0; + msl->external = 1; + + /* erase contents of new memory */ + memset(va_addr, 0, seg_len); + + /* now, add newly minted memory to the malloc heap */ + malloc_heap_add_memory(heap, msl, va_addr, seg_len); + + heap->total_size += seg_len; + + /* all done! */ + RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n", + heap->name, va_addr); + + /* notify all subscribers that a new memory area has been added */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + va_addr, seg_len); + + return 0; +} + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len) +{ + struct malloc_elem *elem = heap->first; + + /* find element with specified va address */ + while (elem != NULL && elem != va_addr) { + elem = elem->next; + /* stop if we've blown past our VA */ + if (elem > (struct malloc_elem *)va_addr) { + rte_errno = ENOENT; + return -1; + } + } + /* check if element was found */ + if (elem == NULL || elem->msl->len != len) { + rte_errno = ENOENT; + return -1; + } + /* if element's size is not equal to segment len, segment is busy */ + if (elem->state == ELEM_BUSY || elem->size != len) { + rte_errno = EBUSY; + return -1; + } + return destroy_seg(elem, len); +} + +int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint32_t next_socket_id = mcfg->next_socket_id; + + /* prevent overflow. did you really create 2 billion heaps??? */ + if (next_socket_id > INT32_MAX) { + RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n"); + rte_errno = ENOSPC; + return -1; + } + + /* initialize empty heap */ + heap->alloc_count = 0; + heap->first = NULL; + heap->last = NULL; + LIST_INIT(heap->free_head); + rte_spinlock_init(&heap->lock); + heap->total_size = 0; + heap->socket_id = next_socket_id; + + /* we hold a global mem hotplug writelock, so it's safe to increment */ + mcfg->next_socket_id++; + + /* set up name */ + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + return 0; +} + +int +malloc_heap_destroy(struct malloc_heap *heap) +{ + if (heap->alloc_count != 0) { + RTE_LOG(ERR, EAL, "Heap is still in use\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->first != NULL || heap->last != NULL) { + RTE_LOG(ERR, EAL, "Heap still contains memory segments\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->total_size != 0) + RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n"); + + /* after this, the lock will be dropped */ + memset(heap, 0, sizeof(*heap)); + + return 0; +} + int rte_eal_malloc_heap_init(void) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* assign min socket ID to external heaps */ + mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID; + + /* assign names to default DPDK heaps */ + for (i = 0; i < rte_socket_count(); i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + char heap_name[RTE_HEAP_NAME_MAX_LEN]; + int socket_id = rte_socket_id_by_idx(i); + + snprintf(heap_name, sizeof(heap_name) - 1, + "socket_%i", socket_id); + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + heap->socket_id = socket_id; + } + } + if (register_mp_requests()) { RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n"); diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h index f52cb555..e48996d5 100644 --- a/lib/librte_eal/common/malloc_heap.h +++ b/lib/librte_eal/common/malloc_heap.h @@ -34,6 +34,20 @@ malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags, size_t align, bool contig); int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name); + +int +malloc_heap_destroy(struct malloc_heap *heap); + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz); + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len); + +int malloc_heap_free(struct malloc_elem *elem); int @@ -47,6 +61,9 @@ void malloc_heap_dump(struct malloc_heap *heap, FILE *f); int +malloc_socket_to_heap_id(unsigned int socket_id); + +int rte_eal_malloc_heap_init(void); #ifdef __cplusplus diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c index 931c14bc..5f2d4e0b 100644 --- a/lib/librte_eal/common/malloc_mp.c +++ b/lib/librte_eal/common/malloc_mp.c @@ -194,13 +194,11 @@ handle_alloc_request(const struct malloc_mp_req *m, /* we can't know in advance how many pages we'll need, so we malloc */ ms = malloc(sizeof(*ms) * n_segs); - - memset(ms, 0, sizeof(*ms) * n_segs); - if (ms == NULL) { RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n"); goto fail; } + memset(ms, 0, sizeof(*ms) * n_segs); elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket, ar->flags, ar->align, ar->bound, ar->contig, ms, diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build index 56005bea..2a10d57d 100644 --- a/lib/librte_eal/common/meson.build +++ b/lib/librte_eal/common/meson.build @@ -14,6 +14,7 @@ common_sources = files( 'eal_common_errno.c', 'eal_common_fbarray.c', 'eal_common_hexdump.c', + 'eal_common_hypervisor.c', 'eal_common_launch.c', 'eal_common_lcore.c', 'eal_common_log.c', @@ -27,11 +28,13 @@ common_sources = files( 'eal_common_thread.c', 'eal_common_timer.c', 'eal_common_uuid.c', + 'hotplug_mp.c', 'malloc_elem.c', 'malloc_heap.c', 'malloc_mp.c', 'rte_keepalive.c', 'rte_malloc.c', + 'rte_option.c', 'rte_reciprocal.c', 'rte_service.c' ) @@ -59,6 +62,7 @@ common_headers = files( 'include/rte_errno.h', 'include/rte_fbarray.h', 'include/rte_hexdump.h', + 'include/rte_hypervisor.h', 'include/rte_interrupts.h', 'include/rte_keepalive.h', 'include/rte_launch.h', @@ -68,6 +72,7 @@ common_headers = files( 'include/rte_malloc_heap.h', 'include/rte_memory.h', 'include/rte_memzone.h', + 'include/rte_option.h', 'include/rte_pci_dev_feature_defs.h', 'include/rte_pci_dev_features.h', 'include/rte_per_lcore.h', diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c index b51a6d11..9e61dc41 100644 --- a/lib/librte_eal/common/rte_malloc.c +++ b/lib/librte_eal/common/rte_malloc.c @@ -8,6 +8,7 @@ #include <string.h> #include <sys/queue.h> +#include <rte_errno.h> #include <rte_memcpy.h> #include <rte_memory.h> #include <rte_eal.h> @@ -23,6 +24,7 @@ #include <rte_malloc.h> #include "malloc_elem.h" #include "malloc_heap.h" +#include "eal_memalloc.h" /* Free the memory space back to heap */ @@ -44,13 +46,15 @@ rte_malloc_socket(const char *type, size_t size, unsigned int align, if (size == 0 || (align && !rte_is_power_of_2(align))) return NULL; - if (!rte_eal_has_hugepages()) + /* if there are no hugepages and if we are not allocating from an + * external heap, use memory from any socket available. checking for + * socket being external may return -1 in case of invalid socket, but + * that's OK - if there are no hugepages, it doesn't matter. + */ + if (rte_malloc_heap_socket_is_external(socket_arg) != 1 && + !rte_eal_has_hugepages()) socket_arg = SOCKET_ID_ANY; - /* Check socket parameter */ - if (socket_arg >= RTE_MAX_NUMA_NODES) - return NULL; - return malloc_heap_alloc(type, size, socket_arg, 0, align == 0 ? 1 : align, 0, false); } @@ -152,11 +156,20 @@ rte_malloc_get_socket_stats(int socket, struct rte_malloc_socket_stats *socket_stats) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int heap_idx, ret = -1; - if (socket >= RTE_MAX_NUMA_NODES || socket < 0) - return -1; + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + heap_idx = malloc_socket_to_heap_id(socket); + if (heap_idx < 0) + goto unlock; - return malloc_heap_get_stats(&mcfg->malloc_heaps[socket], socket_stats); + ret = malloc_heap_get_stats(&mcfg->malloc_heaps[heap_idx], + socket_stats); +unlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; } /* @@ -168,12 +181,75 @@ rte_malloc_dump_heaps(FILE *f) struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; unsigned int idx; - for (idx = 0; idx < rte_socket_count(); idx++) { - unsigned int socket = rte_socket_id_by_idx(idx); - fprintf(f, "Heap on socket %i:\n", socket); - malloc_heap_dump(&mcfg->malloc_heaps[socket], f); + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + fprintf(f, "Heap id: %u\n", idx); + malloc_heap_dump(&mcfg->malloc_heaps[idx], f); + } + + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +} + +int +rte_malloc_heap_get_socket(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + unsigned int idx; + int ret; + + if (name == NULL || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if (!strncmp(name, tmp->name, RTE_HEAP_NAME_MAX_LEN)) { + heap = tmp; + break; + } + } + + if (heap != NULL) { + ret = heap->socket_id; + } else { + rte_errno = ENOENT; + ret = -1; + } + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_socket_is_external(int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + int ret = -1; + + if (socket_id == SOCKET_ID_ANY) + return 0; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if ((int)tmp->socket_id == socket_id) { + /* external memory always has large socket ID's */ + ret = tmp->socket_id >= RTE_MAX_NUMA_NODES; + break; + } } + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; } /* @@ -182,14 +258,20 @@ rte_malloc_dump_heaps(FILE *f) void rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) { - unsigned int socket; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int heap_id; struct rte_malloc_socket_stats sock_stats; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + /* Iterate through all initialised heaps */ - for (socket=0; socket< RTE_MAX_NUMA_NODES; socket++) { - if ((rte_malloc_get_socket_stats(socket, &sock_stats) < 0)) - continue; + for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + + malloc_heap_get_stats(heap, &sock_stats); - fprintf(f, "Socket:%u\n", socket); + fprintf(f, "Heap id:%u\n", heap_id); + fprintf(f, "\tHeap name:%s\n", heap->name); fprintf(f, "\tHeap_size:%zu,\n", sock_stats.heap_totalsz_bytes); fprintf(f, "\tFree_size:%zu,\n", sock_stats.heap_freesz_bytes); fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes); @@ -198,6 +280,7 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count); fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count); } + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); return; } @@ -223,7 +306,7 @@ rte_malloc_virt2iova(const void *addr) if (elem == NULL) return RTE_BAD_IOVA; - if (rte_eal_iova_mode() == RTE_IOVA_VA) + if (!elem->msl->external && rte_eal_iova_mode() == RTE_IOVA_VA) return (uintptr_t) addr; ms = rte_mem_virt2memseg(addr, elem->msl); @@ -235,3 +318,320 @@ rte_malloc_virt2iova(const void *addr) return ms->iova + RTE_PTR_DIFF(addr, ms->addr); } + +static struct malloc_heap * +find_named_heap(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (!strncmp(name, heap->name, RTE_HEAP_NAME_MAX_LEN)) + return heap; + } + return NULL; +} + +int +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + unsigned int n; + int ret; + + if (heap_name == NULL || va_addr == NULL || + page_sz == 0 || !rte_is_power_of_2(page_sz) || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + ret = -1; + goto unlock; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot add memory to internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + n = len / page_sz; + if (n != n_pages && iova_addrs != NULL) { + rte_errno = EINVAL; + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_add_external_memory(heap, va_addr, iova_addrs, n, + page_sz); + rte_spinlock_unlock(&heap->lock); + +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot remove memory from internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_remove_external_memory(heap, va_addr, len); + rte_spinlock_unlock(&heap->lock); + +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +struct sync_mem_walk_arg { + void *va_addr; + size_t len; + int result; + bool attach; +}; + +static int +sync_mem_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct sync_mem_walk_arg *wa = arg; + size_t len = msl->page_sz * msl->memseg_arr.len; + + if (msl->base_va == wa->va_addr && + len == wa->len) { + struct rte_memseg_list *found_msl; + int msl_idx, ret; + + /* msl is const */ + msl_idx = msl - mcfg->memsegs; + found_msl = &mcfg->memsegs[msl_idx]; + + if (wa->attach) { + ret = rte_fbarray_attach(&found_msl->memseg_arr); + } else { + /* notify all subscribers that a memory area is about to + * be removed + */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + msl->base_va, msl->len); + ret = rte_fbarray_detach(&found_msl->memseg_arr); + } + + if (ret < 0) { + wa->result = -rte_errno; + } else { + /* notify all subscribers that a new memory area was + * added + */ + if (wa->attach) + eal_memalloc_mem_event_notify( + RTE_MEM_EVENT_ALLOC, + msl->base_va, msl->len); + wa->result = 0; + } + return 1; + } + return 0; +} + +static int +sync_memory(const char *heap_name, void *va_addr, size_t len, bool attach) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + struct sync_mem_walk_arg wa; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to sync to internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + /* find corresponding memseg list to sync to */ + wa.va_addr = va_addr; + wa.len = len; + wa.result = -ENOENT; /* fail unless explicitly told to succeed */ + wa.attach = attach; + + /* we're already holding a read lock */ + rte_memseg_list_walk_thread_unsafe(sync_mem_walk, &wa); + + if (wa.result < 0) { + rte_errno = -wa.result; + ret = -1; + } else { + /* notify all subscribers that a new memory area was added */ + if (attach) + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + va_addr, len); + ret = 0; + } +unlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; +} + +int +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, true); +} + +int +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, false); +} + +int +rte_malloc_heap_create(const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int i, ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + /* check if there is space in the heap list, or if heap with this name + * already exists. + */ + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[i]; + /* existing heap */ + if (strncmp(heap_name, tmp->name, + RTE_HEAP_NAME_MAX_LEN) == 0) { + RTE_LOG(ERR, EAL, "Heap %s already exists\n", + heap_name); + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + /* empty heap */ + if (strnlen(tmp->name, RTE_HEAP_NAME_MAX_LEN) == 0) { + heap = tmp; + break; + } + } + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Cannot create new heap: no space\n"); + rte_errno = ENOSPC; + ret = -1; + goto unlock; + } + + /* we're sure that we can create a new heap, so do it */ + ret = malloc_heap_create(heap, heap_name); +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_destroy(const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + /* start from non-socket heaps */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Heap %s not found\n", heap_name); + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to destroy internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + /* sanity checks done, now we can destroy the heap */ + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_destroy(heap); + + /* if we failed, lock is still active */ + if (ret < 0) + rte_spinlock_unlock(&heap->lock); +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} diff --git a/lib/librte_eal/common/rte_option.c b/lib/librte_eal/common/rte_option.c new file mode 100644 index 00000000..02d59a86 --- /dev/null +++ b/lib/librte_eal/common/rte_option.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + */ + +#include <unistd.h> +#include <string.h> + +#include <rte_eal.h> +#include <rte_option.h> + +#include "eal_private.h" + +TAILQ_HEAD(rte_option_list, rte_option); + +struct rte_option_list rte_option_list = + TAILQ_HEAD_INITIALIZER(rte_option_list); + +static struct rte_option *option; + +int +rte_option_parse(const char *opt) +{ + /* Check if the option is registered */ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (strcmp(opt, option->opt_str) == 0) { + option->enabled = 1; + return 0; + } + } + + return -1; +} + +void __rte_experimental +rte_option_register(struct rte_option *opt) +{ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (strcmp(opt->opt_str, option->opt_str) == 0) + RTE_LOG(INFO, EAL, "Option %s has already been registered.", + opt->opt_str); + return; + } + + TAILQ_INSERT_HEAD(&rte_option_list, opt, next); +} + +void +rte_option_init(void) +{ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (option->enabled) + option->cb(); + } +} diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index fd92c75c..51deb579 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH) EXPORT_MAP := ../../rte_eal_version.map VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) -LIBABIVER := 8 +LIBABIVER := 9 VPATH += $(RTE_SDK)/lib/librte_eal/common @@ -70,10 +70,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c @@ -85,22 +87,6 @@ SRCS-y += rte_cycles.c CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) -CFLAGS_eal.o := -D_GNU_SOURCE -CFLAGS_eal_interrupts.o := -D_GNU_SOURCE -CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE -CFLAGS_eal_timer.o := -D_GNU_SOURCE -CFLAGS_eal_lcore.o := -D_GNU_SOURCE -CFLAGS_eal_memalloc.o := -D_GNU_SOURCE -CFLAGS_eal_thread.o := -D_GNU_SOURCE -CFLAGS_eal_log.o := -D_GNU_SOURCE -CFLAGS_eal_common_log.o := -D_GNU_SOURCE -CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE -CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE -CFLAGS_eal_common_options.o := -D_GNU_SOURCE -CFLAGS_eal_common_thread.o := -D_GNU_SOURCE -CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE -CFLAGS_rte_cycles.o := -D_GNU_SOURCE - # workaround for a gcc bug with noreturn attribute # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c index e59ac657..361744d4 100644 --- a/lib/librte_eal/linuxapp/eal/eal.c +++ b/lib/librte_eal/linuxapp/eal/eal.c @@ -48,6 +48,7 @@ #include <rte_atomic.h> #include <malloc_heap.h> #include <rte_vfio.h> +#include <rte_option.h> #include "eal_private.h" #include "eal_thread.h" @@ -149,7 +150,7 @@ eal_create_runtime_dir(void) } const char * -eal_get_runtime_dir(void) +rte_eal_get_runtime_dir(void) { return runtime_dir; } @@ -263,6 +264,8 @@ rte_eal_config_create(void) * processes could later map the config into this exact location */ rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + rte_config.mem_config->dma_maskbits = 0; + } /* attach to an existing shared memory config */ @@ -352,6 +355,24 @@ eal_proc_type_detect(void) return ptype; } +/* copies data from internal config to shared config */ +static void +eal_update_mem_config(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + mcfg->legacy_mem = internal_config.legacy_mem; + mcfg->single_file_segments = internal_config.single_file_segments; +} + +/* copies data from shared config to internal config */ +static void +eal_update_internal_config(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + internal_config.legacy_mem = mcfg->legacy_mem; + internal_config.single_file_segments = mcfg->single_file_segments; +} + /* Sets up rte_config structure with the pointer to shared memory config.*/ static void rte_config_init(void) @@ -361,11 +382,13 @@ rte_config_init(void) switch (rte_config.process_type){ case RTE_PROC_PRIMARY: rte_eal_config_create(); + eal_update_mem_config(); break; case RTE_PROC_SECONDARY: rte_eal_config_attach(); rte_eal_mcfg_wait_complete(rte_config.mem_config); rte_eal_config_reattach(); + eal_update_internal_config(); break; case RTE_PROC_AUTO: case RTE_PROC_INVALID: @@ -580,12 +603,20 @@ eal_parse_args(int argc, char **argv) argvopt = argv; optind = 1; + opterr = 0; while ((opt = getopt_long(argc, argvopt, eal_short_options, eal_long_options, &option_index)) != EOF) { - /* getopt is not happy, stop right now */ + /* + * getopt didn't recognise the option, lets parse the + * registered options to see if the flag is valid + */ if (opt == '?') { + ret = rte_option_parse(argv[optind-1]); + if (ret == 0) + continue; + eal_usage(prgname); ret = -1; goto out; @@ -725,6 +756,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg) { int *socket_id = arg; + if (msl->external) + return 0; + return *socket_id == msl->socket_id; } @@ -793,7 +827,8 @@ rte_eal_init(int argc, char **argv) int i, fctret, ret; pthread_t thread_id; static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); - const char *logid; + const char *p; + static char logid[PATH_MAX]; char cpuset[RTE_CPU_AFFINITY_STR_LEN]; char thread_name[RTE_MAX_THREAD_NAME_LEN]; @@ -810,9 +845,8 @@ rte_eal_init(int argc, char **argv) return -1; } - logid = strrchr(argv[0], '/'); - logid = strdup(logid ? logid + 1: argv[0]); - + p = strrchr(argv[0], '/'); + strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid)); thread_id = pthread_self(); eal_reset_internal_config(&internal_config); @@ -835,7 +869,7 @@ rte_eal_init(int argc, char **argv) } if (eal_plugins_init() < 0) { - rte_eal_init_alert("Cannot init plugins\n"); + rte_eal_init_alert("Cannot init plugins"); rte_errno = EINVAL; rte_atomic32_clear(&run_once); return -1; @@ -850,7 +884,7 @@ rte_eal_init(int argc, char **argv) rte_config_init(); if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); return -1; } @@ -858,30 +892,43 @@ rte_eal_init(int argc, char **argv) * bus through mp channel in the secondary process before the bus scan. */ if (rte_mp_channel_init() < 0) { - rte_eal_init_alert("failed to init mp channel\n"); + rte_eal_init_alert("failed to init mp channel"); if (rte_eal_process_type() == RTE_PROC_PRIMARY) { rte_errno = EFAULT; return -1; } } + /* register multi-process action callbacks for hotplug */ + if (rte_mp_dev_hotplug_init() < 0) { + rte_eal_init_alert("failed to register mp callback for hotplug"); + return -1; + } + if (rte_bus_scan()) { - rte_eal_init_alert("Cannot scan the buses for devices\n"); + rte_eal_init_alert("Cannot scan the buses for devices"); rte_errno = ENODEV; rte_atomic32_clear(&run_once); return -1; } - /* autodetect the iova mapping mode (default is iova_pa) */ - rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class(); - - /* Workaround for KNI which requires physical address to work */ - if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && - rte_eal_check_module("rte_kni") == 1) { - rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; - RTE_LOG(WARNING, EAL, - "Some devices want IOVA as VA but PA will be used because.. " - "KNI module inserted\n"); + /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ + rte_eal_get_configuration()->iova_mode = + rte_bus_get_iommu_class(); + + /* Workaround for KNI which requires physical address to work */ + if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && + rte_eal_check_module("rte_kni") == 1) { + rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; + RTE_LOG(WARNING, EAL, + "Some devices want IOVA as VA but PA will be used because.. " + "KNI module inserted\n"); + } + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; } if (internal_config.no_hugetlbfs == 0) { @@ -924,7 +971,7 @@ rte_eal_init(int argc, char **argv) #ifdef VFIO_PRESENT if (rte_eal_vfio_setup() < 0) { - rte_eal_init_alert("Cannot init VFIO\n"); + rte_eal_init_alert("Cannot init VFIO"); rte_errno = EAGAIN; rte_atomic32_clear(&run_once); return -1; @@ -935,13 +982,13 @@ rte_eal_init(int argc, char **argv) * initialize memzones first. */ if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone\n"); + rte_eal_init_alert("Cannot init memzone"); rte_errno = ENODEV; return -1; } if (rte_eal_memory_init() < 0) { - rte_eal_init_alert("Cannot init memory\n"); + rte_eal_init_alert("Cannot init memory"); rte_errno = ENOMEM; return -1; } @@ -950,25 +997,25 @@ rte_eal_init(int argc, char **argv) eal_hugedirs_unlock(); if (rte_eal_malloc_heap_init() < 0) { - rte_eal_init_alert("Cannot init malloc heap\n"); + rte_eal_init_alert("Cannot init malloc heap"); rte_errno = ENODEV; return -1; } if (rte_eal_tailqs_init() < 0) { - rte_eal_init_alert("Cannot init tail queues for objects\n"); + rte_eal_init_alert("Cannot init tail queues for objects"); rte_errno = EFAULT; return -1; } if (rte_eal_alarm_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); /* rte_eal_alarm_init sets rte_errno on failure. */ return -1; } if (rte_eal_timer_init() < 0) { - rte_eal_init_alert("Cannot init HPET or TSC timers\n"); + rte_eal_init_alert("Cannot init HPET or TSC timers"); rte_errno = ENOTSUP; return -1; } @@ -979,8 +1026,8 @@ rte_eal_init(int argc, char **argv) ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n", - rte_config.master_lcore, (int)thread_id, cpuset, + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + rte_config.master_lcore, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); RTE_LCORE_FOREACH_SLAVE(i) { @@ -1022,14 +1069,14 @@ rte_eal_init(int argc, char **argv) /* initialize services so vdevs register service during bus_probe. */ ret = rte_service_init(); if (ret) { - rte_eal_init_alert("rte_service_init() failed\n"); + rte_eal_init_alert("rte_service_init() failed"); rte_errno = ENOEXEC; return -1; } /* Probe all the buses and devices/drivers on them */ if (rte_bus_probe()) { - rte_eal_init_alert("Cannot probe devices\n"); + rte_eal_init_alert("Cannot probe devices"); rte_errno = ENOTSUP; return -1; } @@ -1051,6 +1098,9 @@ rte_eal_init(int argc, char **argv) rte_eal_mcfg_complete(); + /* Call each registered callback, if enabled */ + rte_option_init(); + return fctret; } @@ -1059,7 +1109,12 @@ mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg __rte_unused) { /* ms is const, so find this memseg */ - struct rte_memseg *found = rte_mem_virt2memseg(ms->addr, msl); + struct rte_memseg *found; + + if (msl->external) + return 0; + + found = rte_mem_virt2memseg(ms->addr, msl); found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c index 1cf6aebf..d589c692 100644 --- a/lib/librte_eal/linuxapp/eal/eal_dev.c +++ b/lib/librte_eal/linuxapp/eal/eal_dev.c @@ -4,6 +4,8 @@ #include <string.h> #include <unistd.h> +#include <fcntl.h> +#include <signal.h> #include <sys/socket.h> #include <linux/netlink.h> @@ -14,15 +16,32 @@ #include <rte_malloc.h> #include <rte_interrupts.h> #include <rte_alarm.h> +#include <rte_bus.h> +#include <rte_eal.h> +#include <rte_spinlock.h> +#include <rte_errno.h> #include "eal_private.h" static struct rte_intr_handle intr_handle = {.fd = -1 }; static bool monitor_started; +static bool hotplug_handle; #define EAL_UEV_MSG_LEN 4096 #define EAL_UEV_MSG_ELEM_LEN 128 +/* + * spinlock for device hot-unplug failure handling. If it try to access bus or + * device, such as handle sigbus on bus or handle memory failure for device + * just need to use this lock. It could protect the bus and the device to avoid + * race condition. + */ +static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; + +static struct sigaction sigbus_action_old; + +static int sigbus_need_recover; + static void dev_uev_handler(__rte_unused void *param); /* identify the system layer which reports this event. */ @@ -33,6 +52,55 @@ enum eal_dev_event_subsystem { EAL_DEV_EVENT_SUBSYSTEM_MAX }; +static void +sigbus_action_recover(void) +{ + if (sigbus_need_recover) { + sigaction(SIGBUS, &sigbus_action_old, NULL); + sigbus_need_recover = 0; + } +} + +static void sigbus_handler(int signum, siginfo_t *info, + void *ctx __rte_unused) +{ + int ret; + + RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n", + (int)pthread_self(), info->si_addr); + + rte_spinlock_lock(&failure_handle_lock); + ret = rte_bus_sigbus_handler(info->si_addr); + rte_spinlock_unlock(&failure_handle_lock); + if (ret == -1) { + rte_exit(EXIT_FAILURE, + "Failed to handle SIGBUS for hot-unplug, " + "(rte_errno: %s)!", strerror(rte_errno)); + } else if (ret == 1) { + if (sigbus_action_old.sa_flags == SA_SIGINFO + && sigbus_action_old.sa_sigaction) { + (*(sigbus_action_old.sa_sigaction))(signum, + info, ctx); + } else if (sigbus_action_old.sa_flags != SA_SIGINFO + && sigbus_action_old.sa_handler) { + (*(sigbus_action_old.sa_handler))(signum); + } else { + rte_exit(EXIT_FAILURE, + "Failed to handle generic SIGBUS!"); + } + } + + RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n"); +} + +static int cmp_dev_name(const struct rte_device *dev, + const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + static int dev_uev_socket_fd_create(void) { @@ -147,6 +215,9 @@ dev_uev_handler(__rte_unused void *param) struct rte_dev_event uevent; int ret; char buf[EAL_UEV_MSG_LEN]; + struct rte_bus *bus; + struct rte_device *dev; + const char *busname = ""; memset(&uevent, 0, sizeof(struct rte_dev_event)); memset(buf, 0, EAL_UEV_MSG_LEN); @@ -171,8 +242,43 @@ dev_uev_handler(__rte_unused void *param) RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", uevent.devname, uevent.type, uevent.subsystem); - if (uevent.devname) - dev_callback_process(uevent.devname, uevent.type); + switch (uevent.subsystem) { + case EAL_DEV_EVENT_SUBSYSTEM_PCI: + case EAL_DEV_EVENT_SUBSYSTEM_UIO: + busname = "pci"; + break; + default: + break; + } + + if (uevent.devname) { + if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) { + rte_spinlock_lock(&failure_handle_lock); + bus = rte_bus_find_by_name(busname); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", + busname); + return; + } + + dev = bus->find_device(NULL, cmp_dev_name, + uevent.devname); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find device (%s) on " + "bus (%s)\n", uevent.devname, busname); + return; + } + + ret = bus->hot_unplug_handler(dev); + rte_spinlock_unlock(&failure_handle_lock); + if (ret) { + RTE_LOG(ERR, EAL, "Can not handle hot-unplug " + "for device (%s)\n", dev->name); + return; + } + } + rte_dev_event_callback_process(uevent.devname, uevent.type); + } } int __rte_experimental @@ -220,5 +326,67 @@ rte_dev_event_monitor_stop(void) close(intr_handle.fd); intr_handle.fd = -1; monitor_started = false; + return 0; } + +int +dev_sigbus_handler_register(void) +{ + sigset_t mask; + struct sigaction action; + + rte_errno = 0; + + if (sigbus_need_recover) + return 0; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = SA_SIGINFO; + action.sa_mask = mask; + action.sa_sigaction = sigbus_handler; + sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old); + + return rte_errno; +} + +int +dev_sigbus_handler_unregister(void) +{ + rte_errno = 0; + + sigbus_action_recover(); + + return rte_errno; +} + +int __rte_experimental +rte_dev_hotplug_handle_enable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_register(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to register sigbus handler for devices.\n"); + + hotplug_handle = true; + + return ret; +} + +int __rte_experimental +rte_dev_hotplug_handle_disable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_unregister(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to unregister sigbus handler for devices.\n"); + + hotplug_handle = false; + + return ret; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c index 3a7d4b22..0eab1cf7 100644 --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -6,6 +6,7 @@ #include <sys/types.h> #include <sys/file.h> #include <dirent.h> +#include <fcntl.h> #include <stdint.h> #include <stdlib.h> #include <stdio.h> diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c index 4076c6d6..39252a88 100644 --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -33,6 +33,7 @@ #include <rte_errno.h> #include <rte_spinlock.h> #include <rte_pause.h> +#include <rte_vfio.h> #include "eal_private.h" #include "eal_vfio.h" @@ -308,6 +309,66 @@ vfio_disable_msix(const struct rte_intr_handle *intr_handle) { return ret; } + +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE +/* enable req notifier */ +static int +vfio_enable_req(const struct rte_intr_handle *intr_handle) +{ + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable req notifier */ +static int +vfio_disable_req(const struct rte_intr_handle *intr_handle) +{ + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n", + intr_handle->fd); + + return ret; +} +#endif #endif static int @@ -556,6 +617,12 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle) if (vfio_enable_intx(intr_handle)) return -1; break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_enable_req(intr_handle)) + return -1; + break; +#endif #endif /* not used at this moment */ case RTE_INTR_HANDLE_DEV_EVENT: @@ -606,6 +673,12 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle) if (vfio_disable_intx(intr_handle)) return -1; break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_disable_req(intr_handle)) + return -1; + break; +#endif #endif /* not used at this moment */ case RTE_INTR_HANDLE_DEV_EVENT: @@ -672,6 +745,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds) case RTE_INTR_HANDLE_VFIO_LEGACY: bytes_read = sizeof(buf.vfio_intr_count); break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + bytes_read = 0; + call = true; + break; +#endif #endif case RTE_INTR_HANDLE_VDEV: case RTE_INTR_HANDLE_EXT: diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c index aa95551a..48b9c736 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c +++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c @@ -34,6 +34,7 @@ #include <rte_log.h> #include <rte_eal_memconfig.h> #include <rte_eal.h> +#include <rte_errno.h> #include <rte_memory.h> #include <rte_spinlock.h> @@ -52,30 +53,55 @@ const int anonymous_hugepages_supported = #endif /* + * we don't actually care if memfd itself is supported - we only need to check + * if memfd supports hugetlbfs, as that already implies memfd support. + * + * also, this is not a constant, because while we may be *compiled* with memfd + * hugetlbfs support, we might not be *running* on a system that supports memfd + * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at + * runtime, and fall back to anonymous memory. + */ +static int memfd_create_supported = +#ifdef MFD_HUGETLB +#define MEMFD_SUPPORTED + 1; +#else + 0; +#endif + +/* * not all kernel version support fallocate on hugetlbfs, so fall back to * ftruncate and disallow deallocation if fallocate is not supported. */ static int fallocate_supported = -1; /* unknown */ -/* for single-file segments, we need some kind of mechanism to keep track of +/* + * we have two modes - single file segments, and file-per-page mode. + * + * for single-file segments, we need some kind of mechanism to keep track of * which hugepages can be freed back to the system, and which cannot. we cannot * use flock() because they don't allow locking parts of a file, and we cannot * use fcntl() due to issues with their semantics, so we will have to rely on a - * bunch of lockfiles for each page. + * bunch of lockfiles for each page. so, we will use 'fds' array to keep track + * of per-page lockfiles. we will store the actual segment list fd in the + * 'memseg_list_fd' field. + * + * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd' + * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's. * * we cannot know how many pages a system will have in advance, but we do know * that they come in lists, and we know lengths of these lists. so, simply store * a malloc'd array of fd's indexed by list and segment index. * * they will be initialized at startup, and filled as we allocate/deallocate - * segments. also, use this to track memseg list proper fd. + * segments. */ static struct { int *fds; /**< dynamically allocated array of segment lock fd's */ int memseg_list_fd; /**< memseg list fd */ int len; /**< total length of the array */ int count; /**< entries used in an array */ -} lock_fds[RTE_MAX_MEMSEG_LISTS]; +} fd_list[RTE_MAX_MEMSEG_LISTS]; /** local copy of a memory map, used to synchronize memory hotplug in MP */ static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; @@ -182,6 +208,31 @@ get_file_size(int fd) return st.st_size; } +static inline uint32_t +bsf64(uint64_t v) +{ + return (uint32_t)__builtin_ctzll(v); +} + +static inline uint32_t +log2_u64(uint64_t v) +{ + if (v == 0) + return 0; + v = rte_align64pow2(v); + return bsf64(v); +} + +static int +pagesz_flags(uint64_t page_sz) +{ + /* as per mmap() manpage, all page sizes are log2 of page size + * shifted by MAP_HUGE_SHIFT + */ + int log2 = log2_u64(page_sz); + return log2 << RTE_MAP_HUGE_SHIFT; +} + /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ static int lock(int fd, int type) { @@ -209,12 +260,12 @@ static int get_segment_lock_fd(int list_idx, int seg_idx) char path[PATH_MAX] = {0}; int fd; - if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) return -1; - if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) return -1; - fd = lock_fds[list_idx].fds[seg_idx]; + fd = fd_list[list_idx].fds[seg_idx]; /* does this lock already exist? */ if (fd >= 0) return fd; @@ -236,8 +287,8 @@ static int get_segment_lock_fd(int list_idx, int seg_idx) return -1; } /* store it for future reference */ - lock_fds[list_idx].fds[seg_idx] = fd; - lock_fds[list_idx].count++; + fd_list[list_idx].fds[seg_idx] = fd; + fd_list[list_idx].count++; return fd; } @@ -245,12 +296,12 @@ static int unlock_segment(int list_idx, int seg_idx) { int fd, ret; - if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) return -1; - if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) return -1; - fd = lock_fds[list_idx].fds[seg_idx]; + fd = fd_list[list_idx].fds[seg_idx]; /* upgrade lock to exclusive to see if we can remove the lockfile */ ret = lock(fd, LOCK_EX); @@ -270,8 +321,8 @@ static int unlock_segment(int list_idx, int seg_idx) * and remove it from list anyway. */ close(fd); - lock_fds[list_idx].fds[seg_idx] = -1; - lock_fds[list_idx].count--; + fd_list[list_idx].fds[seg_idx] = -1; + fd_list[list_idx].count--; if (ret < 0) return -1; @@ -279,16 +330,68 @@ static int unlock_segment(int list_idx, int seg_idx) } static int +get_seg_memfd(struct hugepage_info *hi __rte_unused, + unsigned int list_idx __rte_unused, + unsigned int seg_idx __rte_unused) +{ +#ifdef MEMFD_SUPPORTED + int fd; + char segname[250]; /* as per manpage, limit is 249 bytes plus null */ + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + snprintf(segname, sizeof(segname), "seg_%i", list_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + snprintf(segname, sizeof(segname), "seg_%i-%i", + list_idx, seg_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +#endif + return -1; +} + +static int get_seg_fd(char *path, int buflen, struct hugepage_info *hi, unsigned int list_idx, unsigned int seg_idx) { int fd; + /* for in-memory mode, we only make it here when we're sure we support + * memfd, and this is a special case. + */ + if (internal_config.in_memory) + return get_seg_memfd(hi, list_idx, seg_idx); + if (internal_config.single_file_segments) { /* create a hugepage file path */ eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); - fd = lock_fds[list_idx].memseg_list_fd; + fd = fd_list[list_idx].memseg_list_fd; if (fd < 0) { fd = open(path, O_CREAT | O_RDWR, 0600); @@ -304,24 +407,30 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi, close(fd); return -1; } - lock_fds[list_idx].memseg_list_fd = fd; + fd_list[list_idx].memseg_list_fd = fd; } } else { /* create a hugepage file path */ eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); - fd = open(path, O_CREAT | O_RDWR, 0600); + + fd = fd_list[list_idx].fds[seg_idx]; + if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, - strerror(errno)); - return -1; - } - /* take out a read lock */ - if (lock(fd, LOCK_SH) < 0) { - RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", - __func__, strerror(errno)); - close(fd); - return -1; + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; } } return fd; @@ -332,6 +441,33 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, uint64_t fa_offset, uint64_t page_sz, bool grow) { bool again = false; + + /* in-memory mode is a special case, because we don't need to perform + * any locking, and we can be sure that fallocate() is supported. + */ + if (internal_config.in_memory) { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + /* increase/decrease total segment count */ + fd_list[list_idx].count += (grow ? 1 : -1); + if (!grow && fd_list[list_idx].count == 0) { + close(fd_list[list_idx].memseg_list_fd); + fd_list[list_idx].memseg_list_fd = -1; + } + return 0; + } + do { if (fallocate_supported == 0) { /* we cannot deallocate memory if fallocate() is not @@ -410,9 +546,9 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, * page file fd, so that one of the processes * could then delete the file after shrinking. */ - if (ret < 1 && lock_fds[list_idx].count == 0) { + if (ret < 1 && fd_list[list_idx].count == 0) { close(fd); - lock_fds[list_idx].memseg_list_fd = -1; + fd_list[list_idx].memseg_list_fd = -1; } if (ret < 0) { @@ -448,13 +584,13 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, * more segments active in this segment list, * and remove the file if there aren't. */ - if (lock_fds[list_idx].count == 0) { + if (fd_list[list_idx].count == 0) { if (unlink(path)) RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", __func__, path, strerror(errno)); close(fd); - lock_fds[list_idx].memseg_list_fd = -1; + fd_list[list_idx].memseg_list_fd = -1; } } } @@ -481,26 +617,34 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, void *new_addr; alloc_sz = hi->hugepage_sz; - if (!internal_config.single_file_segments && - internal_config.in_memory && - anonymous_hugepages_supported) { - int log2, flags; - - log2 = rte_log2_u32(alloc_sz); - /* as per mmap() manpage, all page sizes are log2 of page size - * shifted by MAP_HUGE_SHIFT - */ - flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED | + + /* these are checked at init, but code analyzers don't know that */ + if (internal_config.in_memory && !anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n"); + return -1; + } + if (internal_config.in_memory && !memfd_create_supported && + internal_config.single_file_segments) { + RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n"); + return -1; + } + + /* in-memory without memfd is a special case */ + int mmap_flags; + + if (internal_config.in_memory && !memfd_create_supported) { + int pagesz_flag, flags; + + pagesz_flag = pagesz_flags(alloc_sz); + flags = pagesz_flag | MAP_HUGETLB | MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS; fd = -1; - va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0); - - /* single-file segments codepath will never be active because - * in-memory mode is incompatible with it and it's stopped at - * EAL initialization stage, however the compiler doesn't know - * that and complains about map_offset being used uninitialized - * on failure codepaths while having in-memory mode enabled. so, - * assign a value here. + mmap_flags = flags; + + /* single-file segments codepath will never be active + * here because in-memory mode is incompatible with the + * fallback path, and it's stopped at EAL initialization + * stage. */ map_offset = 0; } else { @@ -524,7 +668,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, __func__, strerror(errno)); goto resized; } - if (internal_config.hugepage_unlink) { + if (internal_config.hugepage_unlink && + !internal_config.in_memory) { if (unlink(path)) { RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", __func__, strerror(errno)); @@ -532,16 +677,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, } } } - - /* - * map the segment, and populate page tables, the kernel fills - * this segment with zeros if it's a new page. - */ - va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, - map_offset); + mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED; } + /* + * map the segment, and populate page tables, the kernel fills + * this segment with zeros if it's a new page. + */ + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd, + map_offset); + if (va == MAP_FAILED) { RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, strerror(errno)); @@ -593,10 +738,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, goto mapped; } #endif - /* for non-single file segments that aren't in-memory, we can close fd - * here */ - if (!internal_config.single_file_segments && !internal_config.in_memory) - close(fd); ms->addr = addr; ms->hugepage_sz = alloc_sz; @@ -626,7 +767,10 @@ unmapped: RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); } resized: - /* in-memory mode will never be single-file-segments mode */ + /* some codepaths will return negative fd, so exit early */ + if (fd < 0) + return -1; + if (internal_config.single_file_segments) { resize_hugefile(fd, path, list_idx, seg_idx, map_offset, alloc_sz, false); @@ -638,6 +782,7 @@ resized: lock(fd, LOCK_EX) == 1) unlink(path); close(fd); + fd_list[list_idx].fds[seg_idx] = -1; } return -1; } @@ -648,7 +793,8 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, { uint64_t map_offset; char path[PATH_MAX]; - int fd, ret; + int fd, ret = 0; + bool exit_early; /* erase page data */ memset(ms->addr, 0, ms->len); @@ -660,8 +806,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, return -1; } + exit_early = false; + + /* if we're using anonymous hugepages, nothing to be done */ + if (internal_config.in_memory && !memfd_create_supported) + exit_early = true; + /* if we've already unlinked the page, nothing needs to be done */ - if (internal_config.hugepage_unlink) { + if (!internal_config.in_memory && internal_config.hugepage_unlink) + exit_early = true; + + if (exit_early) { memset(ms, 0, sizeof(*ms)); return 0; } @@ -684,14 +839,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, /* if we're able to take out a write lock, we're the last one * holding onto this page. */ - ret = lock(fd, LOCK_EX); - if (ret >= 0) { - /* no one else is using this page */ - if (ret == 1) - unlink(path); + if (!internal_config.in_memory) { + ret = lock(fd, LOCK_EX); + if (ret >= 0) { + /* no one else is using this page */ + if (ret == 1) + unlink(path); + } } /* closing fd will drop the lock */ close(fd); + fd_list[list_idx].fds[seg_idx] = -1; } memset(ms, 0, sizeof(*ms)); @@ -828,7 +986,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg) int msl_idx, seg_idx, ret, dir_fd = -1; start_addr = (uintptr_t) msl->base_va; - end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz; + end_addr = start_addr + msl->len; if ((uintptr_t)wa->ms->addr < start_addr || (uintptr_t)wa->ms->addr >= end_addr) @@ -1250,6 +1408,9 @@ sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) unsigned int i; int msl_idx; + if (msl->external) + return 0; + msl_idx = msl - mcfg->memsegs; primary_msl = &mcfg->memsegs[msl_idx]; local_msl = &local_memsegs[msl_idx]; @@ -1298,6 +1459,9 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl, char name[PATH_MAX]; int msl_idx, ret; + if (msl->external) + return 0; + msl_idx = msl - mcfg->memsegs; primary_msl = &mcfg->memsegs[msl_idx]; local_msl = &local_memsegs[msl_idx]; @@ -1314,50 +1478,176 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl, return -1; } local_msl->base_va = primary_msl->base_va; + local_msl->len = primary_msl->len; return 0; } static int -secondary_lock_list_create_walk(const struct rte_memseg_list *msl, - void *arg __rte_unused) +alloc_list(int list_idx, int len) { - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - unsigned int i, len; - int msl_idx; int *data; + int i; - msl_idx = msl - mcfg->memsegs; - len = msl->memseg_arr.len; - - /* ensure we have space to store lock fd per each possible segment */ + /* ensure we have space to store fd per each possible segment */ data = malloc(sizeof(int) * len); if (data == NULL) { - RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n"); + RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n"); return -1; } /* set all fd's as invalid */ for (i = 0; i < len; i++) data[i] = -1; - lock_fds[msl_idx].fds = data; - lock_fds[msl_idx].len = len; - lock_fds[msl_idx].count = 0; - lock_fds[msl_idx].memseg_list_fd = -1; + fd_list[list_idx].fds = data; + fd_list[list_idx].len = len; + fd_list[list_idx].count = 0; + fd_list[list_idx].memseg_list_fd = -1; + + return 0; +} + +static int +fd_list_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int len; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + len = msl->memseg_arr.len; + + return alloc_list(msl_idx, len); +} + +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* if list is not allocated, allocate it */ + if (fd_list[list_idx].len == 0) { + int len = mcfg->memsegs[list_idx].memseg_arr.len; + + if (alloc_list(list_idx, len) < 0) + return -ENOMEM; + } + fd_list[list_idx].fds[seg_idx] = fd; return 0; } int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx) +{ + int fd; + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + } else if (fd_list[list_idx].len == 0) { + /* list not initialized */ + fd = -1; + } else { + fd = fd_list[list_idx].fds[seg_idx]; + } + if (fd < 0) + return -ENODEV; + return fd; +} + +static int +test_memfd_create(void) +{ +#ifdef MEMFD_SUPPORTED + unsigned int i; + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz; + int pagesz_flag = pagesz_flags(pagesz); + int flags; + + flags = pagesz_flag | MFD_HUGETLB; + int fd = memfd_create("test", flags); + if (fd < 0) { + /* we failed - let memalloc know this isn't working */ + if (errno == EINVAL) { + memfd_create_supported = 0; + return 0; /* not supported */ + } + + /* we got other error - something's wrong */ + return -1; /* error */ + } + close(fd); + return 1; /* supported */ + } +#endif + return 0; /* not supported */ +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* fd_list not initialized? */ + if (fd_list[list_idx].len == 0) + return -ENODEV; + if (internal_config.single_file_segments) { + size_t pgsz = mcfg->memsegs[list_idx].page_sz; + + /* segment not active? */ + if (fd_list[list_idx].memseg_list_fd < 0) + return -ENOENT; + *offset = pgsz * seg_idx; + } else { + /* segment not active? */ + if (fd_list[list_idx].fds[seg_idx] < 0) + return -ENOENT; + *offset = 0; + } + return 0; +} + +int eal_memalloc_init(void) { if (rte_eal_process_type() == RTE_PROC_SECONDARY) if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) return -1; + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + internal_config.in_memory) { + int mfd_res = test_memfd_create(); - /* initialize all of the lock fd lists */ - if (internal_config.single_file_segments) - if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL)) + if (mfd_res < 0) { + RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n"); + return -1; + } + if (mfd_res == 1) + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + else + RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n"); + + /* we only support single-file segments mode with in-memory mode + * if we support hugetlbfs with memfd_create. this code will + * test if we do. + */ + if (internal_config.single_file_segments && + mfd_res != 1) { + RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n"); return -1; + } + /* this cannot ever happen but better safe than sorry */ + if (!anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n"); + return -1; + } + } + + /* initialize all of the fd lists */ + if (rte_memseg_list_walk(fd_list_create_walk, NULL)) + return -1; return 0; } diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index dbf19499..fce86fda 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -5,6 +5,7 @@ #define _FILE_OFFSET_BITS 64 #include <errno.h> +#include <fcntl.h> #include <stdarg.h> #include <stdbool.h> #include <stdlib.h> @@ -17,6 +18,7 @@ #include <sys/stat.h> #include <sys/queue.h> #include <sys/file.h> +#include <sys/resource.h> #include <unistd.h> #include <limits.h> #include <sys/ioctl.h> @@ -263,7 +265,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, int node_id = -1; int essential_prev = 0; int oldpolicy; - struct bitmask *oldmask = numa_allocate_nodemask(); + struct bitmask *oldmask = NULL; bool have_numa = true; unsigned long maxnode = 0; @@ -275,6 +277,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, if (have_numa) { RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + oldmask = numa_allocate_nodemask(); if (get_mempolicy(&oldpolicy, oldmask->maskp, oldmask->size + 1, 0, 0) < 0) { RTE_LOG(ERR, EAL, @@ -402,7 +405,8 @@ out: numa_set_localalloc(); } } - numa_free_cpumask(oldmask); + if (oldmask != NULL) + numa_free_cpumask(oldmask); #endif return i; } @@ -584,7 +588,7 @@ unlink_hugepage_files(struct hugepage_file *hugepg_tbl, for (page = 0; page < nrpages; page++) { struct hugepage_file *hp = &hugepg_tbl[page]; - if (hp->final_va != NULL && unlink(hp->filepath)) { + if (hp->orig_va != NULL && unlink(hp->filepath)) { RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", __func__, hp->filepath, strerror(errno)); } @@ -771,7 +775,10 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) rte_fbarray_set_used(arr, ms_idx); - close(fd); + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); } RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", (seg_len * page_sz) >> 20, socket_id); @@ -857,6 +864,7 @@ alloc_va_space(struct rte_memseg_list *msl) return -1; } msl->base_va = addr; + msl->len = mem_sz; return 0; } @@ -1365,6 +1373,7 @@ eal_legacy_hugepage_init(void) msl->base_va = addr; msl->page_sz = page_sz; msl->socket_id = 0; + msl->len = internal_config.memory; /* populate memsegs. each memseg is one page long */ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { @@ -1611,7 +1620,7 @@ eal_legacy_hugepage_init(void) if (msl->memseg_arr.count > 0) continue; /* this is an unused list, deallocate it */ - mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len; + mem_sz = msl->len; munmap(msl->base_va, mem_sz); msl->base_va = NULL; @@ -1770,6 +1779,7 @@ getFileSize(int fd) static int eal_legacy_hugepage_attach(void) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct hugepage_file *hp = NULL; unsigned int num_hp = 0; unsigned int i = 0; @@ -1813,6 +1823,9 @@ eal_legacy_hugepage_attach(void) struct hugepage_file *hf = &hp[i]; size_t map_sz = hf->size; void *map_addr = hf->final_va; + int msl_idx, ms_idx; + struct rte_memseg_list *msl; + struct rte_memseg *ms; /* if size is zero, no more pages left */ if (map_sz == 0) @@ -1830,25 +1843,50 @@ eal_legacy_hugepage_attach(void) if (map_addr == MAP_FAILED) { RTE_LOG(ERR, EAL, "Could not map %s: %s\n", hf->filepath, strerror(errno)); - close(fd); - goto error; + goto fd_error; } /* set shared lock on the file. */ if (flock(fd, LOCK_SH) < 0) { RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", __func__, strerror(errno)); - close(fd); - goto error; + goto fd_error; } - close(fd); + /* find segment data */ + msl = rte_mem_virt2memseg_list(map_addr); + if (msl == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", + __func__); + goto fd_error; + } + ms = rte_mem_virt2memseg(map_addr, msl); + if (ms == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", + __func__); + goto fd_error; + } + + msl_idx = msl - mcfg->memsegs; + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", + __func__); + goto fd_error; + } + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); } /* unmap the hugepage config file, since we are done using it */ munmap(hp, size); close(fd_hugepage); return 0; +fd_error: + close(fd); error: /* map all segments into memory to make sure we get the addrs */ cur_seg = 0; @@ -2093,18 +2131,65 @@ static int __rte_unused memseg_primary_init(void) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int i, socket_id, hpi_idx, msl_idx = 0; + struct memtype { + uint64_t page_sz; + int socket_id; + } *memtypes = NULL; + int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */ struct rte_memseg_list *msl; - uint64_t max_mem, total_mem; + uint64_t max_mem, max_mem_per_type; + unsigned int max_seglists_per_type; + unsigned int n_memtypes, cur_type; /* no-huge does not need this at all */ if (internal_config.no_hugetlbfs) return 0; - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - total_mem = 0; + /* + * figuring out amount of memory we're going to have is a long and very + * involved process. the basic element we're operating with is a memory + * type, defined as a combination of NUMA node ID and page size (so that + * e.g. 2 sockets with 2 page sizes yield 4 memory types in total). + * + * deciding amount of memory going towards each memory type is a + * balancing act between maximum segments per type, maximum memory per + * type, and number of detected NUMA nodes. the goal is to make sure + * each memory type gets at least one memseg list. + * + * the total amount of memory is limited by RTE_MAX_MEM_MB value. + * + * the total amount of memory per type is limited by either + * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number + * of detected NUMA nodes. additionally, maximum number of segments per + * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for + * smaller page sizes, it can take hundreds of thousands of segments to + * reach the above specified per-type memory limits. + * + * additionally, each type may have multiple memseg lists associated + * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger + * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones. + * + * the number of memseg lists per type is decided based on the above + * limits, and also taking number of detected NUMA nodes, to make sure + * that we don't run out of memseg lists before we populate all NUMA + * nodes with memory. + * + * we do this in three stages. first, we collect the number of types. + * then, we figure out memory constraints and populate the list of + * would-be memseg lists. then, we go ahead and allocate the memseg + * lists. + */ - /* create memseg lists */ + /* create space for mem types */ + n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count(); + memtypes = calloc(n_memtypes, sizeof(*memtypes)); + if (memtypes == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n"); + return -1; + } + + /* populate mem types */ + cur_type = 0; for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; hpi_idx++) { struct hugepage_info *hpi; @@ -2113,62 +2198,114 @@ memseg_primary_init(void) hpi = &internal_config.hugepage_info[hpi_idx]; hugepage_sz = hpi->hugepage_sz; - for (i = 0; i < (int) rte_socket_count(); i++) { - uint64_t max_type_mem, total_type_mem = 0; - int type_msl_idx, max_segs, total_segs = 0; - - socket_id = rte_socket_id_by_idx(i); + for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) { + int socket_id = rte_socket_id_by_idx(i); #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES if (socket_id > 0) break; #endif + memtypes[cur_type].page_sz = hugepage_sz; + memtypes[cur_type].socket_id = socket_id; - if (total_mem >= max_mem) - break; - - max_type_mem = RTE_MIN(max_mem - total_mem, - (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); - max_segs = RTE_MAX_MEMSEG_PER_TYPE; + RTE_LOG(DEBUG, EAL, "Detected memory type: " + "socket_id:%u hugepage_sz:%" PRIu64 "\n", + socket_id, hugepage_sz); + } + } - type_msl_idx = 0; - while (total_type_mem < max_type_mem && - total_segs < max_segs) { - uint64_t cur_max_mem, cur_mem; - unsigned int n_segs; + /* set up limits for types */ + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20, + max_mem / n_memtypes); + /* + * limit maximum number of segment lists per type to ensure there's + * space for memseg lists for all NUMA nodes with all page sizes + */ + max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes; - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } + if (max_seglists_per_type == 0) { + RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } - msl = &mcfg->memsegs[msl_idx++]; + /* go through all mem types and create segment lists */ + msl_idx = 0; + for (cur_type = 0; cur_type < n_memtypes; cur_type++) { + unsigned int cur_seglist, n_seglists, n_segs; + unsigned int max_segs_per_type, max_segs_per_list; + struct memtype *type = &memtypes[cur_type]; + uint64_t max_mem_per_list, pagesz; + int socket_id; - cur_max_mem = max_type_mem - total_type_mem; + pagesz = type->page_sz; + socket_id = type->socket_id; - cur_mem = get_mem_amount(hugepage_sz, - cur_max_mem); - n_segs = cur_mem / hugepage_sz; + /* + * we need to create segment lists for this type. we must take + * into account the following things: + * + * 1. total amount of memory we can use for this memory type + * 2. total amount of memory per memseg list allowed + * 3. number of segments needed to fit the amount of memory + * 4. number of segments allowed per type + * 5. number of segments allowed per memseg list + * 6. number of memseg lists we are allowed to take up + */ - if (alloc_memseg_list(msl, hugepage_sz, n_segs, - socket_id, type_msl_idx)) - return -1; + /* calculate how much segments we will need in total */ + max_segs_per_type = max_mem_per_type / pagesz; + /* limit number of segments to maximum allowed per type */ + max_segs_per_type = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_TYPE); + /* limit number of segments to maximum allowed per list */ + max_segs_per_list = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_LIST); + + /* calculate how much memory we can have per segment list */ + max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz, + (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20); + + /* calculate how many segments each segment list will have */ + n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz); + + /* calculate how many segment lists we can have */ + n_seglists = RTE_MIN(max_segs_per_type / n_segs, + max_mem_per_type / max_mem_per_list); + + /* limit number of segment lists according to our maximum */ + n_seglists = RTE_MIN(n_seglists, max_seglists_per_type); + + RTE_LOG(DEBUG, EAL, "Creating %i segment lists: " + "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n", + n_seglists, n_segs, socket_id, pagesz); + + /* create all segment lists */ + for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) { + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + msl = &mcfg->memsegs[msl_idx++]; - total_segs += msl->memseg_arr.len; - total_type_mem = total_segs * hugepage_sz; - type_msl_idx++; + if (alloc_memseg_list(msl, pagesz, n_segs, + socket_id, cur_seglist)) + goto out; - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); - return -1; - } + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + goto out; } - total_mem += total_type_mem; } } - return 0; + /* we're successful */ + ret = 0; +out: + free(memtypes); + return ret; } static int @@ -2204,6 +2341,25 @@ memseg_secondary_init(void) int rte_eal_memseg_init(void) { + /* increase rlimit to maximum */ + struct rlimit lim; + + if (getrlimit(RLIMIT_NOFILE, &lim) == 0) { + /* set limit to maximum */ + lim.rlim_cur = lim.rlim_max; + + if (setrlimit(RLIMIT_NOFILE, &lim) < 0) { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n", + strerror(errno)); + } else { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %" + PRIu64 "\n", + (uint64_t)lim.rlim_cur); + } + } else { + RTE_LOG(ERR, EAL, "Cannot get current resource limits\n"); + } + return rte_eal_process_type() == RTE_PROC_PRIMARY ? #ifndef RTE_ARCH_64 memseg_primary_init_32() : diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c index b496fc71..379773b6 100644 --- a/lib/librte_eal/linuxapp/eal/eal_thread.c +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -121,8 +121,8 @@ eal_thread_loop(__attribute__((unused)) void *arg) ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n", - lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "..."); + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); /* read on our pipe to get commands */ while (1) { diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c index 2766bd78..bc8f0519 100644 --- a/lib/librte_eal/linuxapp/eal/eal_timer.c +++ b/lib/librte_eal/linuxapp/eal/eal_timer.c @@ -87,7 +87,7 @@ static pthread_t msb_inc_thread_id; * containing used to process MSB of the HPET (unfortunately, we need * this because hpet is 32 bits by default under linux). */ -static void +static void * hpet_msb_inc(__attribute__((unused)) void *arg) { uint32_t t; @@ -98,6 +98,7 @@ hpet_msb_inc(__attribute__((unused)) void *arg) eal_hpet_msb ++; sleep(10); } + return NULL; } uint64_t @@ -178,7 +179,7 @@ rte_eal_hpet_init(int make_default) /* create a thread that will increment a global variable for * msb (hpet is 32 bits by default under linux) */ ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, - (void *(*)(void *))hpet_msb_inc, NULL); + hpet_msb_inc, NULL); if (ret != 0) { RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); internal_config.no_hpet = 1; diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c index c68dc38e..0516b159 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c @@ -345,46 +345,13 @@ get_vfio_cfg_by_group_num(int iommu_group_num) return NULL; } -static struct vfio_config * -get_vfio_cfg_by_group_fd(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) - if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) - return vfio_cfg; - } - - return NULL; -} - -static struct vfio_config * -get_vfio_cfg_by_container_fd(int container_fd) -{ - int i; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - if (vfio_cfgs[i].vfio_container_fd == container_fd) - return &vfio_cfgs[i]; - } - - return NULL; -} - -int -rte_vfio_get_group_fd(int iommu_group_num) +static int +vfio_get_group_fd(struct vfio_config *vfio_cfg, + int iommu_group_num) { int i; int vfio_group_fd; struct vfio_group *cur_grp; - struct vfio_config *vfio_cfg; - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; /* check if we already have the group descriptor open */ for (i = 0; i < VFIO_MAX_GROUPS; i++) @@ -423,6 +390,47 @@ rte_vfio_get_group_fd(int iommu_group_num) return vfio_group_fd; } +static struct vfio_config * +get_vfio_cfg_by_group_fd(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return vfio_cfg; + } + + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_container_fd(int container_fd) +{ + int i; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == container_fd) + return &vfio_cfgs[i]; + } + + return NULL; +} + +int +rte_vfio_get_group_fd(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + static int get_vfio_group_idx(int vfio_group_fd) { @@ -509,7 +517,7 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, msl = rte_mem_virt2memseg_list(addr); /* for IOVA as VA mode, no need to care for IOVA addresses */ - if (rte_eal_iova_mode() == RTE_IOVA_VA) { + if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { uint64_t vfio_va = (uint64_t)(uintptr_t)addr; if (type == RTE_MEM_EVENT_ALLOC) vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, @@ -523,13 +531,19 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, /* memsegs are contiguous in memory */ ms = rte_mem_virt2memseg(addr, msl); while (cur_len < len) { + /* some memory segments may have invalid IOVA */ + if (ms->iova == RTE_BAD_IOVA) { + RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n", + ms->addr); + goto next; + } if (type == RTE_MEM_EVENT_ALLOC) vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, ms->iova, ms->len, 1); else vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, ms->iova, ms->len, 0); - +next: cur_len += ms->len; ++ms; } @@ -896,7 +910,15 @@ rte_vfio_enable(const char *modname) return 0; } - default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd(); + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* open a new container */ + default_vfio_cfg->vfio_container_fd = + rte_vfio_get_container_fd(); + } else { + /* get the default container from the primary process */ + default_vfio_cfg->vfio_container_fd = + vfio_get_default_container_fd(); + } /* check if we have VFIO driver enabled */ if (default_vfio_cfg->vfio_container_fd != -1) { @@ -916,6 +938,45 @@ rte_vfio_is_enabled(const char *modname) return default_vfio_cfg->vfio_enabled && mod_available; } +int +vfio_get_default_container_fd(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + if (default_vfio_cfg->vfio_enabled) + return default_vfio_cfg->vfio_container_fd; + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* if we were secondary process we would try requesting + * container fd from the primary, but we're the primary + * process so just exit here + */ + return -1; + } + + p->req = SOCKET_REQ_DEFAULT_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + free(mp_reply.msgs); + return mp_rep->fds[0]; + } + free(mp_reply.msgs); + } + + RTE_LOG(ERR, EAL, " cannot request default container fd\n"); + return -1; +} + const struct vfio_iommu_type * vfio_set_iommu_type(int vfio_container_fd) { @@ -1028,8 +1089,9 @@ rte_vfio_get_container_fd(void) mp_rep = &mp_reply.msgs[0]; p = (struct vfio_mp_param *)mp_rep->param; if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_container_fd = mp_rep->fds[0]; free(mp_reply.msgs); - return mp_rep->fds[0]; + return vfio_container_fd; } free(mp_reply.msgs); } @@ -1082,11 +1144,14 @@ rte_vfio_get_group_num(const char *sysfs_base, } static int -type1_map(const struct rte_memseg_list *msl __rte_unused, - const struct rte_memseg *ms, void *arg) +type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) { int *vfio_container_fd = arg; + if (msl->external) + return 0; + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, ms->len, 1); } @@ -1145,8 +1210,22 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, struct vfio_iommu_type1_dma_map dma_map; struct vfio_iommu_type1_dma_unmap dma_unmap; int ret; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; if (do_map != 0) { + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + memset(&dma_map, 0, sizeof(dma_map)); dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); dma_map.vaddr = vaddr; @@ -1163,13 +1242,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, } } else { - struct vfio_iommu_spapr_register_memory reg = { - .argsz = sizeof(reg), - .flags = 0 - }; - reg.vaddr = (uintptr_t) vaddr; - reg.size = len; - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); if (ret) { @@ -1196,12 +1268,15 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, } static int -vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused, +vfio_spapr_map_walk(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { int *vfio_container_fd = arg; - return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + if (msl->external) + return 0; + + return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, ms->len, 1); } @@ -1210,12 +1285,15 @@ struct spapr_walk_param { uint64_t hugepage_sz; }; static int -vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused, +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { struct spapr_walk_param *param = arg; uint64_t max = ms->iova + ms->len; + if (msl->external) + return 0; + if (max > param->window_size) { param->hugepage_sz = ms->hugepage_sz; param->window_size = max; @@ -1670,9 +1748,6 @@ int rte_vfio_container_group_bind(int container_fd, int iommu_group_num) { struct vfio_config *vfio_cfg; - struct vfio_group *cur_grp; - int vfio_group_fd; - int i; vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); if (vfio_cfg == NULL) { @@ -1680,36 +1755,7 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num) return -1; } - /* Check room for new group */ - if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); - return -1; - } - - /* Get an index for the new group */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num == -1) { - cur_grp = &vfio_cfg->vfio_groups[i]; - break; - } - - /* This should not happen */ - if (i == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); - return -1; - } - - vfio_group_fd = vfio_open_group_fd(iommu_group_num); - if (vfio_group_fd < 0) { - RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); - return -1; - } - cur_grp->group_num = iommu_group_num; - cur_grp->fd = vfio_group_fd; - cur_grp->devices = 0; - vfio_cfg->vfio_active_groups++; - - return vfio_group_fd; + return vfio_get_group_fd(vfio_cfg, iommu_group_num); } int diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h index 68d4750a..63ae115c 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -115,6 +115,9 @@ struct vfio_iommu_type { vfio_dma_func_t dma_map_func; }; +/* get the vfio container that devices are bound to by default */ +int vfio_get_default_container_fd(void); + /* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ const struct vfio_iommu_type * vfio_set_iommu_type(int vfio_container_fd); @@ -129,6 +132,7 @@ int vfio_mp_sync_setup(void); #define SOCKET_REQ_CONTAINER 0x100 #define SOCKET_REQ_GROUP 0x200 +#define SOCKET_REQ_DEFAULT_CONTAINER 0x400 #define SOCKET_OK 0x0 #define SOCKET_NO_FD 0x1 #define SOCKET_ERR 0xFF diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c index 680a24aa..a1e8c834 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c @@ -66,6 +66,17 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) reply.fds[0] = fd; } break; + case SOCKET_REQ_DEFAULT_CONTAINER: + r->req = SOCKET_REQ_DEFAULT_CONTAINER; + fd = vfio_get_default_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; default: RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); return -1; diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h index cfa9448b..5afa0871 100644 --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h @@ -8,6 +8,7 @@ #ifdef __KERNEL__ #include <linux/if.h> +#include <asm/barrier.h> #define RTE_STD_C11 #else #include <rte_common.h> @@ -54,8 +55,13 @@ struct rte_kni_request { * Writing should never overwrite the read position */ struct rte_kni_fifo { +#ifdef RTE_USE_C11_MEM_MODEL + unsigned write; /**< Next position to be written*/ + unsigned read; /**< Next position to be read */ +#else volatile unsigned write; /**< Next position to be written*/ volatile unsigned read; /**< Next position to be read */ +#endif unsigned len; /**< Circular buffer length */ unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ void *volatile buffer[]; /**< The buffer contains mbuf pointers */ diff --git a/lib/librte_eal/meson.build b/lib/librte_eal/meson.build index e1fde15d..a18f3a82 100644 --- a/lib/librte_eal/meson.build +++ b/lib/librte_eal/meson.build @@ -21,11 +21,10 @@ else error('unsupported system type "@0@"'.format(host_machine.system())) endif -version = 8 # the version of the EAL API +version = 9 # the version of the EAL API allow_experimental_apis = true deps += 'compat' deps += 'kvargs' -cflags += '-D_GNU_SOURCE' sources = common_sources + env_sources objs = common_objs + env_objs headers = common_headers + env_headers diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map index 344a43d3..04f62424 100644 --- a/lib/librte_eal/rte_eal_version.map +++ b/lib/librte_eal/rte_eal_version.map @@ -19,9 +19,6 @@ DPDK_2.0 { rte_dump_tailq; rte_eal_alarm_cancel; rte_eal_alarm_set; - rte_eal_devargs_add; - rte_eal_devargs_dump; - rte_eal_devargs_type_count; rte_eal_get_configuration; rte_eal_get_lcore_state; rte_eal_get_physmem_size; @@ -32,7 +29,6 @@ DPDK_2.0 { rte_eal_lcore_role; rte_eal_mp_remote_launch; rte_eal_mp_wait_lcore; - rte_eal_parse_devargs_str; rte_eal_process_type; rte_eal_remote_launch; rte_eal_tailq_lookup; @@ -134,8 +130,6 @@ DPDK_16.11 { rte_delay_us_block; rte_delay_us_callback_register; - rte_eal_dev_attach; - rte_eal_dev_detach; } DPDK_16.07; @@ -262,6 +256,16 @@ DPDK_18.08 { } DPDK_18.05; +DPDK_18.11 { + global: + + rte_eal_get_runtime_dir; + rte_eal_hotplug_add; + rte_eal_hotplug_remove; + rte_strscpy; + +} DPDK_18.08; + EXPERIMENTAL { global: @@ -270,12 +274,19 @@ EXPERIMENTAL { rte_class_register; rte_class_unregister; rte_ctrl_thread_create; + rte_delay_us_sleep; + rte_dev_event_callback_process; rte_dev_event_callback_register; rte_dev_event_callback_unregister; rte_dev_event_monitor_start; rte_dev_event_monitor_stop; + rte_dev_hotplug_handle_disable; + rte_dev_hotplug_handle_enable; + rte_dev_is_probed; rte_dev_iterator_init; rte_dev_iterator_next; + rte_dev_probe; + rte_dev_remove; rte_devargs_add; rte_devargs_dump; rte_devargs_insert; @@ -284,9 +295,8 @@ EXPERIMENTAL { rte_devargs_parsef; rte_devargs_remove; rte_devargs_type_count; + rte_eal_check_dma_mask; rte_eal_cleanup; - rte_eal_hotplug_add; - rte_eal_hotplug_remove; rte_fbarray_attach; rte_fbarray_destroy; rte_fbarray_detach; @@ -311,6 +321,14 @@ EXPERIMENTAL { rte_fbarray_set_used; rte_log_register_type_and_pick_level; rte_malloc_dump_heaps; + rte_malloc_heap_create; + rte_malloc_heap_destroy; + rte_malloc_heap_get_socket; + rte_malloc_heap_memory_add; + rte_malloc_heap_memory_attach; + rte_malloc_heap_memory_detach; + rte_malloc_heap_memory_remove; + rte_malloc_heap_socket_is_external; rte_mem_alloc_validator_register; rte_mem_alloc_validator_unregister; rte_mem_event_callback_register; @@ -320,6 +338,10 @@ EXPERIMENTAL { rte_mem_virt2memseg_list; rte_memseg_contig_walk; rte_memseg_contig_walk_thread_unsafe; + rte_memseg_get_fd; + rte_memseg_get_fd_offset; + rte_memseg_get_fd_thread_unsafe; + rte_memseg_get_fd_offset_thread_unsafe; rte_memseg_list_walk; rte_memseg_list_walk_thread_unsafe; rte_memseg_walk; @@ -330,6 +352,7 @@ EXPERIMENTAL { rte_mp_request_sync; rte_mp_request_async; rte_mp_sendmsg; + rte_option_register; rte_service_lcore_attr_get; rte_service_lcore_attr_reset_all; rte_service_may_be_active; |