diff options
Diffstat (limited to 'lib/librte_eal/common')
48 files changed, 2662 insertions, 346 deletions
diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile index cca68826..87d8c455 100644 --- a/lib/librte_eal/common/Makefile +++ b/lib/librte_eal/common/Makefile @@ -12,6 +12,7 @@ INC += rte_tailq.h rte_interrupts.h rte_alarm.h INC += rte_string_fns.h rte_version.h INC += rte_eal_memconfig.h rte_malloc_heap.h INC += rte_hexdump.h rte_devargs.h rte_bus.h rte_dev.h rte_class.h +INC += rte_option.h INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h INC += rte_malloc.h rte_keepalive.h rte_time.h INC += rte_service.h rte_service_component.h diff --git a/lib/librte_eal/common/arch/arm/meson.build b/lib/librte_eal/common/arch/arm/meson.build index c6bd9227..79731e1a 100644 --- a/lib/librte_eal/common/arch/arm/meson.build +++ b/lib/librte_eal/common/arch/arm/meson.build @@ -2,4 +2,4 @@ # Copyright(c) 2017 Intel Corporation. eal_common_arch_sources = files('rte_cpuflags.c', - 'rte_cycles.c') + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/arch/ppc_64/meson.build b/lib/librte_eal/common/arch/ppc_64/meson.build new file mode 100644 index 00000000..40b3dc53 --- /dev/null +++ b/lib/librte_eal/common/arch/ppc_64/meson.build @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi <bluca@debian.org> + +eal_common_arch_sources = files('rte_cpuflags.c', + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/arch/x86/meson.build b/lib/librte_eal/common/arch/x86/meson.build index 4e0f7790..14bf204c 100644 --- a/lib/librte_eal/common/arch/x86/meson.build +++ b/lib/librte_eal/common/arch/x86/meson.build @@ -2,4 +2,4 @@ # Copyright(c) 2017 Intel Corporation eal_common_arch_sources = files('rte_spinlock.c', 'rte_cpuflags.c', - 'rte_cycles.c') + 'rte_cycles.c', 'rte_hypervisor.c') diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c index 0943851c..c8f1901f 100644 --- a/lib/librte_eal/common/eal_common_bus.c +++ 
b/lib/librte_eal/common/eal_common_bus.c @@ -37,10 +37,11 @@ #include <rte_bus.h> #include <rte_debug.h> #include <rte_string_fns.h> +#include <rte_errno.h> #include "eal_private.h" -struct rte_bus_list rte_bus_list = +static struct rte_bus_list rte_bus_list = TAILQ_HEAD_INITIALIZER(rte_bus_list); void @@ -242,3 +243,45 @@ rte_bus_get_iommu_class(void) } return mode; } + +static int +bus_handle_sigbus(const struct rte_bus *bus, + const void *failure_addr) +{ + int ret; + + if (!bus->sigbus_handler) + return -1; + + ret = bus->sigbus_handler(failure_addr); + + /* find bus but handle failed, keep the errno be set. */ + if (ret < 0 && rte_errno == 0) + rte_errno = ENOTSUP; + + return ret > 0; +} + +int +rte_bus_sigbus_handler(const void *failure_addr) +{ + struct rte_bus *bus; + + int ret = 0; + int old_errno = rte_errno; + + rte_errno = 0; + + bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr); + /* can not find bus. */ + if (!bus) + return 1; + /* find bus but handle failed, pass on the new errno. */ + else if (rte_errno != 0) + return -1; + + /* restore the old errno. 
*/ + rte_errno = old_errno; + + return ret; +} diff --git a/lib/librte_eal/common/eal_common_class.c b/lib/librte_eal/common/eal_common_class.c index 404a9065..d922266d 100644 --- a/lib/librte_eal/common/eal_common_class.c +++ b/lib/librte_eal/common/eal_common_class.c @@ -9,7 +9,7 @@ #include <rte_class.h> #include <rte_debug.h> -struct rte_class_list rte_class_list = +static struct rte_class_list rte_class_list = TAILQ_HEAD_INITIALIZER(rte_class_list); __rte_experimental void diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c index 678dbcac..62e9ed47 100644 --- a/lib/librte_eal/common/eal_common_dev.c +++ b/lib/librte_eal/common/eal_common_dev.c @@ -19,8 +19,10 @@ #include <rte_log.h> #include <rte_spinlock.h> #include <rte_malloc.h> +#include <rte_string_fns.h> #include "eal_private.h" +#include "hotplug_mp.h" /** * The device event callback description. @@ -74,119 +76,110 @@ static int cmp_dev_name(const struct rte_device *dev, const void *_name) return strcmp(dev->name, name); } -int rte_eal_dev_attach(const char *name, const char *devargs) +int __rte_experimental +rte_dev_is_probed(const struct rte_device *dev) { - struct rte_bus *bus; + /* The field driver should be set only when the probe is successful. 
*/ + return dev->driver != NULL; +} - if (name == NULL || devargs == NULL) { - RTE_LOG(ERR, EAL, "Invalid device or arguments provided\n"); +/* helper function to build devargs, caller should free the memory */ +static int +build_devargs(const char *busname, const char *devname, + const char *drvargs, char **devargs) +{ + int length; + + length = snprintf(NULL, 0, "%s:%s,%s", busname, devname, drvargs); + if (length < 0) return -EINVAL; - } - bus = rte_bus_find_by_device_name(name); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Unable to find a bus for the device '%s'\n", - name); + *devargs = malloc(length + 1); + if (*devargs == NULL) + return -ENOMEM; + + length = snprintf(*devargs, length + 1, "%s:%s,%s", + busname, devname, drvargs); + if (length < 0) { + free(*devargs); return -EINVAL; } - if (strcmp(bus->name, "pci") == 0 || strcmp(bus->name, "vdev") == 0) - return rte_eal_hotplug_add(bus->name, name, devargs); - - RTE_LOG(ERR, EAL, - "Device attach is only supported for PCI and vdev devices.\n"); - return -ENOTSUP; + return 0; } -int rte_eal_dev_detach(struct rte_device *dev) +int +rte_eal_hotplug_add(const char *busname, const char *devname, + const char *drvargs) { - struct rte_bus *bus; - int ret; - if (dev == NULL) { - RTE_LOG(ERR, EAL, "Invalid device provided.\n"); - return -EINVAL; - } + char *devargs; + int ret; - bus = rte_bus_find_by_device(dev); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n", - dev->name); - return -EINVAL; - } + ret = build_devargs(busname, devname, drvargs, &devargs); + if (ret != 0) + return ret; - if (bus->unplug == NULL) { - RTE_LOG(ERR, EAL, "Bus function not supported\n"); - return -ENOTSUP; - } + ret = rte_dev_probe(devargs); + free(devargs); - ret = bus->unplug(dev); - if (ret) - RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", - dev->name); return ret; } -int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, - const char *devargs) +/* probe device at 
local process. */ +int +local_dev_probe(const char *devargs, struct rte_device **new_dev) { - struct rte_bus *bus; struct rte_device *dev; struct rte_devargs *da; int ret; - bus = rte_bus_find_by_name(busname); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); - return -ENOENT; - } - - if (bus->plug == NULL) { - RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", - bus->name); - return -ENOTSUP; - } - + *new_dev = NULL; da = calloc(1, sizeof(*da)); if (da == NULL) return -ENOMEM; - ret = rte_devargs_parsef(da, "%s:%s,%s", - busname, devname, devargs); + ret = rte_devargs_parse(da, devargs); if (ret) goto err_devarg; + if (da->bus->plug == NULL) { + RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n", + da->bus->name); + ret = -ENOTSUP; + goto err_devarg; + } + ret = rte_devargs_insert(da); if (ret) goto err_devarg; - ret = bus->scan(); + ret = da->bus->scan(); if (ret) goto err_devarg; - dev = bus->find_device(NULL, cmp_dev_name, devname); + dev = da->bus->find_device(NULL, cmp_dev_name, da->name); if (dev == NULL) { RTE_LOG(ERR, EAL, "Cannot find device (%s)\n", - devname); + da->name); ret = -ENODEV; goto err_devarg; } - if (dev->driver != NULL) { - RTE_LOG(ERR, EAL, "Device is already plugged\n"); - return -EEXIST; - } - - ret = bus->plug(dev); + ret = dev->bus->plug(dev); if (ret) { + if (rte_dev_is_probed(dev)) /* if already succeeded earlier */ + return ret; /* no rollback */ RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n", dev->name); goto err_devarg; } + + *new_dev = dev; return 0; err_devarg: - if (rte_devargs_remove(busname, devname)) { + if (rte_devargs_remove(da) != 0) { free(da->args); free(da); } @@ -194,40 +187,235 @@ err_devarg: } int __rte_experimental -rte_eal_hotplug_remove(const char *busname, const char *devname) +rte_dev_probe(const char *devargs) { - struct rte_bus *bus; + struct eal_dev_mp_req req; struct rte_device *dev; int ret; + memset(&req, 0, sizeof(req)); + req.t = 
EAL_DEV_REQ_TYPE_ATTACH; + strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN); + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + /** + * If in secondary process, just send IPC request to + * primary process. + */ + ret = eal_dev_hotplug_request_to_primary(&req); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug request to primary\n"); + return -ENOMSG; + } + if (req.result != 0) + RTE_LOG(ERR, EAL, + "Failed to hotplug add device\n"); + return req.result; + } + + /* attach a shared device from primary start from here: */ + + /* primary attach the new device itself. */ + ret = local_dev_probe(devargs, &dev); + + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to attach device on primary process\n"); + + /** + * it is possible that secondary process failed to attached a + * device that primary process have during initialization, + * so for -EEXIST case, we still need to sync with secondary + * process. + */ + if (ret != -EEXIST) + return ret; + } + + /* primary send attach sync request to secondary. */ + ret = eal_dev_hotplug_request_to_secondary(&req); + + /* if any communication error, we need to rollback. */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug add request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + /** + * if any secondary failed to attach, we need to consider if rollback + * is necessary. + */ + if (req.result != 0) { + RTE_LOG(ERR, EAL, + "Failed to attach device on secondary process\n"); + ret = req.result; + + /* for -EEXIST, we don't need to rollback. */ + if (ret == -EEXIST) + return ret; + goto rollback; + } + + return 0; + +rollback: + req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + + /* primary send rollback request to secondary. */ + if (eal_dev_hotplug_request_to_secondary(&req) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device attach on secondary." + "Devices in secondary may not sync with primary\n"); + + /* primary rollback itself. 
*/ + if (local_dev_remove(dev) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device attach on primary." + "Devices in secondary may not sync with primary\n"); + + return ret; +} + +int +rte_eal_hotplug_remove(const char *busname, const char *devname) +{ + struct rte_device *dev; + struct rte_bus *bus; + bus = rte_bus_find_by_name(busname); if (bus == NULL) { RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname); return -ENOENT; } - if (bus->unplug == NULL) { - RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", - bus->name); - return -ENOTSUP; - } - dev = bus->find_device(NULL, cmp_dev_name, devname); if (dev == NULL) { RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", devname); return -EINVAL; } - if (dev->driver == NULL) { - RTE_LOG(ERR, EAL, "Device is already unplugged\n"); - return -ENOENT; + return rte_dev_remove(dev); +} + +/* remove device at local process. */ +int +local_dev_remove(struct rte_device *dev) +{ + int ret; + + if (dev->bus->unplug == NULL) { + RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n", + dev->bus->name); + return -ENOTSUP; } - ret = bus->unplug(dev); - if (ret) + ret = dev->bus->unplug(dev); + if (ret) { RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", dev->name); - rte_devargs_remove(busname, devname); + return ret; + } + + return 0; +} + +int __rte_experimental +rte_dev_remove(struct rte_device *dev) +{ + struct eal_dev_mp_req req; + char *devargs; + int ret; + + if (!rte_dev_is_probed(dev)) { + RTE_LOG(ERR, EAL, "Device is not probed\n"); + return -ENOENT; + } + + ret = build_devargs(dev->bus->name, dev->name, "", &devargs); + if (ret != 0) + return ret; + + memset(&req, 0, sizeof(req)); + req.t = EAL_DEV_REQ_TYPE_DETACH; + strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN); + free(devargs); + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + /** + * If in secondary process, just send IPC request to + * primary process. 
+ */ + ret = eal_dev_hotplug_request_to_primary(&req); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send hotplug request to primary\n"); + return -ENOMSG; + } + if (req.result != 0) + RTE_LOG(ERR, EAL, + "Failed to hotplug remove device\n"); + return req.result; + } + + /* detach a device from primary start from here: */ + + /* primary send detach sync request to secondary */ + ret = eal_dev_hotplug_request_to_secondary(&req); + + /** + * if communication error, we need to rollback, because it is possible + * part of the secondary processes still detached it successfully. + */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to send device detach request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + /** + * if any secondary failed to detach, we need to consider if rollback + * is necessary. + */ + if (req.result != 0) { + RTE_LOG(ERR, EAL, + "Failed to detach device on secondary process\n"); + ret = req.result; + /** + * if -ENOENT, we don't need to rollback, since devices is + * already detached on secondary process. + */ + if (ret != -ENOENT) + goto rollback; + } + + /* primary detach the device itself. */ + ret = local_dev_remove(dev); + + /* if primary failed, still need to consider if rollback is necessary */ + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to detach device on primary process\n"); + /* if -ENOENT, we don't need to rollback */ + if (ret == -ENOENT) + return ret; + goto rollback; + } + + return 0; + +rollback: + req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + + /* primary send rollback request to secondary. */ + if (eal_dev_hotplug_request_to_secondary(&req) != 0) + RTE_LOG(WARNING, EAL, + "Failed to rollback device detach on secondary." 
+ "Devices in secondary may not sync with primary\n"); + return ret; } @@ -342,8 +530,9 @@ rte_dev_event_callback_unregister(const char *device_name, return ret; } -void -dev_callback_process(char *device_name, enum rte_dev_event_type event) +void __rte_experimental +rte_dev_event_callback_process(const char *device_name, + enum rte_dev_event_type event) { struct dev_event_callback *cb_lst; diff --git a/lib/librte_eal/common/eal_common_devargs.c b/lib/librte_eal/common/eal_common_devargs.c index dac2402a..b7b9cb69 100644 --- a/lib/librte_eal/common/eal_common_devargs.c +++ b/lib/librte_eal/common/eal_common_devargs.c @@ -4,9 +4,6 @@ /* This file manages the list of devices and their arguments, as given * by the user at startup - * - * Code here should not call rte_log since the EAL environment - * may not be initialized. */ #include <stdio.h> @@ -28,39 +25,9 @@ TAILQ_HEAD(rte_devargs_list, rte_devargs); /** Global list of user devices */ -struct rte_devargs_list devargs_list = +static struct rte_devargs_list devargs_list = TAILQ_HEAD_INITIALIZER(devargs_list); -int -rte_eal_parse_devargs_str(const char *devargs_str, - char **drvname, char **drvargs) -{ - char *sep; - - if ((devargs_str) == NULL || (drvname) == NULL || (drvargs == NULL)) - return -1; - - *drvname = strdup(devargs_str); - if (*drvname == NULL) - return -1; - - /* set the first ',' to '\0' to split name and arguments */ - sep = strchr(*drvname, ','); - if (sep != NULL) { - sep[0] = '\0'; - *drvargs = strdup(sep + 1); - } else { - *drvargs = strdup(""); - } - - if (*drvargs == NULL) { - free(*drvname); - *drvname = NULL; - return -1; - } - return 0; -} - static size_t devargs_layer_count(const char *s) { @@ -270,6 +237,7 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) va_list ap; size_t len; char *dev; + int ret; if (da == NULL) return -EINVAL; @@ -288,7 +256,10 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...) 
vsnprintf(dev, len + 1, format, ap); va_end(ap); - return rte_devargs_parse(da, dev); + ret = rte_devargs_parse(da, dev); + + free(dev); + return ret; } int __rte_experimental @@ -296,7 +267,7 @@ rte_devargs_insert(struct rte_devargs *da) { int ret; - ret = rte_devargs_remove(da->bus->name, da->name); + ret = rte_devargs_remove(da); if (ret < 0) return ret; TAILQ_INSERT_TAIL(&devargs_list, da, next); @@ -342,14 +313,17 @@ fail: } int __rte_experimental -rte_devargs_remove(const char *busname, const char *devname) +rte_devargs_remove(struct rte_devargs *devargs) { struct rte_devargs *d; void *tmp; + if (devargs == NULL || devargs->bus == NULL) + return -1; + TAILQ_FOREACH_SAFE(d, &devargs_list, next, tmp) { - if (strcmp(d->bus->name, busname) == 0 && - strcmp(d->name, devname) == 0) { + if (strcmp(d->bus->name, devargs->bus->name) == 0 && + strcmp(d->name, devargs->name) == 0) { TAILQ_REMOVE(&devargs_list, d, next); free(d->args); free(d); diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c index 43caf3ce..ea0735cb 100644 --- a/lib/librte_eal/common/eal_common_fbarray.c +++ b/lib/librte_eal/common/eal_common_fbarray.c @@ -2,6 +2,7 @@ * Copyright(c) 2017-2018 Intel Corporation */ +#include <fcntl.h> #include <inttypes.h> #include <limits.h> #include <sys/mman.h> @@ -878,6 +879,10 @@ rte_fbarray_destroy(struct rte_fbarray *arr) if (ret) return ret; + /* with no shconf, there were never any files to begin with */ + if (internal_config.no_shconf) + return 0; + /* try deleting the file */ eal_get_fbarray_path(path, sizeof(path), arr->name); diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index fbfb1b05..12dcedf5 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -2,6 +2,7 @@ * Copyright(c) 2010-2014 Intel Corporation */ +#include <fcntl.h> #include <errno.h> #include <stdio.h> #include <stdint.h> @@ -37,6 +38,23 @@ 
static void *next_baseaddr; static uint64_t system_page_sz; +#ifdef RTE_ARCH_64 +/* + * Linux kernel uses a really high address as starting address for serving + * mmaps calls. If there exists addressing limitations and IOVA mode is VA, + * this starting address is likely too high for those devices. However, it + * is possible to use a lower address in the process virtual address space + * as with 64 bits there is a lot of available space. + * + * Current known limitations are 39 or 40 bits. Setting the starting address + * at 4GB implies there are 508GB or 1020GB for mapping the available + * hugepages. This is likely enough for most systems, although a device with + * addressing limitations should call rte_eal_check_dma_mask for ensuring all + * memory is within supported range. + */ +static uint64_t baseaddr = 0x100000000; +#endif + void * eal_get_virtual_area(void *requested_addr, size_t *size, size_t page_sz, int flags, int mmap_flags) @@ -60,6 +78,11 @@ eal_get_virtual_area(void *requested_addr, size_t *size, rte_eal_process_type() == RTE_PROC_PRIMARY) next_baseaddr = (void *) internal_config.base_virtaddr; +#ifdef RTE_ARCH_64 + if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) baseaddr; +#endif if (requested_addr == NULL && next_baseaddr != NULL) { requested_addr = next_baseaddr; requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); @@ -91,7 +114,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size, mmap_flags, -1, 0); if (mapped_addr == MAP_FAILED && allow_shrink) *size -= page_sz; - } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0); + + if (mapped_addr != MAP_FAILED && addr_is_hint && + mapped_addr != requested_addr) { + /* hint was not used. 
Try with another offset */ + munmap(mapped_addr, map_sz); + mapped_addr = MAP_FAILED; + next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz); + requested_addr = next_baseaddr; + } + } while ((allow_shrink || addr_is_hint) && + mapped_addr == MAP_FAILED && *size > 0); /* align resulting address - if map failed, we will ignore the value * anyway, so no need to add additional checks. @@ -171,7 +204,7 @@ virt2memseg(const void *addr, const struct rte_memseg_list *msl) /* a memseg list was specified, check if it's the right one */ start = msl->base_va; - end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len); + end = RTE_PTR_ADD(start, msl->len); if (addr < start || addr >= end) return NULL; @@ -194,8 +227,7 @@ virt2memseg_list(const void *addr) msl = &mcfg->memsegs[msl_idx]; start = msl->base_va; - end = RTE_PTR_ADD(start, - (size_t)msl->page_sz * msl->memseg_arr.len); + end = RTE_PTR_ADD(start, msl->len); if (addr >= start && addr < end) break; } @@ -273,6 +305,9 @@ physmem_size(const struct rte_memseg_list *msl, void *arg) { uint64_t *total_len = arg; + if (msl->external) + return 0; + *total_len += msl->memseg_arr.count * msl->page_sz; return 0; @@ -294,7 +329,7 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int msl_idx, ms_idx; + int msl_idx, ms_idx, fd; FILE *f = arg; msl_idx = msl - mcfg->memsegs; @@ -305,10 +340,11 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, if (ms_idx < 0) return -1; + fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx); fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, " "virt:%p, socket_id:%"PRId32", " "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " - "nrank:%"PRIx32"\n", + "nrank:%"PRIx32" fd:%i\n", msl_idx, ms_idx, ms->iova, ms->len, @@ -316,7 +352,8 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, ms->socket_id, ms->hugepage_sz, ms->nchannel, - 
ms->nrank); + ms->nrank, + fd); return 0; } @@ -383,6 +420,66 @@ rte_dump_physmem_layout(FILE *f) rte_memseg_walk(dump_memseg, f); } +static int +check_iova(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + uint64_t *mask = arg; + rte_iova_t iova; + + /* higher address within segment */ + iova = (ms->iova + ms->len) - 1; + if (!(iova & *mask)) + return 0; + + RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n", + ms->iova, ms->len); + + RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask); + return 1; +} + +#if defined(RTE_ARCH_64) +#define MAX_DMA_MASK_BITS 63 +#else +#define MAX_DMA_MASK_BITS 31 +#endif + +/* check memseg iovas are within the required range based on dma mask */ +int __rte_experimental +rte_eal_check_dma_mask(uint8_t maskbits) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint64_t mask; + + /* sanity check */ + if (maskbits > MAX_DMA_MASK_BITS) { + RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n", + maskbits, MAX_DMA_MASK_BITS); + return -1; + } + + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + if (rte_memseg_walk(check_iova, &mask)) + /* + * Dma mask precludes hugepage usage. + * This device can not be used and we do not need to keep + * the dma mask. + */ + return 1; + + /* + * we need to keep the more restricted maskbit for checking + * potential dynamic memory allocation in the future. + */ + mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? 
maskbits : + RTE_MIN(mcfg->dma_maskbits, maskbits); + + return 0; +} + /* return the number of memory channels */ unsigned rte_memory_get_nchannel(void) { @@ -548,6 +645,105 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) return ret; } +int __rte_experimental +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int __rte_experimental +rte_memseg_get_fd(const struct rte_memseg *ms) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_get_fd_thread_unsafe(ms); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int __rte_experimental +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL || offset == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + ret = 
eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int __rte_experimental +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + /* init memory subsystem */ int rte_eal_memory_init(void) diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c index 7300fe05..b7081afb 100644 --- a/lib/librte_eal/common/eal_common_memzone.c +++ b/lib/librte_eal/common/eal_common_memzone.c @@ -120,13 +120,15 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, return NULL; } - if ((socket_id != SOCKET_ID_ANY) && - (socket_id >= RTE_MAX_NUMA_NODES || socket_id < 0)) { + if ((socket_id != SOCKET_ID_ANY) && socket_id < 0) { rte_errno = EINVAL; return NULL; } - if (!rte_eal_has_hugepages()) + /* only set socket to SOCKET_ID_ANY if we aren't allocating for an + * external heap. 
+ */ + if (!rte_eal_has_hugepages() && socket_id < RTE_MAX_NUMA_NODES) socket_id = SOCKET_ID_ANY; contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0; diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c index dd5f9740..b82f3ddd 100644 --- a/lib/librte_eal/common/eal_common_options.c +++ b/lib/librte_eal/common/eal_common_options.c @@ -58,6 +58,7 @@ eal_long_options[] = { {OPT_HELP, 0, NULL, OPT_HELP_NUM }, {OPT_HUGE_DIR, 1, NULL, OPT_HUGE_DIR_NUM }, {OPT_HUGE_UNLINK, 0, NULL, OPT_HUGE_UNLINK_NUM }, + {OPT_IOVA_MODE, 1, NULL, OPT_IOVA_MODE_NUM }, {OPT_LCORES, 1, NULL, OPT_LCORES_NUM }, {OPT_LOG_LEVEL, 1, NULL, OPT_LOG_LEVEL_NUM }, {OPT_MASTER_LCORE, 1, NULL, OPT_MASTER_LCORE_NUM }, @@ -205,6 +206,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg) #endif internal_cfg->vmware_tsc_map = 0; internal_cfg->create_uio_dev = 0; + internal_cfg->iova_mode = RTE_IOVA_DC; internal_cfg->user_mbuf_pool_ops_name = NULL; internal_cfg->init_complete = 0; } @@ -1075,6 +1077,25 @@ eal_parse_proc_type(const char *arg) return RTE_PROC_INVALID; } +static int +eal_parse_iova_mode(const char *name) +{ + int mode; + + if (name == NULL) + return -1; + + if (!strcmp("pa", name)) + mode = RTE_IOVA_PA; + else if (!strcmp("va", name)) + mode = RTE_IOVA_VA; + else + return -1; + + internal_config.iova_mode = mode; + return 0; +} + int eal_parse_common_option(int opt, const char *optarg, struct internal_config *conf) @@ -1281,6 +1302,13 @@ eal_parse_common_option(int opt, const char *optarg, case OPT_SINGLE_FILE_SEGMENTS_NUM: conf->single_file_segments = 1; break; + case OPT_IOVA_MODE_NUM: + if (eal_parse_iova_mode(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_IOVA_MODE "\n"); + return -1; + } + break; /* don't know what to do, leave this to caller */ default: @@ -1384,10 +1412,16 @@ eal_check_common_options(struct internal_config *internal_cfg) " is only supported in non-legacy memory mode\n"); } if 
(internal_cfg->single_file_segments && - internal_cfg->hugepage_unlink) { + internal_cfg->hugepage_unlink && + !internal_cfg->in_memory) { RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is " - "not compatible with neither --"OPT_IN_MEMORY" nor " - "--"OPT_HUGE_UNLINK"\n"); + "not compatible with --"OPT_HUGE_UNLINK"\n"); + return -1; + } + if (internal_cfg->legacy_mem && + internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible " + "with --"OPT_IN_MEMORY"\n"); return -1; } @@ -1428,6 +1462,8 @@ eal_common_usage(void) " --"OPT_VDEV" Add a virtual device.\n" " The argument format is <driver><id>[,key=val,...]\n" " (ex: --vdev=net_pcap0,iface=eth2).\n" + " --"OPT_IOVA_MODE" Set IOVA mode. 'pa' for IOVA_PA\n" + " 'va' for IOVA_VA\n" " -d LIB.so|DIR Add a driver or driver directory\n" " (can be used multiple times)\n" " --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n" diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c index 9fcb9121..97663d3b 100644 --- a/lib/librte_eal/common/eal_common_proc.c +++ b/lib/librte_eal/common/eal_common_proc.c @@ -939,13 +939,17 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, if (check_input(req) == false) return -1; + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + if (internal_config.no_shconf) { RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); return 0; } if (gettimeofday(&now, NULL) < 0) { - RTE_LOG(ERR, EAL, "Faile to get current time\n"); + RTE_LOG(ERR, EAL, "Failed to get current time\n"); rte_errno = errno; return -1; } @@ -954,10 +958,6 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, end.tv_sec = now.tv_sec + ts->tv_sec + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; - reply->nb_sent = 0; - reply->nb_received = 0; - reply->msgs = NULL; - /* for secondary process, send request to the primary process only */ if 
(rte_eal_process_type() == RTE_PROC_SECONDARY) { pthread_mutex_lock(&pending_requests.lock); diff --git a/lib/librte_eal/common/eal_common_string_fns.c b/lib/librte_eal/common/eal_common_string_fns.c index 6ac5f828..60c5dd66 100644 --- a/lib/librte_eal/common/eal_common_string_fns.c +++ b/lib/librte_eal/common/eal_common_string_fns.c @@ -38,3 +38,29 @@ einval_error: errno = EINVAL; return -1; } + +/* Copy src string into dst. + * + * Return negative value and NUL-terminate if dst is too short, + * Otherwise return number of bytes copied. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize) +{ + size_t nleft = dsize; + size_t res = 0; + + /* Copy as many bytes as will fit. */ + while (nleft != 0) { + dst[res] = src[res]; + if (src[res] == '\0') + return res; + res++; + nleft--; + } + + /* Not enough room in dst, set NUL and return error. */ + if (res != 0) + dst[res - 1] = '\0'; + return -E2BIG; +} diff --git a/lib/librte_eal/common/eal_common_timer.c b/lib/librte_eal/common/eal_common_timer.c index 2e2b770f..dcf26bfe 100644 --- a/lib/librte_eal/common/eal_common_timer.c +++ b/lib/librte_eal/common/eal_common_timer.c @@ -7,9 +7,11 @@ #include <unistd.h> #include <inttypes.h> #include <sys/types.h> +#include <time.h> #include <errno.h> #include <rte_common.h> +#include <rte_compat.h> #include <rte_log.h> #include <rte_cycles.h> #include <rte_pause.h> @@ -31,6 +33,28 @@ rte_delay_us_block(unsigned int us) rte_pause(); } +void __rte_experimental +rte_delay_us_sleep(unsigned int us) +{ + struct timespec wait[2]; + int ind = 0; + + wait[0].tv_sec = 0; + if (us >= US_PER_S) { + wait[0].tv_sec = us / US_PER_S; + us -= wait[0].tv_sec * US_PER_S; + } + wait[0].tv_nsec = 1000 * us; + + while (nanosleep(&wait[ind], &wait[1 - ind]) && errno == EINTR) { + /* + * Sleep was interrupted. Flip the index, so the 'remainder' + * will become the 'request' for a next call. 
+ */ + ind = 1 - ind; + } +} + uint64_t rte_get_tsc_hz(void) { diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h index de05febf..b3e8ae5e 100644 --- a/lib/librte_eal/common/eal_filesystem.h +++ b/lib/librte_eal/common/eal_filesystem.h @@ -27,7 +27,7 @@ eal_create_runtime_dir(void); /* returns runtime dir */ const char * -eal_get_runtime_dir(void); +rte_eal_get_runtime_dir(void); #define RUNTIME_CONFIG_FNAME "config" static inline const char * @@ -35,7 +35,7 @@ eal_runtime_config_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), RUNTIME_CONFIG_FNAME); return buffer; } @@ -47,7 +47,7 @@ eal_mp_socket_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), MP_SOCKET_FNAME); return buffer; } @@ -55,7 +55,8 @@ eal_mp_socket_path(void) #define FBARRAY_NAME_FMT "%s/fbarray_%s" static inline const char * eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) { - snprintf(buffer, buflen, FBARRAY_NAME_FMT, eal_get_runtime_dir(), name); + snprintf(buffer, buflen, FBARRAY_NAME_FMT, rte_eal_get_runtime_dir(), + name); return buffer; } @@ -66,7 +67,7 @@ eal_hugepage_info_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), HUGEPAGE_INFO_FNAME); return buffer; } @@ -78,7 +79,7 @@ eal_hugepage_data_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(), HUGEPAGE_DATA_FNAME); return 
buffer; } @@ -99,7 +100,7 @@ eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id static inline const char * eal_get_hugefile_lock_path(char *buffer, size_t buflen, int f_id) { - snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, eal_get_runtime_dir(), + snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, rte_eal_get_runtime_dir(), f_id); buffer[buflen - 1] = '\0'; return buffer; diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h index 00ee6e06..737f17e3 100644 --- a/lib/librte_eal/common/eal_internal_cfg.h +++ b/lib/librte_eal/common/eal_internal_cfg.h @@ -70,6 +70,7 @@ struct internal_config { /**< user defined mbuf pool ops name */ unsigned num_hugepage_sizes; /**< how many sizes on this system */ struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES]; + enum rte_iova_mode iova_mode ; /**< Set IOVA mode on this system */ volatile unsigned int init_complete; /**< indicates whether EAL has completed initialization */ }; diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h index 36bb1a02..af917c2f 100644 --- a/lib/librte_eal/common/eal_memalloc.h +++ b/lib/librte_eal/common/eal_memalloc.h @@ -76,6 +76,17 @@ eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id); int eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len); +/* returns fd or -errno */ +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx); + +/* returns 0 or -errno */ +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd); + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset); + int eal_memalloc_init(void); diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h index 96e16678..5271f944 100644 --- a/lib/librte_eal/common/eal_options.h +++ b/lib/librte_eal/common/eal_options.h @@ -63,6 +63,8 @@ enum { OPT_LEGACY_MEM_NUM, #define OPT_SINGLE_FILE_SEGMENTS "single-file-segments" OPT_SINGLE_FILE_SEGMENTS_NUM, 
+#define OPT_IOVA_MODE "iova-mode" + OPT_IOVA_MODE_NUM, OPT_LONG_MAX_NUM }; diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h index 4f809a83..442c6dc4 100644 --- a/lib/librte_eal/common/eal_private.h +++ b/lib/librte_eal/common/eal_private.h @@ -259,18 +259,6 @@ struct rte_bus *rte_bus_find_by_device_name(const char *str); int rte_mp_channel_init(void); /** - * Internal Executes all the user application registered callbacks for - * the specific device. It is for DPDK internal user only. User - * application should not call it directly. - * - * @param device_name - * The device name. - * @param event - * the device event type. - */ -void dev_callback_process(char *device_name, enum rte_dev_event_type event); - -/** * @internal * Parse a device string and store its information in an * rte_devargs structure. @@ -304,4 +292,82 @@ int rte_devargs_layers_parse(struct rte_devargs *devargs, const char *devstr); +/* + * probe a device at local process. + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @param new_dev + * new device be probed as output. + * @return + * 0 on success, negative on error. + */ +int local_dev_probe(const char *devargs, struct rte_device **new_dev); + +/** + * Hotplug remove a given device from a specific bus at local process. + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. + */ +int local_dev_remove(struct rte_device *dev); + +/** + * Iterate over all buses to find the corresponding bus to handle the sigbus + * error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 success to handle the sigbus. + * -1 failed to handle the sigbus + * 1 no bus can handler the sigbus + */ +int rte_bus_sigbus_handler(const void *failure_addr); + +/** + * @internal + * Register the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. 
+ */ +int +dev_sigbus_handler_register(void); + +/** + * @internal + * Unregister the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_unregister(void); + +/** + * Check if the option is registered. + * + * @param option + * The option to be parsed. + * + * @return + * 0 on success + * @return + * -1 on fail + */ +int +rte_option_parse(const char *opt); + +/** + * Iterate through the registered options and execute the associated + * callback if enabled. + */ +void +rte_option_init(void); + #endif /* _EAL_PRIVATE_H_ */ diff --git a/lib/librte_eal/common/hotplug_mp.c b/lib/librte_eal/common/hotplug_mp.c new file mode 100644 index 00000000..84f59d95 --- /dev/null +++ b/lib/librte_eal/common/hotplug_mp.c @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ +#include <string.h> + +#include <rte_eal.h> +#include <rte_alarm.h> +#include <rte_string_fns.h> +#include <rte_devargs.h> + +#include "hotplug_mp.h" +#include "eal_private.h" + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +struct mp_reply_bundle { + struct rte_mp_msg msg; + void *peer; +}; + +static int cmp_dev_name(const struct rte_device *dev, const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +/** + * Secondary to primary request. + * start from function eal_dev_hotplug_request_to_primary. + * + * device attach on secondary: + * a) secondary send sync request to the primary. + * b) primary receive the request and attach the new device if + * failed goto i). + * c) primary forward attach sync request to all secondary. + * d) secondary receive the request and attach the device and send a reply. + * e) primary check the reply if all success goes to j). + * f) primary send attach rollback sync request to all secondary. + * g) secondary receive the request and detach the device and send a reply. 
+ * h) primary receive the reply and detach device as rollback action. + * i) send attach fail to secondary as a reply of step a), goto k). + * j) send attach success to secondary as a reply of step a). + * k) secondary receive reply and return. + * + * device detach on secondary: + * a) secondary send sync request to the primary. + * b) primary send detach sync request to all secondary. + * c) secondary detach the device and send a reply. + * d) primary check the reply if all success goes to g). + * e) primary send detach rollback sync request to all secondary. + * f) secondary receive the request and attach back device. goto h). + * g) primary detach the device if success goto i), else goto e). + * h) primary send detach fail to secondary as a reply of step a), goto j). + * i) primary send detach success to secondary as a reply of step a). + * j) secondary receive reply and return. + */ + +static int +send_response_to_secondary(const struct eal_dev_mp_req *req, + int result, + const void *peer) +{ + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + int ret; + + memset(&mp_resp, 0, sizeof(mp_resp)); + mp_resp.len_param = sizeof(*resp); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + memcpy(resp, req, sizeof(*req)); + resp->result = result; + + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + return ret; +} + +static void +__handle_secondary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + const struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req tmp_req; + struct rte_devargs *da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + tmp_req = *req; + + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + ret = local_dev_probe(req->devargs, &dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug 
add device on primary\n"); + if (ret != -EEXIST) + goto finish; + } + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + if (tmp_req.result != 0) { + ret = tmp_req.result; + RTE_LOG(ERR, EAL, "Failed to hotplug add device on secondary\n"); + if (ret != -EEXIST) + goto rollback; + } + } else if (req->t == EAL_DEV_REQ_TYPE_DETACH) { + da = calloc(1, sizeof(*da)); + if (da == NULL) { + ret = -ENOMEM; + goto finish; + } + + ret = rte_devargs_parse(da, req->devargs); + if (ret != 0) + goto finish; + + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + bus = rte_bus_find_by_name(da->bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name); + ret = -ENOENT; + goto finish; + } + + dev = bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name); + ret = -ENOENT; + goto finish; + } + + if (tmp_req.result != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on secondary\n"); + ret = tmp_req.result; + if (ret != -ENOENT) + goto rollback; + } + + ret = local_dev_remove(dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on primary\n"); + if (ret != -ENOENT) + goto rollback; + } + } else { + RTE_LOG(ERR, EAL, "unsupported secondary to primary request\n"); + ret = -ENOTSUP; + } + goto finish; + +rollback: + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + tmp_req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + local_dev_remove(dev); + } else { + tmp_req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + } + +finish: + ret = send_response_to_secondary(&tmp_req, ret, bundle->peer); + if (ret) + 
RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_secondary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct mp_reply_bundle *bundle; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + int ret = 0; + + bundle = malloc(sizeof(*bundle)); + if (bundle == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + return send_response_to_secondary(req, -ENOMEM, peer); + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = strdup(peer); + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. + */ + ret = rte_eal_alarm_set(1, __handle_secondary_request, bundle); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to add mp task\n"); + return send_response_to_secondary(req, ret, peer); + } + return 0; +} + +static void __handle_primary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct rte_devargs *da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + + switch (req->t) { + case EAL_DEV_REQ_TYPE_ATTACH: + case EAL_DEV_REQ_TYPE_DETACH_ROLLBACK: + ret = local_dev_probe(req->devargs, &dev); + break; + case EAL_DEV_REQ_TYPE_DETACH: + case EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK: + da = calloc(1, sizeof(*da)); + if (da == NULL) { + ret = -ENOMEM; + goto quit; + } + + ret = rte_devargs_parse(da, req->devargs); + if (ret != 0) + goto quit; + + bus = rte_bus_find_by_name(da->bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus 
(%s)\n", da->bus->name); + ret = -ENOENT; + goto quit; + } + + dev = bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name); + ret = -ENOENT; + goto quit; + } + + ret = local_dev_remove(dev); +quit: + break; + default: + ret = -EINVAL; + } + + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + resp->result = ret; + if (rte_mp_reply(&mp_resp, bundle->peer) < 0) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_primary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct rte_mp_msg mp_resp; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct mp_reply_bundle *bundle; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + + bundle = calloc(1, sizeof(*bundle)); + if (bundle == NULL) { + resp->result = -ENOMEM; + ret = rte_mp_reply(&mp_resp, peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = (void *)strdup(peer); + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. 
+ */ + ret = rte_eal_alarm_set(1, __handle_primary_request, bundle); + if (ret != 0) { + resp->result = ret; + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + } + return 0; +} + +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + struct eal_dev_mp_req *resp; + int ret; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret || mp_reply.nb_received != 1) { + RTE_LOG(ERR, EAL, "cannot send request to primary"); + if (!ret) + return -1; + return ret; + } + + resp = (struct eal_dev_mp_req *)mp_reply.msgs[0].param; + req->result = resp->result; + + return ret; +} + +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + int ret; + int i; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret != 0) { + RTE_LOG(ERR, EAL, "rte_mp_request_sync failed\n"); + return ret; + } + + if (mp_reply.nb_sent != mp_reply.nb_received) { + RTE_LOG(ERR, EAL, "not all secondary reply\n"); + return -1; + } + + req->result = 0; + for (i = 0; i < mp_reply.nb_received; i++) { + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_reply.msgs[i].param; + if (resp->result != 0) { + req->result = resp->result; + if (req->t == EAL_DEV_REQ_TYPE_ATTACH && + req->result != -EEXIST) + break; + if (req->t == EAL_DEV_REQ_TYPE_DETACH 
&& + req->result != -ENOENT) + break; + } + } + + return 0; +} + +int rte_mp_dev_hotplug_init(void) +{ + int ret; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_secondary_request); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } else { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_primary_request); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } + + return 0; +} diff --git a/lib/librte_eal/common/hotplug_mp.h b/lib/librte_eal/common/hotplug_mp.h new file mode 100644 index 00000000..597fde3d --- /dev/null +++ b/lib/librte_eal/common/hotplug_mp.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _HOTPLUG_MP_H_ +#define _HOTPLUG_MP_H_ + +#include "rte_dev.h" +#include "rte_bus.h" + +#define EAL_DEV_MP_ACTION_REQUEST "eal_dev_mp_request" +#define EAL_DEV_MP_ACTION_RESPONSE "eal_dev_mp_response" + +#define EAL_DEV_MP_DEV_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN +#define EAL_DEV_MP_BUS_NAME_MAX_LEN 32 +#define EAL_DEV_MP_DEV_ARGS_MAX_LEN 128 + +enum eal_dev_req_type { + EAL_DEV_REQ_TYPE_ATTACH, + EAL_DEV_REQ_TYPE_DETACH, + EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK, + EAL_DEV_REQ_TYPE_DETACH_ROLLBACK, +}; + +struct eal_dev_mp_req { + enum eal_dev_req_type t; + char devargs[EAL_DEV_MP_DEV_ARGS_MAX_LEN]; + int result; +}; + +/** + * This is a synchronous wrapper for secondary process send + * request to primary process, this is invoked when an attach + * or detach request is issued from primary process. + */ +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req); + +/** + * this is a synchronous wrapper for primary process send + * request to secondary process, this is invoked when an attach + * or detach request issued from secondary process. 
+ */ +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req); + + +#endif /* _HOTPLUG_MP_H_ */ diff --git a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h index c4f974fe..859b0974 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h @@ -29,8 +29,8 @@ extern "C" { #ifndef RTE_ARM_EAL_RDTSC_USE_PMU /** - * This call is easily portable to any ARM architecture, however, - * it may be damn slow and inprecise for some tasks. + * This call is easily portable to any architecture, however, + * it may require a system call and inprecise for some tasks. */ static inline uint64_t __rte_rdtsc_syscall(void) diff --git a/lib/librte_eal/common/include/arch/ppc_64/meson.build b/lib/librte_eal/common/include/arch/ppc_64/meson.build new file mode 100644 index 00000000..00f96117 --- /dev/null +++ b/lib/librte_eal/common/include/arch/ppc_64/meson.build @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2018 Luca Boccassi <bluca@debian.org> + +install_headers( + 'rte_atomic.h', + 'rte_byteorder.h', + 'rte_cpuflags.h', + 'rte_cycles.h', + 'rte_io.h', + 'rte_memcpy.h', + 'rte_pause.h', + 'rte_prefetch.h', + 'rte_rwlock.h', + 'rte_spinlock.h', + 'rte_vect.h', + subdir: get_option('include_subdir_arch')) diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h index 8bd83576..16e47ce2 100644 --- a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h +++ b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h @@ -9,10 +9,17 @@ extern "C" { #endif +#include "rte_atomic.h" + #include "generic/rte_pause.h" static inline void rte_pause(void) { + /* Set hardware multi-threading low priority */ + asm volatile("or 1,1,1"); + /* Set hardware multi-threading medium priority */ + asm volatile("or 2,2,2"); + rte_compiler_barrier(); } #ifdef __cplusplus diff 
--git a/lib/librte_eal/common/include/generic/rte_cycles.h b/lib/librte_eal/common/include/generic/rte_cycles.h index 0ff1af50..ac379e87 100644 --- a/lib/librte_eal/common/include/generic/rte_cycles.h +++ b/lib/librte_eal/common/include/generic/rte_cycles.h @@ -13,6 +13,7 @@ */ #include <stdint.h> +#include <rte_compat.h> #include <rte_debug.h> #include <rte_atomic.h> @@ -158,6 +159,16 @@ rte_delay_ms(unsigned ms) void rte_delay_us_block(unsigned int us); /** + * Delay function that uses system sleep. + * Does not block the CPU core. + * + * @param us + * Number of microseconds to wait. + */ +void __rte_experimental +rte_delay_us_sleep(unsigned int us); + +/** * Replace rte_delay_us with user defined function. * * @param userfunc diff --git a/lib/librte_eal/common/include/rte_bitmap.h b/lib/librte_eal/common/include/rte_bitmap.h index d9facc64..7a36ce73 100644 --- a/lib/librte_eal/common/include/rte_bitmap.h +++ b/lib/librte_eal/common/include/rte_bitmap.h @@ -88,7 +88,7 @@ __rte_bitmap_index1_inc(struct rte_bitmap *bmp) static inline uint64_t __rte_bitmap_mask1_get(struct rte_bitmap *bmp) { - return (~1lu) << bmp->offset1; + return (~1llu) << bmp->offset1; } static inline void @@ -317,7 +317,7 @@ rte_bitmap_get(struct rte_bitmap *bmp, uint32_t pos) index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2; offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK; slab2 = bmp->array2 + index2; - return (*slab2) & (1lu << offset2); + return (*slab2) & (1llu << offset2); } /** @@ -342,8 +342,8 @@ rte_bitmap_set(struct rte_bitmap *bmp, uint32_t pos) slab2 = bmp->array2 + index2; slab1 = bmp->array1 + index1; - *slab2 |= 1lu << offset2; - *slab1 |= 1lu << offset1; + *slab2 |= 1llu << offset2; + *slab1 |= 1llu << offset1; } /** @@ -370,7 +370,7 @@ rte_bitmap_set_slab(struct rte_bitmap *bmp, uint32_t pos, uint64_t slab) slab1 = bmp->array1 + index1; *slab2 |= slab; - *slab1 |= 1lu << offset1; + *slab1 |= 1llu << offset1; } static inline uint64_t @@ -408,7 +408,7 @@ rte_bitmap_clear(struct 
rte_bitmap *bmp, uint32_t pos) slab2 = bmp->array2 + index2; /* Return if array2 slab is not all-zeros */ - *slab2 &= ~(1lu << offset2); + *slab2 &= ~(1llu << offset2); if (*slab2){ return; } @@ -424,7 +424,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos) index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2); offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK; slab1 = bmp->array1 + index1; - *slab1 &= ~(1lu << offset1); + *slab1 &= ~(1llu << offset1); return; } diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h index b7b5b084..6be4b5ca 100644 --- a/lib/librte_eal/common/include/rte_bus.h +++ b/lib/librte_eal/common/include/rte_bus.h @@ -168,6 +168,35 @@ typedef int (*rte_bus_unplug_t)(struct rte_device *dev); typedef int (*rte_bus_parse_t)(const char *name, void *addr); /** + * Implement a specific hot-unplug handler, which is responsible for + * handle the failure when device be hot-unplugged. When the event of + * hot-unplug be detected, it could call this function to handle + * the hot-unplug failure and avoid app crash. + * @param dev + * Pointer of the device structure. + * + * @return + * 0 on success. + * !0 on error. + */ +typedef int (*rte_bus_hot_unplug_handler_t)(struct rte_device *dev); + +/** + * Implement a specific sigbus handler, which is responsible for handling + * the sigbus error which is either original memory error, or specific memory + * error that caused of device be hot-unplugged. When sigbus error be captured, + * it could call this function to handle sigbus error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 for success handle the sigbus for hot-unplug. + * 1 for not process it, because it is a generic sigbus error. + * -1 for failed to handle the sigbus for hot-unplug. 
+ */ +typedef int (*rte_bus_sigbus_handler_t)(const void *failure_addr); + +/** * Bus scan policies */ enum rte_bus_scan_mode { @@ -212,6 +241,11 @@ struct rte_bus { struct rte_bus_conf conf; /**< Bus configuration */ rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */ rte_dev_iterate_t dev_iterate; /**< Device iterator. */ + rte_bus_hot_unplug_handler_t hot_unplug_handler; + /**< handle hot-unplug failure on the bus */ + rte_bus_sigbus_handler_t sigbus_handler; + /**< handle sigbus error on the bus */ + }; /** diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h index 069c13ec..cba7bbc1 100644 --- a/lib/librte_eal/common/include/rte_common.h +++ b/lib/librte_eal/common/include/rte_common.h @@ -68,6 +68,11 @@ typedef uint16_t unaligned_uint16_t; /******* Macro to mark functions and fields scheduled for removal *****/ #define __rte_deprecated __attribute__((__deprecated__)) +/** + * Mark a function or variable to a weak reference. + */ +#define __rte_weak __attribute__((__weak__)) + /*********** Macros to eliminate unused variable warnings ********/ /** @@ -164,6 +169,12 @@ static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void) */ #define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2)) +/** + * Workaround to cast a const field of a structure to non-const type. 
+ */ +#define RTE_CAST_FIELD(var, field, type) \ + (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field))) + /*********** Macros/static functions for doing alignment ********/ diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h index b80a8059..cd6c187c 100644 --- a/lib/librte_eal/common/include/rte_dev.h +++ b/lib/librte_eal/common/include/rte_dev.h @@ -39,7 +39,7 @@ struct rte_dev_event { char *devname; /**< device name */ }; -typedef void (*rte_dev_event_cb_fn)(char *device_name, +typedef void (*rte_dev_event_cb_fn)(const char *device_name, enum rte_dev_event_type event, void *cb_arg); @@ -156,63 +156,67 @@ struct rte_driver { struct rte_device { TAILQ_ENTRY(rte_device) next; /**< Next device */ const char *name; /**< Device name */ - const struct rte_driver *driver;/**< Associated driver */ + const struct rte_driver *driver; /**< Driver assigned after probing */ + const struct rte_bus *bus; /**< Bus handle assigned on scan */ int numa_node; /**< NUMA node connection */ - struct rte_devargs *devargs; /**< Device user arguments */ + struct rte_devargs *devargs; /**< Arguments for latest probing */ }; /** - * Attach a device to a registered driver. + * @warning + * @b EXPERIMENTAL: this API may change without prior notice * - * @param name - * The device name, that refers to a pci device (or some private - * way of designating a vdev device). Based on this device name, eal - * will identify a driver capable of handling it and pass it to the - * driver probing function. - * @param devargs - * Device arguments to be passed to the driver. - * @return - * 0 on success, negative on error. - */ -__rte_deprecated -int rte_eal_dev_attach(const char *name, const char *devargs); - -/** - * Detach a device from its driver. + * Query status of a device. * * @param dev - * A pointer to a rte_device structure. + * Generic device pointer. * @return - * 0 on success, negative on error. 
+ * (int)true if already probed successfully, 0 otherwise. */ -__rte_deprecated -int rte_eal_dev_detach(struct rte_device *dev); +__rte_experimental +int rte_dev_is_probed(const struct rte_device *dev); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Hotplug add a given device to a specific bus. * + * In multi-process, it will request other processes to add the same device. + * A failure, in any process, will rollback the action + * * @param busname * The bus name the device is added to. * @param devname * The device name. Based on this device name, eal will identify a driver * capable of handling it and pass it to the driver probing function. - * @param devargs + * @param drvargs * Device arguments to be passed to the driver. * @return * 0 on success, negative on error. */ -int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, - const char *devargs); +int rte_eal_hotplug_add(const char *busname, const char *devname, + const char *drvargs); /** * @warning * @b EXPERIMENTAL: this API may change without prior notice * + * Add matching devices. + * + * In multi-process, it will request other processes to add the same device. + * A failure, in any process, will rollback the action + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @return + * 0 on success, negative on error. + */ +int __rte_experimental rte_dev_probe(const char *devargs); + +/** * Hotplug remove a given device from a specific bus. * + * In multi-process, it will request other processes to remove the same device. + * A failure, in any process, will rollback the action + * * @param busname * The bus name the device is removed from. * @param devname @@ -220,8 +224,23 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn * @return * 0 on success, negative on error. 
*/ -int __rte_experimental rte_eal_hotplug_remove(const char *busname, - const char *devname); +int rte_eal_hotplug_remove(const char *busname, const char *devname); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Remove one device. + * + * In multi-process, it will request other processes to remove the same device. + * A failure, in any process, will rollback the action + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. + */ +int __rte_experimental rte_dev_remove(struct rte_device *dev); /** * Device comparison function. @@ -438,6 +457,22 @@ rte_dev_event_callback_unregister(const char *device_name, * @warning * @b EXPERIMENTAL: this API may change without prior notice * + * Executes all the user application registered callbacks for + * the specific device. + * + * @param device_name + * The device name. + * @param event + * the device event type. + */ +void __rte_experimental +rte_dev_event_callback_process(const char *device_name, + enum rte_dev_event_type event); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * * Start the device event monitoring. * * @return @@ -460,4 +495,30 @@ rte_dev_event_monitor_start(void); int __rte_experimental rte_dev_event_monitor_stop(void); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Enable hotplug handling for devices. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_hotplug_handle_enable(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Disable hotplug handling for devices. + * + * @return + * - On success, zero. + * - On failure, a negative value. 
+ */ +int __rte_experimental +rte_dev_hotplug_handle_disable(void); + #endif /* _RTE_DEV_H_ */ diff --git a/lib/librte_eal/common/include/rte_devargs.h b/lib/librte_eal/common/include/rte_devargs.h index 097a4ce7..b1f121f8 100644 --- a/lib/librte_eal/common/include/rte_devargs.h +++ b/lib/librte_eal/common/include/rte_devargs.h @@ -67,36 +67,6 @@ struct rte_devargs { }; /** - * @deprecated - * Parse a devargs string. - * - * For PCI devices, the format of arguments string is "PCI_ADDR" or - * "PCI_ADDR,key=val,key2=val2,...". Examples: "08:00.1", "0000:5:00.0", - * "04:00.0,arg=val". - * - * For virtual devices, the format of arguments string is "DRIVER_NAME*" - * or "DRIVER_NAME*,key=val,key2=val2,...". Examples: "net_ring", - * "net_ring0", "net_pmdAnything,arg=0:arg2=1". - * - * The function parses the arguments string to get driver name and driver - * arguments. - * - * @param devargs_str - * The arguments as given by the user. - * @param drvname - * The pointer to the string to store parsed driver name. - * @param drvargs - * The pointer to the string to store parsed driver arguments. - * - * @return - * - 0 on success - * - A negative value on error - */ -__rte_deprecated -int rte_eal_parse_devargs_str(const char *devargs_str, - char **drvname, char **drvargs); - -/** * Parse a device string. * * Verify that a bus is capable of handling the device passed @@ -202,32 +172,12 @@ __rte_experimental int rte_devargs_add(enum rte_devtype devtype, const char *devargs_str); /** - * @deprecated - * Add a device to the user device list - * See rte_devargs_parse() for details. - * - * @param devtype - * The type of the device. - * @param devargs_str - * The arguments as given by the user. - * - * @return - * - 0 on success - * - A negative value on error - */ -__rte_deprecated -int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); - -/** * Remove a device from the user device list. * Its resources are freed. 
* If the devargs cannot be found, nothing happens. * - * @param busname - * bus name of the devargs to remove. - * - * @param devname - * device name of the devargs to remove. + * @param devargs + * The instance or a copy of devargs to remove. * * @return * 0 on success. @@ -235,8 +185,7 @@ int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); * >0 if the devargs was not within the user device list. */ __rte_experimental -int rte_devargs_remove(const char *busname, - const char *devname); +int rte_devargs_remove(struct rte_devargs *devargs); /** * Count the number of user devices of a specified type @@ -252,20 +201,6 @@ unsigned int rte_devargs_type_count(enum rte_devtype devtype); /** - * @deprecated - * Count the number of user devices of a specified type - * - * @param devtype - * The type of the devices to counted. - * - * @return - * The number of devices. - */ -__rte_deprecated -unsigned int -rte_eal_devargs_type_count(enum rte_devtype devtype); - -/** * This function dumps the list of user device and their arguments. * * @param f @@ -275,16 +210,6 @@ __rte_experimental void rte_devargs_dump(FILE *f); /** - * @deprecated - * This function dumps the list of user device and their arguments. - * - * @param f - * A pointer to a file for output - */ -__rte_deprecated -void rte_eal_devargs_dump(FILE *f); - -/** * Find next rte_devargs matching the provided bus name. * * @param busname diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h index e114dcbd..a0cedd57 100644 --- a/lib/librte_eal/common/include/rte_eal.h +++ b/lib/librte_eal/common/include/rte_eal.h @@ -316,7 +316,7 @@ rte_mp_sendmsg(struct rte_mp_msg *msg); * * @param reply * The reply argument will be for storing all the replied messages; - * the caller is responsible for free reply->replies. + * the caller is responsible for free reply->msgs. * * @param ts * The ts argument specifies how long we can wait for the peer(s) to reply. 
@@ -378,6 +378,15 @@ int __rte_experimental rte_mp_reply(struct rte_mp_msg *msg, const char *peer); /** + * Register all mp action callbacks for hotplug. + * + * @return + * 0 on success, negative on error. + */ +int __rte_experimental +rte_mp_dev_hotplug_init(void); + +/** * Usage function typedef used by the application usage function. * * Use this function typedef to define and call rte_set_application_usage_hook() @@ -498,6 +507,15 @@ enum rte_iova_mode rte_eal_iova_mode(void); const char * rte_eal_mbuf_user_pool_ops(void); +/** + * Get the runtime directory of DPDK + * + * @return + * The runtime directory path of DPDK + */ +const char * +rte_eal_get_runtime_dir(void); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h index 6eb49327..9d302f41 100644 --- a/lib/librte_eal/common/include/rte_eal_interrupts.h +++ b/lib/librte_eal/common/include/rte_eal_interrupts.h @@ -35,6 +35,7 @@ enum rte_intr_handle_type { RTE_INTR_HANDLE_EXT, /**< external handler */ RTE_INTR_HANDLE_VDEV, /**< virtual device */ RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */ + RTE_INTR_HANDLE_VFIO_REQ, /**< VFIO request handle */ RTE_INTR_HANDLE_MAX /**< count of elements */ }; diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h index aff0688d..84aabe36 100644 --- a/lib/librte_eal/common/include/rte_eal_memconfig.h +++ b/lib/librte_eal/common/include/rte_eal_memconfig.h @@ -30,9 +30,11 @@ struct rte_memseg_list { uint64_t addr_64; /**< Makes sure addr is always 64-bits */ }; - int socket_id; /**< Socket ID for all memsegs in this list. */ uint64_t page_sz; /**< Page size for all memsegs in this list. */ + int socket_id; /**< Socket ID for all memsegs in this list. */ volatile uint32_t version; /**< version number for multiprocess sync. */ + size_t len; /**< Length of memory area covered by this memseg list. 
*/ + unsigned int external; /**< 1 if this list points to external memory */ struct rte_fbarray memseg_arr; }; @@ -70,13 +72,23 @@ struct rte_mem_config { struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */ - /* Heaps of Malloc per socket */ - struct malloc_heap malloc_heaps[RTE_MAX_NUMA_NODES]; + /* Heaps of Malloc */ + struct malloc_heap malloc_heaps[RTE_MAX_HEAPS]; + + /* next socket ID for external malloc heap */ + int next_socket_id; /* address of mem_config in primary process. used to map shared config into * exact same address the primary process maps it. */ uint64_t mem_cfg_addr; + + /* legacy mem and single file segments options are shared */ + uint32_t legacy_mem; + uint32_t single_file_segments; + + /* keeps the more restricted dma mask */ + uint8_t dma_maskbits; } __attribute__((__packed__)); diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h index a9fb7e45..7249e6aa 100644 --- a/lib/librte_eal/common/include/rte_malloc.h +++ b/lib/librte_eal/common/include/rte_malloc.h @@ -264,6 +264,198 @@ rte_malloc_get_socket_stats(int socket, struct rte_malloc_socket_stats *socket_stats); /** + * Add memory chunk to a heap with specified name. + * + * @note Multiple memory chunks can be added to the same heap + * + * @note Before accessing this memory in other processes, it needs to be + * attached in each of those processes by calling + * ``rte_malloc_heap_memory_attach`` in each other process. + * + * @note Memory must be previously allocated for DPDK to be able to use it as a + * malloc heap. Failing to do so will result in undefined behavior, up to and + * including segmentation faults. + * + * @note Calling this function will erase any contents already present at the + * supplied memory address. 
+ * + * @param heap_name + * Name of the heap to add memory chunk to + * @param va_addr + * Start of virtual area to add to the heap + * @param len + * Length of virtual area to add to the heap + * @param iova_addrs + * Array of page IOVA addresses corresponding to each page in this memory + * area. Can be NULL, in which case page IOVA addresses will be set to + * RTE_BAD_IOVA. + * @param n_pages + * Number of elements in the iova_addrs array. Ignored if ``iova_addrs`` + * is NULL. + * @param page_sz + * Page size of the underlying memory + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to add memory to a reserved heap + * ENOSPC - no more space in internal config to store a new memory chunk + */ +int __rte_experimental +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz); + +/** + * Remove memory chunk from heap with specified name. + * + * @note Memory chunk being removed must be the same as one that was added; + * partially removing memory chunks is not supported + * + * @note Memory area must not contain any allocated elements to allow its + * removal from the heap + * + * @note All other processes must detach from the memory chunk prior to it being + * removed from the heap. 
+ * + * @param heap_name + * Name of the heap to remove memory from + * @param va_addr + * Virtual address to remove from the heap + * @param len + * Length of virtual area to remove from the heap + * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to remove memory from a reserved heap + * ENOENT - heap or memory chunk was not found + * EBUSY - memory chunk still contains data + */ +int __rte_experimental +rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len); + +/** + * Attach to an already existing chunk of external memory in another process. + * + * @note This function must be called before any attempt is made to use an + * already existing external memory chunk. This function does *not* need to + * be called if a call to ``rte_malloc_heap_memory_add`` was made in the + * current process. + * + * @param heap_name + * Heap name to which this chunk of memory belongs + * @param va_addr + * Start address of memory chunk to attach to + * @param len + * Length of memory chunk to attach to + * @return + * 0 on successful attach + * -1 on unsuccessful attach, with rte_errno set to indicate cause for error: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to attach memory to a reserved heap + * ENOENT - heap or memory chunk was not found + */ +int __rte_experimental +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len); + +/** + * Detach from a chunk of external memory in secondary process. + * + * @note This function must be called before any attempt is made to remove + * external memory from the heap in another process. This function does *not* + * need to be called if a call to ``rte_malloc_heap_memory_remove`` will be + * made in the current process. 
+ * + * @param heap_name + * Heap name to which this chunk of memory belongs + * @param va_addr + * Start address of memory chunk to detach from + * @param len + * Length of memory chunk to detach from + * @return + * 0 on successful detach + * -1 on unsuccessful detach, with rte_errno set to indicate cause for error: + * EINVAL - one of the parameters was invalid + * EPERM - attempted to detach memory from a reserved heap + * ENOENT - heap or memory chunk was not found + */ +int __rte_experimental +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len); + +/** + * Creates a new empty malloc heap with a specified name. + * + * @note Heaps created via this call will automatically get assigned a unique + * socket ID, which can be found using ``rte_malloc_heap_get_socket()`` + * + * @param heap_name + * Name of the heap to create. + * + * @return + * - 0 on successful creation + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``heap_name`` was NULL, empty or too long + * EEXIST - heap by name of ``heap_name`` already exists + * ENOSPC - no more space in internal config to store a new heap + */ +int __rte_experimental +rte_malloc_heap_create(const char *heap_name); + +/** + * Destroys a previously created malloc heap with specified name. + * + * @note This function will return a failure result if not all memory allocated + * from the heap has been freed back to the heap + * + * @note This function will return a failure result if not all memory segments + * were removed from the heap prior to its destruction + * + * @param heap_name + * Name of the heap to destroy. 
+ * + * @return + * - 0 on success + * - -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``heap_name`` was NULL, empty or too long + * ENOENT - heap by the name of ``heap_name`` was not found + * EPERM - attempting to destroy reserved heap + * EBUSY - heap still contains data + */ +int __rte_experimental +rte_malloc_heap_destroy(const char *heap_name); + +/** + * Find socket ID corresponding to a named heap. + * + * @param name + * Heap name to find socket ID for + * @return + * Socket ID in case of success (a non-negative number) + * -1 in case of error, with rte_errno set to one of the following: + * EINVAL - ``name`` was NULL + * ENOENT - heap identified by the name ``name`` was not found + */ +int __rte_experimental +rte_malloc_heap_get_socket(const char *name); + +/** + * Check if a given socket ID refers to externally allocated memory. + * + * @note Passing SOCKET_ID_ANY will return 0. + * + * @param socket_id + * Socket ID to check + * @return + * 1 if socket ID refers to externally allocated memory + * 0 if socket ID refers to internal DPDK memory + * -1 if socket ID is invalid + */ +int __rte_experimental +rte_malloc_heap_socket_is_external(int socket_id); + +/** * Dump statistics. * * Dump for the specified type to a file. If the type argument is diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h index d43fa909..4a7e0eb1 100644 --- a/lib/librte_eal/common/include/rte_malloc_heap.h +++ b/lib/librte_eal/common/include/rte_malloc_heap.h @@ -12,6 +12,7 @@ /* Number of free lists per heap, grouped by size. 
*/ #define RTE_HEAP_NUM_FREELISTS 13 +#define RTE_HEAP_NAME_MAX_LEN 32 /* dummy definition, for pointers */ struct malloc_elem; @@ -26,7 +27,9 @@ struct malloc_heap { struct malloc_elem *volatile last; unsigned alloc_count; + unsigned int socket_id; size_t total_size; + char name[RTE_HEAP_NAME_MAX_LEN]; } __rte_cache_aligned; #endif /* _RTE_MALLOC_HEAP_H_ */ diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index c4b7f4cf..ce937058 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -215,6 +215,9 @@ typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl, * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * * @param func * Iterator function * @param arg @@ -233,6 +236,9 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg); * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. + * * @param func * Iterator function * @param arg @@ -251,6 +257,9 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg); * @note This function read-locks the memory hotplug subsystem, and thus cannot * be used within memory-related callback functions. * + * @note This function will also walk through externally allocated segments. It + * is up to the user to decide whether to skip through these segments. 
+ * * @param func * Iterator function * @param arg @@ -318,6 +327,103 @@ int __rte_experimental rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg); /** + * Return file descriptor associated with a particular memseg (if available). + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @note This returns an internal file descriptor. Performing any operations on + * this file descriptor is inherently dangerous, so it should be treated + * as read-only for all intents and purposes. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd(const struct rte_memseg *ms); + +/** + * Return file descriptor associated with a particular memseg (if available). + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @note This returns an internal file descriptor. Performing any operations on + * this file descriptor is inherently dangerous, so it should be treated + * as read-only for all intents and purposes. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * + * @return + * Valid file descriptor in case of success. 
+ * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms); + +/** + * Get offset into segment file descriptor associated with a particular memseg + * (if available). + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * @param offset + * A pointer to offset value where the result will be stored. + * + * @return + * Valid file descriptor in case of success. + * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - EINVAL - ``offset`` pointer was NULL + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset); + +/** + * Get offset into segment file descriptor associated with a particular memseg + * (if available). + * + * @note This function does not perform any locking, and is only safe to call + * from within memory-related callback functions. + * + * @param ms + * A pointer to memseg for which to get file descriptor. + * @param offset + * A pointer to offset value where the result will be stored. + * + * @return + * Valid file descriptor in case of success. 
+ * -1 in case of error, with ``rte_errno`` set to the following values: + * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg + * - EINVAL - ``offset`` pointer was NULL + * - ENODEV - ``ms`` fd is not available + * - ENOENT - ``ms`` is an unused segment + * - ENOTSUP - segment fd's are not supported + */ +int __rte_experimental +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset); + +/** * Dump the physical memory layout to a file. * * @note This function read-locks the memory hotplug subsystem, and thus cannot @@ -357,6 +463,9 @@ unsigned rte_memory_get_nchannel(void); */ unsigned rte_memory_get_nrank(void); +/* check memsegs iovas are within a range based on dma mask */ +int __rte_experimental rte_eal_check_dma_mask(uint8_t maskbits); + /** * Drivers based on uio will not load unless physical * addresses are obtainable. It is only possible to get diff --git a/lib/librte_eal/common/include/rte_option.h b/lib/librte_eal/common/include/rte_option.h new file mode 100644 index 00000000..8957b970 --- /dev/null +++ b/lib/librte_eal/common/include/rte_option.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef __INCLUDE_RTE_OPTION_H__ +#define __INCLUDE_RTE_OPTION_H__ + +/** + * @file + * + * This API offers the ability to register options to the EAL command line and + * map those options to functions that will be executed at the end of EAL + * initialization. These options will be available as part of the EAL command + * line of applications and are dynamically managed. + * + * This is used primarily by DPDK libraries offering command line options. + * Currently, this API is limited to registering options without argument. + * + * The register API can be used to resolve circular dependency issues + * between EAL and the library. The library uses EAL, but is also initialized + * by EAL. Hence, EAL depends on the init function of the library. 
The API + * introduced in rte_option allows us to register the library init with EAL + * (passing a function pointer) and avoid the circular dependency. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int (*rte_option_cb)(void); + +/* + * Structure describing the EAL command line option being registered. + */ +struct rte_option { + TAILQ_ENTRY(rte_option) next; /**< Next entry in the list. */ + char *opt_str; /**< The option name. */ + rte_option_cb cb; /**< Function called when option is used. */ + int enabled; /**< Set when the option is used. */ +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register an option to the EAL command line. + * When recognized, the associated function will be executed at the end of EAL + * initialization. + * + * The associated structure must be available the whole time this option is + * registered (i.e. not stack memory). + * + * @param opt + * Structure describing the option to parse. + */ +void __rte_experimental +rte_option_register(struct rte_option *opt); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/librte_eal/common/include/rte_string_fns.h b/lib/librte_eal/common/include/rte_string_fns.h index 97597a14..9a2a1ff9 100644 --- a/lib/librte_eal/common/include/rte_string_fns.h +++ b/lib/librte_eal/common/include/rte_string_fns.h @@ -16,6 +16,7 @@ extern "C" { #endif #include <stdio.h> +#include <string.h> /** * Takes string "string" parameter and splits it at character "delim" @@ -60,12 +61,10 @@ rte_strlcpy(char *dst, const char *src, size_t size) /* pull in a strlcpy function */ #ifdef RTE_EXEC_ENV_BSDAPP -#include <string.h> #ifndef __BSD_VISIBLE /* non-standard functions are hidden */ #define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) #endif - #else /* non-BSD platforms */ #ifdef RTE_USE_LIBBSD #include <bsd/string.h> @@ -76,6 +75,29 @@ rte_strlcpy(char *dst, const char *src, size_t size) #endif /* RTE_USE_LIBBSD */ #endif /* BSDAPP */ +/** + * 
Copy string src to buffer dst of size dsize. + * At most dsize-1 chars will be copied. + * Always NUL-terminates, unless (dsize == 0). + * Returns number of bytes copied (terminating NUL-byte excluded) on success ; + * negative errno on error. + * + * @param dst + * The destination string. + * + * @param src + * The input string to be copied. + * + * @param dsize + * Length in bytes of the destination buffer. + * + * @return + * The number of bytes copied on success + * -E2BIG if the destination buffer is too small. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h index 7c6714a2..412ed2db 100644 --- a/lib/librte_eal/common/include/rte_version.h +++ b/lib/librte_eal/common/include/rte_version.h @@ -32,7 +32,7 @@ extern "C" { /** * Minor version/month number i.e. the mm in yy.mm.z */ -#define RTE_VER_MONTH 8 +#define RTE_VER_MONTH 11 /** * Patch level number i.e. 
the z in yy.mm.z @@ -42,14 +42,14 @@ extern "C" { /** * Extra string to be appended to version number */ -#define RTE_VER_SUFFIX "" +#define RTE_VER_SUFFIX "-rc" /** * Patch release number * 0-15 = release candidates * 16 = release */ -#define RTE_VER_RELEASE 16 +#define RTE_VER_RELEASE 1 /** * Macro to compute a version number usable for comparisons diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h index 5ca13fcc..cae96fab 100644 --- a/lib/librte_eal/common/include/rte_vfio.h +++ b/lib/librte_eal/common/include/rte_vfio.h @@ -14,6 +14,8 @@ extern "C" { #endif +#include <stdint.h> + /* * determine if VFIO is present on the system */ @@ -22,6 +24,9 @@ extern "C" { #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) #define VFIO_PRESENT #endif /* kernel version >= 3.6.0 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) +#define HAVE_VFIO_DEV_REQ_INTERFACE +#endif /* kernel version >= 4.0.0 */ #endif /* RTE_EAL_VFIO */ #ifdef VFIO_PRESENT @@ -44,6 +49,30 @@ extern "C" { #define RTE_VFIO_NOIOMMU 8 #endif +/* + * capabilities are only supported on kernel 4.6+. there were also some API + * changes as well, so add a macro to get cap offset. 
+ */ +#ifdef VFIO_REGION_INFO_FLAG_CAPS +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS +#define VFIO_CAP_OFFSET(x) (x->cap_offset) +#else +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3) +#define VFIO_CAP_OFFSET(x) (x->resv) +struct vfio_info_cap_header { + uint16_t id; + uint16_t version; + uint32_t next; +}; +#endif + +/* kernels 4.16+ can map BAR containing MSI-X table */ +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE +#else +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3 +#endif + #else /* not VFIO_PRESENT */ /* we don't need an actual definition, only pointer is used */ @@ -227,7 +256,7 @@ rte_vfio_get_group_num(const char *sysfs_base, const char *dev_addr, int *iommu_group_num); /** - * Open VFIO container fd or get an existing one + * Open a new VFIO container fd * * This function is only relevant to linux and will return * an error on BSD. diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c index e0a8ed15..1a74660d 100644 --- a/lib/librte_eal/common/malloc_elem.c +++ b/lib/librte_eal/common/malloc_elem.c @@ -39,10 +39,14 @@ malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align) contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align); /* if we're in IOVA as VA mode, or if we're in legacy mode with - * hugepages, all elements are IOVA-contiguous. + * hugepages, all elements are IOVA-contiguous. however, we can only + * make these assumptions about internal memory - externally allocated + * segments have to be checked. 
*/ - if (rte_eal_iova_mode() == RTE_IOVA_VA || - (internal_config.legacy_mem && rte_eal_has_hugepages())) + if (!elem->msl->external && + (rte_eal_iova_mode() == RTE_IOVA_VA || + (internal_config.legacy_mem && + rte_eal_has_hugepages()))) return RTE_PTR_DIFF(data_end, contig_seg_start); cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz); diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c index 12aaf2d7..1973b6e6 100644 --- a/lib/librte_eal/common/malloc_heap.c +++ b/lib/librte_eal/common/malloc_heap.c @@ -29,6 +29,10 @@ #include "malloc_heap.h" #include "malloc_mp.h" +/* start external socket ID's at a very high number */ +#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */ +#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES)) + static unsigned check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) { @@ -66,6 +70,21 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) return check_flag & flags; } +int +malloc_socket_to_heap_id(unsigned int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (heap->socket_id == socket_id) + return i; + } + return -1; +} + /* * Expand the heap with a memory area. 
*/ @@ -93,9 +112,17 @@ malloc_add_seg(const struct rte_memseg_list *msl, struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_memseg_list *found_msl; struct malloc_heap *heap; - int msl_idx; + int msl_idx, heap_idx; + + if (msl->external) + return 0; - heap = &mcfg->malloc_heaps[msl->socket_id]; + heap_idx = malloc_socket_to_heap_id(msl->socket_id); + if (heap_idx < 0) { + RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n"); + return -1; + } + heap = &mcfg->malloc_heaps[heap_idx]; /* msl is const, so find it */ msl_idx = msl - mcfg->memsegs; @@ -165,7 +192,9 @@ find_biggest_element(struct malloc_heap *heap, size_t *size, for (elem = LIST_FIRST(&heap->free_head[idx]); !!elem; elem = LIST_NEXT(elem, free_list)) { size_t cur_size; - if (!check_hugepage_sz(flags, elem->msl->page_sz)) + if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 && + !check_hugepage_sz(flags, + elem->msl->page_sz)) continue; if (contig) { cur_size = @@ -259,11 +288,13 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, int socket, unsigned int flags, size_t align, size_t bound, bool contig, struct rte_memseg **ms, int n_segs) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct rte_memseg_list *msl; struct malloc_elem *elem = NULL; size_t alloc_sz; int allocd_pages; void *ret, *map_addr; + uint64_t mask; alloc_sz = (size_t)pg_sz * n_segs; @@ -291,6 +322,16 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, goto fail; } + if (mcfg->dma_maskbits) { + mask = ~((1ULL << mcfg->dma_maskbits) - 1); + if (rte_eal_check_dma_mask(mask)) { + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to DMA mask\n", + __func__); + goto fail; + } + } + /* add newly minted memsegs to malloc heap */ elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz); @@ -326,11 +367,9 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz, /* we can't know in advance how many pages we'll 
need, so we malloc */ ms = malloc(sizeof(*ms) * n_segs); - - memset(ms, 0, sizeof(*ms) * n_segs); - if (ms == NULL) return -1; + memset(ms, 0, sizeof(*ms) * n_segs); elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align, bound, contig, ms, n_segs); @@ -560,12 +599,14 @@ alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket, /* this will try lower page sizes first */ static void * -heap_alloc_on_socket(const char *type, size_t size, int socket, - unsigned int flags, size_t align, size_t bound, bool contig) +malloc_heap_alloc_on_heap_id(const char *type, size_t size, + unsigned int heap_id, unsigned int flags, size_t align, + size_t bound, bool contig) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + int socket_id; void *ret; rte_spinlock_lock(&(heap->lock)); @@ -583,12 +624,28 @@ heap_alloc_on_socket(const char *type, size_t size, int socket, * we may still be able to allocate memory from appropriate page sizes, * we just need to request more memory first. */ + + socket_id = rte_socket_id_by_idx(heap_id); + /* + * if socket ID is negative, we cannot find a socket ID for this heap - + * which means it's an external heap. those can have unexpected page + * sizes, so if the user asked to allocate from there - assume user + * knows what they're doing, and allow allocating from there with any + * page size flags. 
+ */ + if (socket_id < 0) + size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY; + ret = heap_alloc(heap, type, size, size_flags, align, bound, contig); if (ret != NULL) goto alloc_unlock; - if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound, - contig)) { + /* if socket ID is invalid, this is an external heap */ + if (socket_id < 0) + goto alloc_unlock; + + if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align, + bound, contig)) { ret = heap_alloc(heap, type, size, flags, align, bound, contig); /* this should have succeeded */ @@ -604,14 +661,14 @@ void * malloc_heap_alloc(const char *type, size_t size, int socket_arg, unsigned int flags, size_t align, size_t bound, bool contig) { - int socket, i, cur_socket; + int socket, heap_id, i; void *ret; /* return NULL if size is 0 or alignment is not power-of-2 */ if (size == 0 || (align && !rte_is_power_of_2(align))) return NULL; - if (!rte_eal_has_hugepages()) + if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES) socket_arg = SOCKET_ID_ANY; if (socket_arg == SOCKET_ID_ANY) @@ -619,22 +676,25 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg, else socket = socket_arg; - /* Check socket parameter */ - if (socket >= RTE_MAX_NUMA_NODES) + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) return NULL; - ret = heap_alloc_on_socket(type, size, socket, flags, align, bound, - contig); + ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align, + bound, contig); if (ret != NULL || socket_arg != SOCKET_ID_ANY) return ret; - /* try other heaps */ + /* try other heaps. we are only iterating through native DPDK sockets, + * so external heaps won't be included. 
+ */ for (i = 0; i < (int) rte_socket_count(); i++) { - cur_socket = rte_socket_id_by_idx(i); - if (cur_socket == socket) + if (i == heap_id) continue; - ret = heap_alloc_on_socket(type, size, cur_socket, flags, - align, bound, contig); + ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align, + bound, contig); if (ret != NULL) return ret; } @@ -642,11 +702,11 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg, } static void * -heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags, - size_t align, bool contig) +heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id, + unsigned int flags, size_t align, bool contig) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; void *ret; rte_spinlock_lock(&(heap->lock)); @@ -664,7 +724,7 @@ void * malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, size_t align, bool contig) { - int socket, i, cur_socket; + int socket, i, cur_socket, heap_id; void *ret; /* return NULL if align is not power-of-2 */ @@ -679,11 +739,13 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, else socket = socket_arg; - /* Check socket parameter */ - if (socket >= RTE_MAX_NUMA_NODES) + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) return NULL; - ret = heap_alloc_biggest_on_socket(type, socket, flags, align, + ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align, contig); if (ret != NULL || socket_arg != SOCKET_ID_ANY) return ret; @@ -693,8 +755,8 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, cur_socket = rte_socket_id_by_idx(i); if (cur_socket == socket) continue; - ret = heap_alloc_biggest_on_socket(type, cur_socket, flags, - align, contig); + 
ret = heap_alloc_biggest_on_heap_id(type, i, flags, align, + contig); if (ret != NULL) return ret; } @@ -756,8 +818,10 @@ malloc_heap_free(struct malloc_elem *elem) /* anything after this is a bonus */ ret = 0; - /* ...of which we can't avail if we are in legacy mode */ - if (internal_config.legacy_mem) + /* ...of which we can't avail if we are in legacy mode, or if this is an + * externally allocated segment. + */ + if (internal_config.legacy_mem || (msl->external > 0)) goto free_unlock; /* check if we can free any memory back to the system */ @@ -914,7 +978,7 @@ malloc_heap_resize(struct malloc_elem *elem, size_t size) } /* - * Function to retrieve data for heap on given socket + * Function to retrieve data for a given heap */ int malloc_heap_get_stats(struct malloc_heap *heap, @@ -952,7 +1016,7 @@ malloc_heap_get_stats(struct malloc_heap *heap, } /* - * Function to retrieve data for heap on given socket + * Function to retrieve data for a given heap */ void malloc_heap_dump(struct malloc_heap *heap, FILE *f) @@ -973,10 +1037,216 @@ malloc_heap_dump(struct malloc_heap *heap, FILE *f) rte_spinlock_unlock(&heap->lock); } +static int +destroy_seg(struct malloc_elem *elem, size_t len) +{ + struct malloc_heap *heap = elem->heap; + struct rte_memseg_list *msl; + + msl = elem->msl; + + /* notify all subscribers that a memory area is going to be removed */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len); + + /* this element can be removed */ + malloc_elem_free_list_remove(elem); + malloc_elem_hide_region(elem, elem, len); + + heap->total_size -= len; + + memset(elem, 0, sizeof(*elem)); + + /* destroy the fbarray backing this memory */ + if (rte_fbarray_destroy(&msl->memseg_arr) < 0) + return -1; + + /* reset the memseg list */ + memset(msl, 0, sizeof(*msl)); + + return 0; +} + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz) +{ + struct rte_mem_config *mcfg = 
rte_eal_get_configuration()->mem_config; + char fbarray_name[RTE_FBARRAY_NAME_LEN]; + struct rte_memseg_list *msl = NULL; + struct rte_fbarray *arr; + size_t seg_len = n_pages * page_sz; + unsigned int i; + + /* first, find a free memseg list */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *tmp = &mcfg->memsegs[i]; + if (tmp->base_va == NULL) { + msl = tmp; + break; + } + } + if (msl == NULL) { + RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n"); + rte_errno = ENOSPC; + return -1; + } + + snprintf(fbarray_name, sizeof(fbarray_name) - 1, "%s_%p", + heap->name, va_addr); + + /* create the backing fbarray */ + if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages, + sizeof(struct rte_memseg)) < 0) { + RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n"); + return -1; + } + arr = &msl->memseg_arr; + + /* fbarray created, fill it up */ + for (i = 0; i < n_pages; i++) { + struct rte_memseg *ms; + + rte_fbarray_set_used(arr, i); + ms = rte_fbarray_get(arr, i); + ms->addr = RTE_PTR_ADD(va_addr, i * page_sz); + ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i]; + ms->hugepage_sz = page_sz; + ms->len = page_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->socket_id = heap->socket_id; + } + + /* set up the memseg list */ + msl->base_va = va_addr; + msl->page_sz = page_sz; + msl->socket_id = heap->socket_id; + msl->len = seg_len; + msl->version = 0; + msl->external = 1; + + /* erase contents of new memory */ + memset(va_addr, 0, seg_len); + + /* now, add newly minted memory to the malloc heap */ + malloc_heap_add_memory(heap, msl, va_addr, seg_len); + + heap->total_size += seg_len; + + /* all done! 
*/ + RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n", + heap->name, va_addr); + + /* notify all subscribers that a new memory area has been added */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + va_addr, seg_len); + + return 0; +} + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len) +{ + struct malloc_elem *elem = heap->first; + + /* find element with specified va address */ + while (elem != NULL && elem != va_addr) { + elem = elem->next; + /* stop if we've blown past our VA */ + if (elem > (struct malloc_elem *)va_addr) { + rte_errno = ENOENT; + return -1; + } + } + /* check if element was found */ + if (elem == NULL || elem->msl->len != len) { + rte_errno = ENOENT; + return -1; + } + /* if element's size is not equal to segment len, segment is busy */ + if (elem->state == ELEM_BUSY || elem->size != len) { + rte_errno = EBUSY; + return -1; + } + return destroy_seg(elem, len); +} + +int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint32_t next_socket_id = mcfg->next_socket_id; + + /* prevent overflow. did you really create 2 billion heaps??? 
*/ + if (next_socket_id > INT32_MAX) { + RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n"); + rte_errno = ENOSPC; + return -1; + } + + /* initialize empty heap */ + heap->alloc_count = 0; + heap->first = NULL; + heap->last = NULL; + LIST_INIT(heap->free_head); + rte_spinlock_init(&heap->lock); + heap->total_size = 0; + heap->socket_id = next_socket_id; + + /* we hold a global mem hotplug writelock, so it's safe to increment */ + mcfg->next_socket_id++; + + /* set up name */ + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + return 0; +} + +int +malloc_heap_destroy(struct malloc_heap *heap) +{ + if (heap->alloc_count != 0) { + RTE_LOG(ERR, EAL, "Heap is still in use\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->first != NULL || heap->last != NULL) { + RTE_LOG(ERR, EAL, "Heap still contains memory segments\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->total_size != 0) + RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n"); + + /* after this, the lock will be dropped */ + memset(heap, 0, sizeof(*heap)); + + return 0; +} + int rte_eal_malloc_heap_init(void) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* assign min socket ID to external heaps */ + mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID; + + /* assign names to default DPDK heaps */ + for (i = 0; i < rte_socket_count(); i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + char heap_name[RTE_HEAP_NAME_MAX_LEN]; + int socket_id = rte_socket_id_by_idx(i); + + snprintf(heap_name, sizeof(heap_name) - 1, + "socket_%i", socket_id); + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + heap->socket_id = socket_id; + } + } + if (register_mp_requests()) { RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n"); diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h index f52cb555..e48996d5 100644 --- 
a/lib/librte_eal/common/malloc_heap.h +++ b/lib/librte_eal/common/malloc_heap.h @@ -34,6 +34,20 @@ malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags, size_t align, bool contig); int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name); + +int +malloc_heap_destroy(struct malloc_heap *heap); + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz); + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len); + +int malloc_heap_free(struct malloc_elem *elem); int @@ -47,6 +61,9 @@ void malloc_heap_dump(struct malloc_heap *heap, FILE *f); int +malloc_socket_to_heap_id(unsigned int socket_id); + +int rte_eal_malloc_heap_init(void); #ifdef __cplusplus diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c index 931c14bc..5f2d4e0b 100644 --- a/lib/librte_eal/common/malloc_mp.c +++ b/lib/librte_eal/common/malloc_mp.c @@ -194,13 +194,11 @@ handle_alloc_request(const struct malloc_mp_req *m, /* we can't know in advance how many pages we'll need, so we malloc */ ms = malloc(sizeof(*ms) * n_segs); - - memset(ms, 0, sizeof(*ms) * n_segs); - if (ms == NULL) { RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n"); goto fail; } + memset(ms, 0, sizeof(*ms) * n_segs); elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket, ar->flags, ar->align, ar->bound, ar->contig, ms, diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build index 56005bea..2a10d57d 100644 --- a/lib/librte_eal/common/meson.build +++ b/lib/librte_eal/common/meson.build @@ -14,6 +14,7 @@ common_sources = files( 'eal_common_errno.c', 'eal_common_fbarray.c', 'eal_common_hexdump.c', + 'eal_common_hypervisor.c', 'eal_common_launch.c', 'eal_common_lcore.c', 'eal_common_log.c', @@ -27,11 +28,13 @@ common_sources = files( 'eal_common_thread.c', 'eal_common_timer.c', 
'eal_common_uuid.c', + 'hotplug_mp.c', 'malloc_elem.c', 'malloc_heap.c', 'malloc_mp.c', 'rte_keepalive.c', 'rte_malloc.c', + 'rte_option.c', 'rte_reciprocal.c', 'rte_service.c' ) @@ -59,6 +62,7 @@ common_headers = files( 'include/rte_errno.h', 'include/rte_fbarray.h', 'include/rte_hexdump.h', + 'include/rte_hypervisor.h', 'include/rte_interrupts.h', 'include/rte_keepalive.h', 'include/rte_launch.h', @@ -68,6 +72,7 @@ common_headers = files( 'include/rte_malloc_heap.h', 'include/rte_memory.h', 'include/rte_memzone.h', + 'include/rte_option.h', 'include/rte_pci_dev_feature_defs.h', 'include/rte_pci_dev_features.h', 'include/rte_per_lcore.h', diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c index b51a6d11..9e61dc41 100644 --- a/lib/librte_eal/common/rte_malloc.c +++ b/lib/librte_eal/common/rte_malloc.c @@ -8,6 +8,7 @@ #include <string.h> #include <sys/queue.h> +#include <rte_errno.h> #include <rte_memcpy.h> #include <rte_memory.h> #include <rte_eal.h> @@ -23,6 +24,7 @@ #include <rte_malloc.h> #include "malloc_elem.h" #include "malloc_heap.h" +#include "eal_memalloc.h" /* Free the memory space back to heap */ @@ -44,13 +46,15 @@ rte_malloc_socket(const char *type, size_t size, unsigned int align, if (size == 0 || (align && !rte_is_power_of_2(align))) return NULL; - if (!rte_eal_has_hugepages()) + /* if there are no hugepages and if we are not allocating from an + * external heap, use memory from any socket available. checking for + * socket being external may return -1 in case of invalid socket, but + * that's OK - if there are no hugepages, it doesn't matter. + */ + if (rte_malloc_heap_socket_is_external(socket_arg) != 1 && + !rte_eal_has_hugepages()) socket_arg = SOCKET_ID_ANY; - /* Check socket parameter */ - if (socket_arg >= RTE_MAX_NUMA_NODES) - return NULL; - return malloc_heap_alloc(type, size, socket_arg, 0, align == 0 ? 
1 : align, 0, false); } @@ -152,11 +156,20 @@ rte_malloc_get_socket_stats(int socket, struct rte_malloc_socket_stats *socket_stats) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int heap_idx, ret = -1; - if (socket >= RTE_MAX_NUMA_NODES || socket < 0) - return -1; + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + heap_idx = malloc_socket_to_heap_id(socket); + if (heap_idx < 0) + goto unlock; - return malloc_heap_get_stats(&mcfg->malloc_heaps[socket], socket_stats); + ret = malloc_heap_get_stats(&mcfg->malloc_heaps[heap_idx], + socket_stats); +unlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; } /* @@ -168,12 +181,75 @@ rte_malloc_dump_heaps(FILE *f) struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; unsigned int idx; - for (idx = 0; idx < rte_socket_count(); idx++) { - unsigned int socket = rte_socket_id_by_idx(idx); - fprintf(f, "Heap on socket %i:\n", socket); - malloc_heap_dump(&mcfg->malloc_heaps[socket], f); + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + fprintf(f, "Heap id: %u\n", idx); + malloc_heap_dump(&mcfg->malloc_heaps[idx], f); + } + + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +} + +int +rte_malloc_heap_get_socket(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + unsigned int idx; + int ret; + + if (name == NULL || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if (!strncmp(name, tmp->name, RTE_HEAP_NAME_MAX_LEN)) { + heap = tmp; + break; + } + } + + if (heap != NULL) { + ret = heap->socket_id; + } else { + rte_errno = ENOENT; + ret = -1; + } + 
rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_socket_is_external(int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + int ret = -1; + + if (socket_id == SOCKET_ID_ANY) + return 0; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if ((int)tmp->socket_id == socket_id) { + /* external memory always has large socket ID's */ + ret = tmp->socket_id >= RTE_MAX_NUMA_NODES; + break; + } } + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; } /* @@ -182,14 +258,20 @@ rte_malloc_dump_heaps(FILE *f) void rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) { - unsigned int socket; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int heap_id; struct rte_malloc_socket_stats sock_stats; + + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + /* Iterate through all initialised heaps */ - for (socket=0; socket< RTE_MAX_NUMA_NODES; socket++) { - if ((rte_malloc_get_socket_stats(socket, &sock_stats) < 0)) - continue; + for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + + malloc_heap_get_stats(heap, &sock_stats); - fprintf(f, "Socket:%u\n", socket); + fprintf(f, "Heap id:%u\n", heap_id); + fprintf(f, "\tHeap name:%s\n", heap->name); fprintf(f, "\tHeap_size:%zu,\n", sock_stats.heap_totalsz_bytes); fprintf(f, "\tFree_size:%zu,\n", sock_stats.heap_freesz_bytes); fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes); @@ -198,6 +280,7 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count); fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count); } + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); return; } @@ -223,7 +306,7 @@ rte_malloc_virt2iova(const void *addr) if 
(elem == NULL) return RTE_BAD_IOVA; - if (rte_eal_iova_mode() == RTE_IOVA_VA) + if (!elem->msl->external && rte_eal_iova_mode() == RTE_IOVA_VA) return (uintptr_t) addr; ms = rte_mem_virt2memseg(addr, elem->msl); @@ -235,3 +318,320 @@ rte_malloc_virt2iova(const void *addr) return ms->iova + RTE_PTR_DIFF(addr, ms->addr); } + +static struct malloc_heap * +find_named_heap(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (!strncmp(name, heap->name, RTE_HEAP_NAME_MAX_LEN)) + return heap; + } + return NULL; +} + +int +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + unsigned int n; + int ret; + + if (heap_name == NULL || va_addr == NULL || + page_sz == 0 || !rte_is_power_of_2(page_sz) || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + ret = -1; + goto unlock; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot add memory to internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + n = len / page_sz; + if (n != n_pages && iova_addrs != NULL) { + rte_errno = EINVAL; + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_add_external_memory(heap, va_addr, iova_addrs, n, + page_sz); + rte_spinlock_unlock(&heap->lock); + +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_memory_remove(const char *heap_name, void 
*va_addr, size_t len) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot remove memory from internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_remove_external_memory(heap, va_addr, len); + rte_spinlock_unlock(&heap->lock); + +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +struct sync_mem_walk_arg { + void *va_addr; + size_t len; + int result; + bool attach; +}; + +static int +sync_mem_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct sync_mem_walk_arg *wa = arg; + size_t len = msl->page_sz * msl->memseg_arr.len; + + if (msl->base_va == wa->va_addr && + len == wa->len) { + struct rte_memseg_list *found_msl; + int msl_idx, ret; + + /* msl is const */ + msl_idx = msl - mcfg->memsegs; + found_msl = &mcfg->memsegs[msl_idx]; + + if (wa->attach) { + ret = rte_fbarray_attach(&found_msl->memseg_arr); + } else { + /* notify all subscribers that a memory area is about to + * be removed + */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + msl->base_va, msl->len); + ret = rte_fbarray_detach(&found_msl->memseg_arr); + } + + if (ret < 0) { + wa->result = -rte_errno; + } else { + /* notify all subscribers that a new memory area was + * added + */ + if (wa->attach) + eal_memalloc_mem_event_notify( + RTE_MEM_EVENT_ALLOC, + msl->base_va, msl->len); + 
wa->result = 0; + } + return 1; + } + return 0; +} + +static int +sync_memory(const char *heap_name, void *va_addr, size_t len, bool attach) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + struct sync_mem_walk_arg wa; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to sync to internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + /* find corresponding memseg list to sync to */ + wa.va_addr = va_addr; + wa.len = len; + wa.result = -ENOENT; /* fail unless explicitly told to succeed */ + wa.attach = attach; + + /* we're already holding a read lock */ + rte_memseg_list_walk_thread_unsafe(sync_mem_walk, &wa); + + if (wa.result < 0) { + rte_errno = -wa.result; + ret = -1; + } else { + /* notify all subscribers that a new memory area was added */ + if (attach) + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + va_addr, len); + ret = 0; + } +unlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; +} + +int +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, true); +} + +int +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, false); +} + +int +rte_malloc_heap_create(const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int i, ret; + + if (heap_name == NULL || + strnlen(heap_name, 
RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + /* check if there is space in the heap list, or if heap with this name + * already exists. + */ + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[i]; + /* existing heap */ + if (strncmp(heap_name, tmp->name, + RTE_HEAP_NAME_MAX_LEN) == 0) { + RTE_LOG(ERR, EAL, "Heap %s already exists\n", + heap_name); + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + /* empty heap */ + if (strnlen(tmp->name, RTE_HEAP_NAME_MAX_LEN) == 0) { + heap = tmp; + break; + } + } + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Cannot create new heap: no space\n"); + rte_errno = ENOSPC; + ret = -1; + goto unlock; + } + + /* we're sure that we can create a new heap, so do it */ + ret = malloc_heap_create(heap, heap_name); +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} + +int +rte_malloc_heap_destroy(const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + /* start from non-socket heaps */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Heap %s not found\n", heap_name); + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to destroy internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + /* sanity checks done, now we can destroy the heap */ + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_destroy(heap); + + /* if we failed, lock is still active */ + if (ret < 0) + 
rte_spinlock_unlock(&heap->lock); +unlock: + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + + return ret; +} diff --git a/lib/librte_eal/common/rte_option.c b/lib/librte_eal/common/rte_option.c new file mode 100644 index 00000000..02d59a86 --- /dev/null +++ b/lib/librte_eal/common/rte_option.c @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation. + */ + +#include <unistd.h> +#include <string.h> + +#include <rte_eal.h> +#include <rte_option.h> + +#include "eal_private.h" + +TAILQ_HEAD(rte_option_list, rte_option); + +struct rte_option_list rte_option_list = + TAILQ_HEAD_INITIALIZER(rte_option_list); + +static struct rte_option *option; + +int +rte_option_parse(const char *opt) +{ + /* Check if the option is registered */ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (strcmp(opt, option->opt_str) == 0) { + option->enabled = 1; + return 0; + } + } + + return -1; +} + +void __rte_experimental +rte_option_register(struct rte_option *opt) +{ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (strcmp(opt->opt_str, option->opt_str) == 0) + RTE_LOG(INFO, EAL, "Option %s has already been registered.", + opt->opt_str); + return; + } + + TAILQ_INSERT_HEAD(&rte_option_list, opt, next); +} + +void +rte_option_init(void) +{ + TAILQ_FOREACH(option, &rte_option_list, next) { + if (option->enabled) + option->cb(); + } +} |