From 1bd9b61222f3a81ffe770fc00b70ded6e760c42b Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Fri, 1 Jun 2018 09:09:08 +0200 Subject: New upstream version 18.05 Change-Id: Icd4170ddc4f63aeae5d0559490e5195b5349f9c2 Signed-off-by: Christian Ehrhardt --- lib/librte_eal/common/Makefile | 2 +- lib/librte_eal/common/arch/arm/rte_cpuflags.c | 54 +- lib/librte_eal/common/arch/arm/rte_hypervisor.c | 2 +- lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c | 15 +- lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c | 2 +- lib/librte_eal/common/arch/x86/rte_hypervisor.c | 2 +- lib/librte_eal/common/eal_common_bus.c | 3 +- lib/librte_eal/common/eal_common_dev.c | 200 ++++- lib/librte_eal/common/eal_common_devargs.c | 53 +- lib/librte_eal/common/eal_common_fbarray.c | 879 ++++++++++++++++++++ lib/librte_eal/common/eal_common_hypervisor.c | 2 +- lib/librte_eal/common/eal_common_lcore.c | 75 +- lib/librte_eal/common/eal_common_log.c | 119 ++- lib/librte_eal/common/eal_common_memalloc.c | 364 +++++++++ lib/librte_eal/common/eal_common_memory.c | 892 ++++++++++++++++++++- lib/librte_eal/common/eal_common_memzone.c | 240 +++--- lib/librte_eal/common/eal_common_options.c | 114 ++- lib/librte_eal/common/eal_common_proc.c | 749 ++++++++++++++--- lib/librte_eal/common/eal_common_thread.c | 97 ++- lib/librte_eal/common/eal_filesystem.h | 62 +- lib/librte_eal/common/eal_hugepages.h | 11 +- lib/librte_eal/common/eal_internal_cfg.h | 14 +- lib/librte_eal/common/eal_memalloc.h | 82 ++ lib/librte_eal/common/eal_options.h | 4 + lib/librte_eal/common/eal_private.h | 53 ++ .../common/include/arch/arm/rte_atomic.h | 32 +- .../common/include/arch/arm/rte_atomic_32.h | 32 +- .../common/include/arch/arm/rte_byteorder.h | 32 +- .../common/include/arch/arm/rte_cpuflags.h | 32 +- .../common/include/arch/arm/rte_cpuflags_32.h | 32 +- .../common/include/arch/arm/rte_cycles.h | 32 +- .../common/include/arch/arm/rte_cycles_32.h | 32 +- .../common/include/arch/arm/rte_memcpy.h | 32 +- .../common/include/arch/arm/rte_memcpy_32.h | 32 +- .../common/include/arch/arm/rte_prefetch.h | 32 +- .../common/include/arch/arm/rte_prefetch_32.h | 32 +- .../common/include/arch/arm/rte_rwlock.h | 2 + .../common/include/arch/arm/rte_spinlock.h | 32 +- .../common/include/arch/ppc_64/rte_atomic.h | 23 +- .../common/include/arch/ppc_64/rte_rwlock.h | 2 + .../common/include/arch/x86/rte_atomic.h | 24 + .../common/include/arch/x86/rte_atomic_32.h | 12 + .../common/include/arch/x86/rte_atomic_64.h | 12 + .../common/include/arch/x86/rte_memcpy.h | 24 +- .../common/include/arch/x86/rte_spinlock.h | 4 +- lib/librte_eal/common/include/generic/rte_atomic.h | 90 +++ .../common/include/generic/rte_byteorder.h | 6 +- .../common/include/generic/rte_cpuflags.h | 21 + lib/librte_eal/common/include/generic/rte_rwlock.h | 4 +- lib/librte_eal/common/include/rte_bus.h | 2 +- lib/librte_eal/common/include/rte_common.h | 141 +++- lib/librte_eal/common/include/rte_dev.h | 108 ++- lib/librte_eal/common/include/rte_devargs.h | 126 ++- lib/librte_eal/common/include/rte_eal.h | 42 +- lib/librte_eal/common/include/rte_eal_interrupts.h | 1 + lib/librte_eal/common/include/rte_eal_memconfig.h | 28 +- lib/librte_eal/common/include/rte_fbarray.h | 356 ++++++++ lib/librte_eal/common/include/rte_hypervisor.h | 2 +- lib/librte_eal/common/include/rte_lcore.h | 60 +- lib/librte_eal/common/include/rte_log.h | 40 +- lib/librte_eal/common/include/rte_malloc.h | 10 + lib/librte_eal/common/include/rte_malloc_heap.h | 6 + lib/librte_eal/common/include/rte_memory.h | 276 ++++++- 
lib/librte_eal/common/include/rte_memzone.h | 33 +- .../common/include/rte_pci_dev_feature_defs.h | 58 +- .../common/include/rte_pci_dev_features.h | 58 +- lib/librte_eal/common/include/rte_random.h | 6 +- lib/librte_eal/common/include/rte_service.h | 117 +-- .../common/include/rte_service_component.h | 38 +- lib/librte_eal/common/include/rte_string_fns.h | 31 + lib/librte_eal/common/include/rte_version.h | 4 +- lib/librte_eal/common/include/rte_vfio.h | 261 +++++- lib/librte_eal/common/malloc_elem.c | 398 +++++++-- lib/librte_eal/common/malloc_elem.h | 45 +- lib/librte_eal/common/malloc_heap.c | 744 ++++++++++++++++- lib/librte_eal/common/malloc_heap.h | 15 +- lib/librte_eal/common/malloc_mp.c | 743 +++++++++++++++++ lib/librte_eal/common/malloc_mp.h | 86 ++ lib/librte_eal/common/meson.build | 4 + lib/librte_eal/common/rte_malloc.c | 85 +- lib/librte_eal/common/rte_service.c | 55 +- 81 files changed, 7342 insertions(+), 1335 deletions(-) create mode 100644 lib/librte_eal/common/eal_common_fbarray.c create mode 100644 lib/librte_eal/common/eal_common_memalloc.c create mode 100644 lib/librte_eal/common/eal_memalloc.h create mode 100644 lib/librte_eal/common/include/rte_fbarray.h create mode 100644 lib/librte_eal/common/malloc_mp.c create mode 100644 lib/librte_eal/common/malloc_mp.h (limited to 'lib/librte_eal/common') diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile index ea824a3a..48f870f2 100644 --- a/lib/librte_eal/common/Makefile +++ b/lib/librte_eal/common/Makefile @@ -16,7 +16,7 @@ INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h INC += rte_malloc.h rte_keepalive.h rte_time.h INC += rte_service.h rte_service_component.h INC += rte_bitmap.h rte_vfio.h rte_hypervisor.h rte_test.h -INC += rte_reciprocal.h +INC += rte_reciprocal.h rte_fbarray.h GENERIC_INC := rte_atomic.h rte_byteorder.h rte_cycles.h rte_prefetch.h GENERIC_INC += rte_spinlock.h rte_memcpy.h rte_cpuflags.h rte_rwlock.h diff --git a/lib/librte_eal/common/arch/arm/rte_cpuflags.c b/lib/librte_eal/common/arch/arm/rte_cpuflags.c index 88f1cbe3..caf3dc83 100644 --- a/lib/librte_eal/common/arch/arm/rte_cpuflags.c +++ b/lib/librte_eal/common/arch/arm/rte_cpuflags.c @@ -1,34 +1,6 @@ -/* - * BSD LICENSE - * - * Copyright (C) Cavium, Inc. 2015. - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Cavium, Inc nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) Cavium, Inc. 2015. + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #include "rte_cpuflags.h" @@ -133,22 +105,10 @@ const struct feature_entry rte_cpu_feature_table[] = { static void rte_cpu_get_features(hwcap_registers_t out) { - int auxv_fd; - _Elfx_auxv_t auxv; - - auxv_fd = open("/proc/self/auxv", O_RDONLY); - assert(auxv_fd != -1); - while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { - if (auxv.a_type == AT_HWCAP) { - out[REG_HWCAP] = auxv.a_un.a_val; - } else if (auxv.a_type == AT_HWCAP2) { - out[REG_HWCAP2] = auxv.a_un.a_val; - } else if (auxv.a_type == AT_PLATFORM) { - if (!strcmp((const char *)auxv.a_un.a_val, PLATFORM_STR)) - out[REG_PLATFORM] = 0x0001; - } - } - close(auxv_fd); + out[REG_HWCAP] = rte_cpu_getauxval(AT_HWCAP); + out[REG_HWCAP2] = rte_cpu_getauxval(AT_HWCAP2); + if (!rte_cpu_strcmp_auxval(AT_PLATFORM, PLATFORM_STR)) + out[REG_PLATFORM] = 0x0001; } /* diff --git a/lib/librte_eal/common/arch/arm/rte_hypervisor.c b/lib/librte_eal/common/arch/arm/rte_hypervisor.c index 3792fe2c..08a1c97d 100644 --- a/lib/librte_eal/common/arch/arm/rte_hypervisor.c +++ b/lib/librte_eal/common/arch/arm/rte_hypervisor.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2017 Mellanox Technologies, Ltd. + * Copyright 2017 Mellanox Technologies, Ltd */ #include "rte_hypervisor.h" diff --git a/lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c b/lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c index 970a61c5..e7a82452 100644 --- a/lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c +++ b/lib/librte_eal/common/arch/ppc_64/rte_cpuflags.c @@ -104,19 +104,8 @@ const struct feature_entry rte_cpu_feature_table[] = { static void rte_cpu_get_features(hwcap_registers_t out) { - int auxv_fd; - Elf64_auxv_t auxv; - - auxv_fd = open("/proc/self/auxv", O_RDONLY); - assert(auxv_fd != -1); - while (read(auxv_fd, &auxv, - sizeof(Elf64_auxv_t)) == sizeof(Elf64_auxv_t)) { - if (auxv.a_type == AT_HWCAP) - out[REG_HWCAP] = auxv.a_un.a_val; - else if (auxv.a_type == AT_HWCAP2) - out[REG_HWCAP2] = auxv.a_un.a_val; - } - close(auxv_fd); + out[REG_HWCAP] = rte_cpu_getauxval(AT_HWCAP); + out[REG_HWCAP2] = rte_cpu_getauxval(AT_HWCAP2); } /* diff --git a/lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c b/lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c index 3792fe2c..08a1c97d 100644 --- a/lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c +++ b/lib/librte_eal/common/arch/ppc_64/rte_hypervisor.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2017 Mellanox Technologies, Ltd. 
+ * Copyright 2017 Mellanox Technologies, Ltd */ #include "rte_hypervisor.h" diff --git a/lib/librte_eal/common/arch/x86/rte_hypervisor.c b/lib/librte_eal/common/arch/x86/rte_hypervisor.c index edf07be1..c38cfc09 100644 --- a/lib/librte_eal/common/arch/x86/rte_hypervisor.c +++ b/lib/librte_eal/common/arch/x86/rte_hypervisor.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2017 Mellanox Technologies, Ltd. + * Copyright 2017 Mellanox Technologies, Ltd */ #include "rte_hypervisor.h" diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c index 3e022d51..0943851c 100644 --- a/lib/librte_eal/common/eal_common_bus.c +++ b/lib/librte_eal/common/eal_common_bus.c @@ -36,6 +36,7 @@ #include #include +#include #include "eal_private.h" @@ -212,7 +213,7 @@ rte_bus_find_by_device_name(const char *str) char name[RTE_DEV_NAME_MAX_LEN]; char *c; - snprintf(name, sizeof(name), "%s", str); + strlcpy(name, str, sizeof(name)); c = strchr(name, ','); if (c != NULL) c[0] = '\0'; diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c index cd071442..61cb3b16 100644 --- a/lib/librte_eal/common/eal_common_dev.c +++ b/lib/librte_eal/common/eal_common_dev.c @@ -14,9 +14,34 @@ #include #include #include +#include +#include #include "eal_private.h" +/** + * The device event callback description. + * + * It contains callback address to be registered by user application, + * the pointer to the parameters for callback, and the device name. + */ +struct dev_event_callback { + TAILQ_ENTRY(dev_event_callback) next; /**< Callbacks list */ + rte_dev_event_cb_fn cb_fn; /**< Callback address */ + void *cb_arg; /**< Callback parameter */ + char *dev_name; /**< Callback device name, NULL is for all device */ + uint32_t active; /**< Callback is executing */ +}; + +/** @internal Structure to keep track of registered callbacks */ +TAILQ_HEAD(dev_event_cb_list, dev_event_callback); + +/* The device event callback list for all registered callbacks. 
*/ +static struct dev_event_cb_list dev_event_cbs; + +/* spinlock for device callbacks */ +static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER; + static int cmp_detached_dev_name(const struct rte_device *dev, const void *_name) { @@ -89,29 +114,12 @@ int rte_eal_dev_detach(struct rte_device *dev) return ret; } -static char * -full_dev_name(const char *bus, const char *dev, const char *args) -{ - char *name; - size_t len; - - len = snprintf(NULL, 0, "%s:%s,%s", bus, dev, args) + 1; - name = calloc(1, len); - if (name == NULL) { - RTE_LOG(ERR, EAL, "Could not allocate full device name\n"); - return NULL; - } - snprintf(name, len, "%s:%s,%s", bus, dev, args); - return name; -} - int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname, const char *devargs) { struct rte_bus *bus; struct rte_device *dev; struct rte_devargs *da; - char *name; int ret; bus = rte_bus_find_by_name(busname); @@ -126,21 +134,16 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn return -ENOTSUP; } - name = full_dev_name(busname, devname, devargs); - if (name == NULL) - return -ENOMEM; - da = calloc(1, sizeof(*da)); - if (da == NULL) { - ret = -ENOMEM; - goto err_name; - } + if (da == NULL) + return -ENOMEM; - ret = rte_eal_devargs_parse(name, da); + ret = rte_devargs_parse(da, "%s:%s,%s", + busname, devname, devargs); if (ret) goto err_devarg; - ret = rte_eal_devargs_insert(da); + ret = rte_devargs_insert(da); if (ret) goto err_devarg; @@ -162,16 +165,13 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn dev->name); goto err_devarg; } - free(name); return 0; err_devarg: - if (rte_eal_devargs_remove(busname, devname)) { + if (rte_devargs_remove(busname, devname)) { free(da->args); free(da); } -err_name: - free(name); return ret; } @@ -204,6 +204,142 @@ rte_eal_hotplug_remove(const char *busname, const char *devname) if (ret) RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n", dev->name); - rte_eal_devargs_remove(busname, devname); + rte_devargs_remove(busname, devname); return ret; } + +int __rte_experimental +rte_dev_event_callback_register(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg) +{ + struct dev_event_callback *event_cb; + int ret; + + if (!cb_fn) + return -EINVAL; + + rte_spinlock_lock(&dev_event_lock); + + if (TAILQ_EMPTY(&dev_event_cbs)) + TAILQ_INIT(&dev_event_cbs); + + TAILQ_FOREACH(event_cb, &dev_event_cbs, next) { + if (event_cb->cb_fn == cb_fn && event_cb->cb_arg == cb_arg) { + if (device_name == NULL && event_cb->dev_name == NULL) + break; + if (device_name == NULL || event_cb->dev_name == NULL) + continue; + if (!strcmp(event_cb->dev_name, device_name)) + break; + } + } + + /* create a new callback. 
*/ + if (event_cb == NULL) { + event_cb = malloc(sizeof(struct dev_event_callback)); + if (event_cb != NULL) { + event_cb->cb_fn = cb_fn; + event_cb->cb_arg = cb_arg; + event_cb->active = 0; + if (!device_name) { + event_cb->dev_name = NULL; + } else { + event_cb->dev_name = strdup(device_name); + if (event_cb->dev_name == NULL) { + ret = -ENOMEM; + goto error; + } + } + TAILQ_INSERT_TAIL(&dev_event_cbs, event_cb, next); + } else { + RTE_LOG(ERR, EAL, + "Failed to allocate memory for device " + "event callback."); + ret = -ENOMEM; + goto error; + } + } else { + RTE_LOG(ERR, EAL, + "The callback is already exist, no need " + "to register again.\n"); + ret = -EEXIST; + } + + rte_spinlock_unlock(&dev_event_lock); + return 0; +error: + free(event_cb); + rte_spinlock_unlock(&dev_event_lock); + return ret; +} + +int __rte_experimental +rte_dev_event_callback_unregister(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg) +{ + int ret = 0; + struct dev_event_callback *event_cb, *next; + + if (!cb_fn) + return -EINVAL; + + rte_spinlock_lock(&dev_event_lock); + /*walk through the callbacks and remove all that match. */ + for (event_cb = TAILQ_FIRST(&dev_event_cbs); event_cb != NULL; + event_cb = next) { + + next = TAILQ_NEXT(event_cb, next); + + if (device_name != NULL && event_cb->dev_name != NULL) { + if (!strcmp(event_cb->dev_name, device_name)) { + if (event_cb->cb_fn != cb_fn || + (cb_arg != (void *)-1 && + event_cb->cb_arg != cb_arg)) + continue; + } + } else if (device_name != NULL) { + continue; + } + + /* + * if this callback is not executing right now, + * then remove it. + */ + if (event_cb->active == 0) { + TAILQ_REMOVE(&dev_event_cbs, event_cb, next); + free(event_cb); + ret++; + } else { + continue; + } + } + rte_spinlock_unlock(&dev_event_lock); + return ret; +} + +void +dev_callback_process(char *device_name, enum rte_dev_event_type event) +{ + struct dev_event_callback *cb_lst; + + if (device_name == NULL) + return; + + rte_spinlock_lock(&dev_event_lock); + + TAILQ_FOREACH(cb_lst, &dev_event_cbs, next) { + if (cb_lst->dev_name) { + if (strcmp(cb_lst->dev_name, device_name)) + continue; + } + cb_lst->active = 1; + rte_spinlock_unlock(&dev_event_lock); + cb_lst->cb_fn(device_name, event, + cb_lst->cb_arg); + rte_spinlock_lock(&dev_event_lock); + cb_lst->active = 0; + } + rte_spinlock_unlock(&dev_event_lock); +} diff --git a/lib/librte_eal/common/eal_common_devargs.c b/lib/librte_eal/common/eal_common_devargs.c index 810b3e18..b0434158 100644 --- a/lib/librte_eal/common/eal_common_devargs.c +++ b/lib/librte_eal/common/eal_common_devargs.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -18,6 +19,9 @@ #include #include "eal_private.h" +/** user device double-linked queue type definition */ +TAILQ_HEAD(rte_devargs_list, rte_devargs); + /** Global list of user devices */ struct rte_devargs_list devargs_list = TAILQ_HEAD_INITIALIZER(devargs_list); @@ -59,15 +63,23 @@ bus_name_cmp(const struct rte_bus *bus, const void *name) } int __rte_experimental -rte_eal_devargs_parse(const char *dev, struct rte_devargs *da) +rte_devargs_parse(struct rte_devargs *da, const char *format, ...) 
{ struct rte_bus *bus = NULL; + va_list ap; + va_start(ap, format); + char dev[vsnprintf(NULL, 0, format, ap) + 1]; const char *devname; const size_t maxlen = sizeof(da->name); size_t i; - if (dev == NULL || da == NULL) + va_end(ap); + if (da == NULL) return -EINVAL; + + va_start(ap, format); + vsnprintf(dev, sizeof(dev), format, ap); + va_end(ap); /* Retrieve eventual bus info */ do { devname = dev; @@ -113,11 +125,11 @@ rte_eal_devargs_parse(const char *dev, struct rte_devargs *da) } int __rte_experimental -rte_eal_devargs_insert(struct rte_devargs *da) +rte_devargs_insert(struct rte_devargs *da) { int ret; - ret = rte_eal_devargs_remove(da->bus->name, da->name); + ret = rte_devargs_remove(da->bus->name, da->name); if (ret < 0) return ret; TAILQ_INSERT_TAIL(&devargs_list, da, next); @@ -125,8 +137,9 @@ rte_eal_devargs_insert(struct rte_devargs *da) } /* store a whitelist parameter for later parsing */ +__rte_experimental int -rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str) +rte_devargs_add(enum rte_devtype devtype, const char *devargs_str) { struct rte_devargs *devargs = NULL; struct rte_bus *bus = NULL; @@ -137,7 +150,7 @@ rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str) if (devargs == NULL) goto fail; - if (rte_eal_devargs_parse(dev, devargs)) + if (rte_devargs_parse(devargs, "%s", dev)) goto fail; devargs->type = devtype; bus = devargs->bus; @@ -162,7 +175,7 @@ fail: } int __rte_experimental -rte_eal_devargs_remove(const char *busname, const char *devname) +rte_devargs_remove(const char *busname, const char *devname) { struct rte_devargs *d; void *tmp; @@ -180,8 +193,9 @@ rte_eal_devargs_remove(const char *busname, const char *devname) } /* count the number of devices of a specified type */ +__rte_experimental unsigned int -rte_eal_devargs_type_count(enum rte_devtype devtype) +rte_devargs_type_count(enum rte_devtype devtype) { struct rte_devargs *devargs; unsigned int count = 0; @@ -195,8 +209,9 @@ rte_eal_devargs_type_count(enum rte_devtype devtype) } /* dump the user devices on the console */ +__rte_experimental void -rte_eal_devargs_dump(FILE *f) +rte_devargs_dump(FILE *f) { struct rte_devargs *devargs; @@ -207,3 +222,23 @@ rte_eal_devargs_dump(FILE *f) devargs->name, devargs->args); } } + +/* bus-aware rte_devargs iterator. 
*/ +__rte_experimental +struct rte_devargs * +rte_devargs_next(const char *busname, const struct rte_devargs *start) +{ + struct rte_devargs *da; + + if (start != NULL) + da = TAILQ_NEXT(start, next); + else + da = TAILQ_FIRST(&devargs_list); + while (da != NULL) { + if (busname == NULL || + (strcmp(busname, da->bus->name) == 0)) + return da; + da = TAILQ_NEXT(da, next); + } + return NULL; +} diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c new file mode 100644 index 00000000..019f84c1 --- /dev/null +++ b/lib/librte_eal/common/eal_common_fbarray.c @@ -0,0 +1,879 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_private.h" + +#include "rte_fbarray.h" + +#define MASK_SHIFT 6ULL +#define MASK_ALIGN (1ULL << MASK_SHIFT) +#define MASK_LEN_TO_IDX(x) ((x) >> MASK_SHIFT) +#define MASK_LEN_TO_MOD(x) ((x) - RTE_ALIGN_FLOOR(x, MASK_ALIGN)) +#define MASK_GET_IDX(idx, mod) ((idx << MASK_SHIFT) + mod) + +/* + * This is a mask that is always stored at the end of array, to provide fast + * way of finding free/used spots without looping through each element. + */ + +struct used_mask { + unsigned int n_masks; + uint64_t data[]; +}; + +static size_t +calc_mask_size(unsigned int len) +{ + /* mask must be multiple of MASK_ALIGN, even though length of array + * itself may not be aligned on that boundary. + */ + len = RTE_ALIGN_CEIL(len, MASK_ALIGN); + return sizeof(struct used_mask) + + sizeof(uint64_t) * MASK_LEN_TO_IDX(len); +} + +static size_t +calc_data_size(size_t page_sz, unsigned int elt_sz, unsigned int len) +{ + size_t data_sz = elt_sz * len; + size_t msk_sz = calc_mask_size(len); + return RTE_ALIGN_CEIL(data_sz + msk_sz, page_sz); +} + +static struct used_mask * +get_used_mask(void *data, unsigned int elt_sz, unsigned int len) +{ + return (struct used_mask *) RTE_PTR_ADD(data, elt_sz * len); +} + +static int +resize_and_map(int fd, void *addr, size_t len) +{ + char path[PATH_MAX]; + void *map_addr; + + if (ftruncate(fd, len)) { + RTE_LOG(ERR, EAL, "Cannot truncate %s\n", path); + /* pass errno up the chain */ + rte_errno = errno; + return -1; + } + + map_addr = mmap(addr, len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (map_addr != addr) { + RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno)); + /* pass errno up the chain */ + rte_errno = errno; + return -1; + } + return 0; +} + +static int +find_next_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int msk_idx, lookahead_idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk, ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + ignore_msk = ~((1ULL << first_mod) - 1); + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. 
+ */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-1ULL << last_mod); + + for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) { + uint64_t cur_msk, lookahead_msk; + unsigned int run_start, clz, left; + bool found = false; + /* + * The process of getting n consecutive bits for arbitrary n is + * a bit involved, but here it is in a nutshell: + * + * 1. let n be the number of consecutive bits we're looking for + * 2. check if n can fit in one mask, and if so, do n-1 + * rshift-ands to see if there is an appropriate run inside + * our current mask + * 2a. if we found a run, bail out early + * 2b. if we didn't find a run, proceed + * 3. invert the mask and count leading zeroes (that is, count + * how many consecutive set bits we had starting from the + * end of current mask) as k + * 3a. if k is 0, continue to next mask + * 3b. if k is not 0, we have a potential run + * 4. to satisfy our requirements, next mask must have n-k + * consecutive set bits right at the start, so we will do + * (n-k-1) rshift-ands and check if first bit is set. + * + * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until + * we either run out of masks, lose the run, or find what we + * were looking for. + */ + cur_msk = msk->data[msk_idx]; + left = n; + + /* if we're looking for free spaces, invert the mask */ + if (!used) + cur_msk = ~cur_msk; + + /* combine current ignore mask with last index ignore mask */ + if (msk_idx == last) + ignore_msk |= last_msk; + + /* if we have an ignore mask, ignore once */ + if (ignore_msk) { + cur_msk &= ignore_msk; + ignore_msk = 0; + } + + /* if n can fit in within a single mask, do a search */ + if (n <= MASK_ALIGN) { + uint64_t tmp_msk = cur_msk; + unsigned int s_idx; + for (s_idx = 0; s_idx < n - 1; s_idx++) + tmp_msk &= tmp_msk >> 1ULL; + /* we found what we were looking for */ + if (tmp_msk != 0) { + run_start = __builtin_ctzll(tmp_msk); + return MASK_GET_IDX(msk_idx, run_start); + } + } + + /* + * we didn't find our run within the mask, or n > MASK_ALIGN, + * so we're going for plan B. + */ + + /* count leading zeroes on inverted mask */ + if (~cur_msk == 0) + clz = sizeof(cur_msk) * 8; + else + clz = __builtin_clzll(~cur_msk); + + /* if there aren't any runs at the end either, just continue */ + if (clz == 0) + continue; + + /* we have a partial run at the end, so try looking ahead */ + run_start = MASK_ALIGN - clz; + left -= clz; + + for (lookahead_idx = msk_idx + 1; lookahead_idx < msk->n_masks; + lookahead_idx++) { + unsigned int s_idx, need; + lookahead_msk = msk->data[lookahead_idx]; + + /* if we're looking for free space, invert the mask */ + if (!used) + lookahead_msk = ~lookahead_msk; + + /* figure out how many consecutive bits we need here */ + need = RTE_MIN(left, MASK_ALIGN); + + for (s_idx = 0; s_idx < need - 1; s_idx++) + lookahead_msk &= lookahead_msk >> 1ULL; + + /* if first bit is not set, we've lost the run */ + if ((lookahead_msk & 1) == 0) { + /* + * we've scanned this far, so we know there are + * no runs in the space we've lookahead-scanned + * as well, so skip that on next iteration. + */ + ignore_msk = ~((1ULL << need) - 1); + msk_idx = lookahead_idx; + break; + } + + left -= need; + + /* check if we've found what we were looking for */ + if (left == 0) { + found = true; + break; + } + } + + /* we didn't find anything, so continue */ + if (!found) + continue; + + return MASK_GET_IDX(msk_idx, run_start); + } + /* we didn't find anything */ + rte_errno = used ? 
-ENOENT : -ENOSPC; + return -1; +} + +static int +find_next(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk, ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + ignore_msk = ~((1ULL << first_mod) - 1ULL); + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. + */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-(1ULL) << last_mod); + + for (idx = first; idx < msk->n_masks; idx++) { + uint64_t cur = msk->data[idx]; + int found; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + if (idx == last) + cur &= last_msk; + + /* ignore everything before start on first iteration */ + if (idx == first) + cur &= ignore_msk; + + /* check if we have any entries */ + if (cur == 0) + continue; + + /* + * find first set bit - that will correspond to whatever it is + * that we're looking for. + */ + found = __builtin_ctzll(cur); + return MASK_GET_IDX(idx, found); + } + /* we didn't find anything */ + rte_errno = used ? -ENOENT : -ENOSPC; + return -1; +} + +static int +find_contig(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk; + unsigned int need_len, result = 0; + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. + */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-(1ULL) << last_mod); + + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + for (idx = first; idx < msk->n_masks; idx++, result += need_len) { + uint64_t cur = msk->data[idx]; + unsigned int run_len; + + need_len = MASK_ALIGN; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* if this is last mask, ignore everything after last bit */ + if (idx == last) + cur &= last_msk; + + /* ignore everything before start on first iteration */ + if (idx == first) { + cur >>= first_mod; + /* at the start, we don't need the full mask len */ + need_len -= first_mod; + } + + /* we will be looking for zeroes, so invert the mask */ + cur = ~cur; + + /* if mask is zero, we have a complete run */ + if (cur == 0) + continue; + + /* + * see if current run ends before mask end. 
+ */ + run_len = __builtin_ctzll(cur); + + /* add however many zeroes we've had in the last run and quit */ + if (run_len < need_len) { + result += run_len; + break; + } + } + return result; +} + +static int +set_used(struct rte_fbarray *arr, unsigned int idx, bool used) +{ + struct used_mask *msk; + uint64_t msk_bit = 1ULL << MASK_LEN_TO_MOD(idx); + unsigned int msk_idx = MASK_LEN_TO_IDX(idx); + bool already_used; + int ret = -1; + + if (arr == NULL || idx >= arr->len) { + rte_errno = EINVAL; + return -1; + } + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + ret = 0; + + /* prevent array from changing under us */ + rte_rwlock_write_lock(&arr->rwlock); + + already_used = (msk->data[msk_idx] & msk_bit) != 0; + + /* nothing to be done */ + if (used == already_used) + goto out; + + if (used) { + msk->data[msk_idx] |= msk_bit; + arr->count++; + } else { + msk->data[msk_idx] &= ~msk_bit; + arr->count--; + } +out: + rte_rwlock_write_unlock(&arr->rwlock); + + return ret; +} + +static int +fully_validate(const char *name, unsigned int elt_sz, unsigned int len) +{ + if (name == NULL || elt_sz == 0 || len == 0 || len > INT_MAX) { + rte_errno = EINVAL; + return -1; + } + + if (strnlen(name, RTE_FBARRAY_NAME_LEN) == RTE_FBARRAY_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +int __rte_experimental +rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len, + unsigned int elt_sz) +{ + size_t page_sz, mmap_len; + char path[PATH_MAX]; + struct used_mask *msk; + void *data = NULL; + int fd = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + if (fully_validate(name, elt_sz, len)) + return -1; + + page_sz = sysconf(_SC_PAGESIZE); + if (page_sz == (size_t)-1) + goto fail; + + /* calculate our memory limits */ + mmap_len = calc_data_size(page_sz, elt_sz, len); + + data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0); + if (data == NULL) + goto fail; + + eal_get_fbarray_path(path, sizeof(path), name); + + /* + * Each fbarray is unique to process namespace, i.e. the filename + * depends on process prefix. Try to take out a lock and see if we + * succeed. If we don't, someone else is using it already. + */ + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", __func__, + path, strerror(errno)); + rte_errno = errno; + goto fail; + } else if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", __func__, + path, strerror(errno)); + rte_errno = EBUSY; + goto fail; + } + + /* take out a non-exclusive lock, so that other processes could still + * attach to it, but no other process could reinitialize it. 
+ */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } + + if (resize_and_map(fd, data, mmap_len)) + goto fail; + + /* we've mmap'ed the file, we can now close the fd */ + close(fd); + + /* initialize the data */ + memset(data, 0, mmap_len); + + /* populate data structure */ + strlcpy(arr->name, name, sizeof(arr->name)); + arr->data = data; + arr->len = len; + arr->elt_sz = elt_sz; + arr->count = 0; + + msk = get_used_mask(data, elt_sz, len); + msk->n_masks = MASK_LEN_TO_IDX(RTE_ALIGN_CEIL(len, MASK_ALIGN)); + + rte_rwlock_init(&arr->rwlock); + + return 0; +fail: + if (data) + munmap(data, mmap_len); + if (fd >= 0) + close(fd); + return -1; +} + +int __rte_experimental +rte_fbarray_attach(struct rte_fbarray *arr) +{ + size_t page_sz, mmap_len; + char path[PATH_MAX]; + void *data = NULL; + int fd = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize attach as two values we need (element + * size and array length) are constant for the duration of life of + * the array, so the parts we care about will not race. + */ + + if (fully_validate(arr->name, arr->elt_sz, arr->len)) + return -1; + + page_sz = sysconf(_SC_PAGESIZE); + if (page_sz == (size_t)-1) + goto fail; + + mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len); + + data = eal_get_virtual_area(arr->data, &mmap_len, page_sz, 0, 0); + if (data == NULL) + goto fail; + + eal_get_fbarray_path(path, sizeof(path), arr->name); + + fd = open(path, O_RDWR); + if (fd < 0) { + rte_errno = errno; + goto fail; + } + + /* lock the file, to let others know we're using it */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } + + if (resize_and_map(fd, data, mmap_len)) + goto fail; + + close(fd); + + /* we're done */ + + return 0; +fail: + if (data) + munmap(data, mmap_len); + if (fd >= 0) + close(fd); + return -1; +} + +int __rte_experimental +rte_fbarray_detach(struct rte_fbarray *arr) +{ + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize detach as two values we need (element + * size and total capacity) are constant for the duration of life of + * the array, so the parts we care about will not race. if the user is + * detaching while doing something else in the same process, we can't + * really do anything about it, things will blow up either way. + */ + + size_t page_sz = sysconf(_SC_PAGESIZE); + + if (page_sz == (size_t)-1) + return -1; + + /* this may already be unmapped (e.g. repeated call from previously + * failed destroy(), but this is on user, we can't (easily) know if this + * is still mapped. 
+ */ + munmap(arr->data, calc_data_size(page_sz, arr->elt_sz, arr->len)); + + return 0; +} + +int __rte_experimental +rte_fbarray_destroy(struct rte_fbarray *arr) +{ + int fd, ret; + char path[PATH_MAX]; + + ret = rte_fbarray_detach(arr); + if (ret) + return ret; + + /* try deleting the file */ + eal_get_fbarray_path(path, sizeof(path), arr->name); + + fd = open(path, O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open fbarray file: %s\n", + strerror(errno)); + return -1; + } + if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "Cannot destroy fbarray - another process is using it\n"); + rte_errno = EBUSY; + ret = -1; + } else { + ret = 0; + unlink(path); + memset(arr, 0, sizeof(*arr)); + } + close(fd); + + return ret; +} + +void * __rte_experimental +rte_fbarray_get(const struct rte_fbarray *arr, unsigned int idx) +{ + void *ret = NULL; + if (arr == NULL) { + rte_errno = EINVAL; + return NULL; + } + + if (idx >= arr->len) { + rte_errno = EINVAL; + return NULL; + } + + ret = RTE_PTR_ADD(arr->data, idx * arr->elt_sz); + + return ret; +} + +int __rte_experimental +rte_fbarray_set_used(struct rte_fbarray *arr, unsigned int idx) +{ + return set_used(arr, idx, true); +} + +int __rte_experimental +rte_fbarray_set_free(struct rte_fbarray *arr, unsigned int idx) +{ + return set_used(arr, idx, false); +} + +int __rte_experimental +rte_fbarray_is_used(struct rte_fbarray *arr, unsigned int idx) +{ + struct used_mask *msk; + int msk_idx; + uint64_t msk_bit; + int ret = -1; + + if (arr == NULL || idx >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + msk_idx = MASK_LEN_TO_IDX(idx); + msk_bit = 1ULL << MASK_LEN_TO_MOD(idx); + + ret = (msk->data[msk_idx] & msk_bit) != 0; + + rte_rwlock_read_unlock(&arr->rwlock); + + return ret; +} + +int __rte_experimental +rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + if (arr->len == arr->count) { + rte_errno = ENOSPC; + goto out; + } + + ret = find_next(arr, start, false); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_next_used(struct rte_fbarray *arr, unsigned int start) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + if (arr->count == 0) { + rte_errno = ENOENT; + goto out; + } + + ret = find_next(arr, start, true); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_next_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len || n > arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + if (arr->len == arr->count || arr->len - arr->count < n) { + rte_errno = ENOSPC; + goto out; + } + + ret = find_next_n(arr, start, n, false); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_next_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len || n > arr->len) 
{ + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + if (arr->count < n) { + rte_errno = ENOENT; + goto out; + } + + ret = find_next_n(arr, start, n, true); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_contig_free(struct rte_fbarray *arr, unsigned int start) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + if (arr->len == arr->count) { + rte_errno = ENOSPC; + goto out; + } + + if (arr->count == 0) { + ret = arr->len - start; + goto out; + } + + ret = find_contig(arr, start, false); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + ret = find_contig(arr, start, true); + + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int __rte_experimental +rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt) +{ + void *end; + int ret = -1; + + /* + * no need to synchronize as it doesn't matter if underlying data + * changes - we're doing pointer arithmetic here. + */ + + if (arr == NULL || elt == NULL) { + rte_errno = EINVAL; + return -1; + } + end = RTE_PTR_ADD(arr->data, arr->elt_sz * arr->len); + if (elt < arr->data || elt >= end) { + rte_errno = EINVAL; + return -1; + } + + ret = RTE_PTR_DIFF(elt, arr->data) / arr->elt_sz; + + return ret; +} + +void __rte_experimental +rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f) +{ + struct used_mask *msk; + unsigned int i; + + if (arr == NULL || f == NULL) { + rte_errno = EINVAL; + return; + } + + if (fully_validate(arr->name, arr->elt_sz, arr->len)) { + fprintf(f, "Invalid file-backed array\n"); + goto out; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + fprintf(f, "File-backed array: %s\n", arr->name); + fprintf(f, "size: %i occupied: %i elt_sz: %i\n", + arr->len, arr->count, arr->elt_sz); + + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + + for (i = 0; i < msk->n_masks; i++) + fprintf(f, "msk idx %i: 0x%016" PRIx64 "\n", i, msk->data[i]); +out: + rte_rwlock_read_unlock(&arr->rwlock); +} diff --git a/lib/librte_eal/common/eal_common_hypervisor.c b/lib/librte_eal/common/eal_common_hypervisor.c index c3b4c621..5388b81a 100644 --- a/lib/librte_eal/common/eal_common_hypervisor.c +++ b/lib/librte_eal/common/eal_common_hypervisor.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2017 Mellanox Technologies, Ltd. 
+ * Copyright 2017 Mellanox Technologies, Ltd */ #include "rte_hypervisor.h" diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c index 7724fa43..3167e9d7 100644 --- a/lib/librte_eal/common/eal_common_lcore.c +++ b/lib/librte_eal/common/eal_common_lcore.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -16,6 +17,19 @@ #include "eal_private.h" #include "eal_thread.h" +static int +socket_id_cmp(const void *a, const void *b) +{ + const int *lcore_id_a = a; + const int *lcore_id_b = b; + + if (*lcore_id_a < *lcore_id_b) + return -1; + if (*lcore_id_a > *lcore_id_b) + return 1; + return 0; +} + /* * Parse /sys/devices/system/cpu to get the number of physical and logical * processors on the machine. The function will fill the cpu_info @@ -28,6 +42,8 @@ rte_eal_cpu_init(void) struct rte_config *config = rte_eal_get_configuration(); unsigned lcore_id; unsigned count = 0; + unsigned int socket_id, prev_socket_id; + int lcore_to_socket_id[RTE_MAX_LCORE]; /* * Parse the maximum set of logical cores, detect the subset of running @@ -39,6 +55,19 @@ rte_eal_cpu_init(void) /* init cpuset for per lcore config */ CPU_ZERO(&lcore_config[lcore_id].cpuset); + /* find socket first */ + socket_id = eal_cpu_socket_id(lcore_id); + if (socket_id >= RTE_MAX_NUMA_NODES) { +#ifdef RTE_EAL_ALLOW_INV_SOCKET_ID + socket_id = 0; +#else + RTE_LOG(ERR, EAL, "Socket ID (%u) is greater than RTE_MAX_NUMA_NODES (%d)\n", + socket_id, RTE_MAX_NUMA_NODES); + return -1; +#endif + } + lcore_to_socket_id[lcore_id] = socket_id; + /* in 1:1 mapping, record related cpu detected state */ lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id); if (lcore_config[lcore_id].detected == 0) { @@ -54,18 +83,7 @@ rte_eal_cpu_init(void) config->lcore_role[lcore_id] = ROLE_RTE; lcore_config[lcore_id].core_role = ROLE_RTE; lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id); - lcore_config[lcore_id].socket_id = eal_cpu_socket_id(lcore_id); - if (lcore_config[lcore_id].socket_id >= RTE_MAX_NUMA_NODES) { -#ifdef RTE_EAL_ALLOW_INV_SOCKET_ID - lcore_config[lcore_id].socket_id = 0; -#else - RTE_LOG(ERR, EAL, "Socket ID (%u) is greater than " - "RTE_MAX_NUMA_NODES (%d)\n", - lcore_config[lcore_id].socket_id, - RTE_MAX_NUMA_NODES); - return -1; -#endif - } + lcore_config[lcore_id].socket_id = socket_id; RTE_LOG(DEBUG, EAL, "Detected lcore %u as " "core %u on socket %u\n", lcore_id, lcore_config[lcore_id].core_id, @@ -79,5 +97,38 @@ rte_eal_cpu_init(void) RTE_MAX_LCORE); RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count); + /* sort all socket id's in ascending order */ + qsort(lcore_to_socket_id, RTE_DIM(lcore_to_socket_id), + sizeof(lcore_to_socket_id[0]), socket_id_cmp); + + prev_socket_id = -1; + config->numa_node_count = 0; + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + socket_id = lcore_to_socket_id[lcore_id]; + if (socket_id != prev_socket_id) + config->numa_nodes[config->numa_node_count++] = + socket_id; + prev_socket_id = socket_id; + } + RTE_LOG(INFO, EAL, "Detected %u NUMA nodes\n", config->numa_node_count); + return 0; } + +unsigned int __rte_experimental +rte_socket_count(void) +{ + const struct rte_config *config = rte_eal_get_configuration(); + return config->numa_node_count; +} + +int __rte_experimental +rte_socket_id_by_idx(unsigned int idx) +{ + const struct rte_config *config = rte_eal_get_configuration(); + if (idx >= config->numa_node_count) { + rte_errno = EINVAL; + return -1; + } + return config->numa_nodes[idx]; +} diff 
--git a/lib/librte_eal/common/eal_common_log.c b/lib/librte_eal/common/eal_common_log.c index 37b2e20e..81811894 100644 --- a/lib/librte_eal/common/eal_common_log.c +++ b/lib/librte_eal/common/eal_common_log.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,23 @@ struct rte_logs rte_logs = { .file = NULL, }; +struct rte_eal_opt_loglevel { + /** Next list entry */ + TAILQ_ENTRY(rte_eal_opt_loglevel) next; + /** Compiled regular expression obtained from the option */ + regex_t re_match; + /** Glob match string option */ + char *pattern; + /** Log level value obtained from the option */ + uint32_t level; +}; + +TAILQ_HEAD(rte_eal_opt_loglevel_list, rte_eal_opt_loglevel); + +/** List of valid EAL log level options */ +static struct rte_eal_opt_loglevel_list opt_loglevel_list = + TAILQ_HEAD_INITIALIZER(opt_loglevel_list); + /* Stream to use for logging if rte_logs.file is NULL */ static FILE *default_log_stream; @@ -89,9 +107,9 @@ rte_log_set_level(uint32_t type, uint32_t level) return 0; } -/* set level */ +/* set log level by regular expression */ int -rte_log_set_level_regexp(const char *pattern, uint32_t level) +rte_log_set_level_regexp(const char *regex, uint32_t level) { regex_t r; size_t i; @@ -99,7 +117,7 @@ rte_log_set_level_regexp(const char *pattern, uint32_t level) if (level > RTE_LOG_DEBUG) return -1; - if (regcomp(&r, pattern, 0) != 0) + if (regcomp(&r, regex, 0) != 0) return -1; for (i = 0; i < rte_logs.dynamic_types_len; i++) { @@ -115,6 +133,69 @@ rte_log_set_level_regexp(const char *pattern, uint32_t level) return 0; } +/* + * Save the type string and the loglevel for later dynamic + * logtypes which may register later. + */ +static int rte_log_save_level(int priority, + const char *regex, const char *pattern) +{ + struct rte_eal_opt_loglevel *opt_ll = NULL; + + opt_ll = malloc(sizeof(*opt_ll)); + if (opt_ll == NULL) + goto fail; + + opt_ll->level = priority; + + if (regex) { + opt_ll->pattern = NULL; + if (regcomp(&opt_ll->re_match, regex, 0) != 0) + goto fail; + } else if (pattern) { + opt_ll->pattern = strdup(pattern); + if (opt_ll->pattern == NULL) + goto fail; + } else + goto fail; + + TAILQ_INSERT_HEAD(&opt_loglevel_list, opt_ll, next); + return 0; +fail: + free(opt_ll); + return -1; +} + +int rte_log_save_regexp(const char *regex, int tmp) +{ + return rte_log_save_level(tmp, regex, NULL); +} + +/* set log level based on glob (file match) pattern */ +int +rte_log_set_level_pattern(const char *pattern, uint32_t level) +{ + size_t i; + + if (level > RTE_LOG_DEBUG) + return -1; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + + if (fnmatch(pattern, rte_logs.dynamic_types[i].name, 0) == 0) + rte_logs.dynamic_types[i].loglevel = level; + } + + return 0; +} + +int rte_log_save_pattern(const char *pattern, int priority) +{ + return rte_log_save_level(priority, NULL, pattern); +} + /* get the current loglevel for the message being processed */ int rte_log_cur_msg_loglevel(void) { @@ -186,6 +267,36 @@ rte_log_register(const char *name) return ret; } +/* Register an extended log type and try to pick its level from EAL options */ +int __rte_experimental +rte_log_register_type_and_pick_level(const char *name, uint32_t level_def) +{ + struct rte_eal_opt_loglevel *opt_ll; + uint32_t level = level_def; + int type; + + type = rte_log_register(name); + if (type < 0) + return type; + + TAILQ_FOREACH(opt_ll, &opt_loglevel_list, next) { + if (opt_ll->level > RTE_LOG_DEBUG) + continue; + + 
if (opt_ll->pattern) { + if (fnmatch(opt_ll->pattern, name, 0)) + level = opt_ll->level; + } else { + if (regexec(&opt_ll->re_match, name, 0, NULL, 0) == 0) + level = opt_ll->level; + } + } + + rte_logs.dynamic_types[type].loglevel = level; + + return type; +} + struct logtype { uint32_t log_id; const char *logtype; @@ -224,7 +335,7 @@ static const struct logtype logtype_strings[] = { }; /* Logging should be first initializer (before drivers and bus) */ -RTE_INIT_PRIO(rte_log_init, 101); +RTE_INIT_PRIO(rte_log_init, LOG); static void rte_log_init(void) { diff --git a/lib/librte_eal/common/eal_common_memalloc.c b/lib/librte_eal/common/eal_common_memalloc.c new file mode 100644 index 00000000..1d41ea11 --- /dev/null +++ b/lib/librte_eal/common/eal_common_memalloc.c @@ -0,0 +1,364 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" + +struct mem_event_callback_entry { + TAILQ_ENTRY(mem_event_callback_entry) next; + char name[RTE_MEM_EVENT_CALLBACK_NAME_LEN]; + rte_mem_event_callback_t clb; + void *arg; +}; + +struct mem_alloc_validator_entry { + TAILQ_ENTRY(mem_alloc_validator_entry) next; + char name[RTE_MEM_ALLOC_VALIDATOR_NAME_LEN]; + rte_mem_alloc_validator_t clb; + int socket_id; + size_t limit; +}; + +/** Double linked list of actions. */ +TAILQ_HEAD(mem_event_callback_entry_list, mem_event_callback_entry); +TAILQ_HEAD(mem_alloc_validator_entry_list, mem_alloc_validator_entry); + +static struct mem_event_callback_entry_list mem_event_callback_list = + TAILQ_HEAD_INITIALIZER(mem_event_callback_list); +static rte_rwlock_t mem_event_rwlock = RTE_RWLOCK_INITIALIZER; + +static struct mem_alloc_validator_entry_list mem_alloc_validator_list = + TAILQ_HEAD_INITIALIZER(mem_alloc_validator_list); +static rte_rwlock_t mem_alloc_validator_rwlock = RTE_RWLOCK_INITIALIZER; + +static struct mem_event_callback_entry * +find_mem_event_callback(const char *name, void *arg) +{ + struct mem_event_callback_entry *r; + + TAILQ_FOREACH(r, &mem_event_callback_list, next) { + if (!strcmp(r->name, name) && r->arg == arg) + break; + } + return r; +} + +static struct mem_alloc_validator_entry * +find_mem_alloc_validator(const char *name, int socket_id) +{ + struct mem_alloc_validator_entry *r; + + TAILQ_FOREACH(r, &mem_alloc_validator_list, next) { + if (!strcmp(r->name, name) && r->socket_id == socket_id) + break; + } + return r; +} + +bool +eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start, + size_t len) +{ + void *end, *aligned_start, *aligned_end; + size_t pgsz = (size_t)msl->page_sz; + const struct rte_memseg *ms; + + /* for IOVA_VA, it's always contiguous */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + return true; + + /* for legacy memory, it's always contiguous */ + if (internal_config.legacy_mem) + return true; + + end = RTE_PTR_ADD(start, len); + + /* for nohuge, we check pagemap, otherwise check memseg */ + if (!rte_eal_has_hugepages()) { + rte_iova_t cur, expected; + + aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz); + aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz); + + /* if start and end are on the same page, bail out early */ + if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz) + return true; + + /* skip first iteration */ + cur = rte_mem_virt2iova(aligned_start); + expected = cur + pgsz; + aligned_start = RTE_PTR_ADD(aligned_start, pgsz); + + while 
(aligned_start < aligned_end) { + cur = rte_mem_virt2iova(aligned_start); + if (cur != expected) + return false; + aligned_start = RTE_PTR_ADD(aligned_start, pgsz); + expected += pgsz; + } + } else { + int start_seg, end_seg, cur_seg; + rte_iova_t cur, expected; + + aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz); + aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz); + + start_seg = RTE_PTR_DIFF(aligned_start, msl->base_va) / + pgsz; + end_seg = RTE_PTR_DIFF(aligned_end, msl->base_va) / + pgsz; + + /* if start and end are on the same page, bail out early */ + if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz) + return true; + + /* skip first iteration */ + ms = rte_fbarray_get(&msl->memseg_arr, start_seg); + cur = ms->iova; + expected = cur + pgsz; + + /* if we can't access IOVA addresses, assume non-contiguous */ + if (cur == RTE_BAD_IOVA) + return false; + + for (cur_seg = start_seg + 1; cur_seg < end_seg; + cur_seg++, expected += pgsz) { + ms = rte_fbarray_get(&msl->memseg_arr, cur_seg); + + if (ms->iova != expected) + return false; + } + } + return true; +} + +int +eal_memalloc_mem_event_callback_register(const char *name, + rte_mem_event_callback_t clb, void *arg) +{ + struct mem_event_callback_entry *entry; + int ret, len; + if (name == NULL || clb == NULL) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_event_rwlock); + + entry = find_mem_event_callback(name, arg); + if (entry != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + ret = -1; + goto unlock; + } + + /* callback successfully created and is valid, add it to the list */ + entry->clb = clb; + entry->arg = arg; + strlcpy(entry->name, name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + TAILQ_INSERT_TAIL(&mem_event_callback_list, entry, next); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem event callback '%s:%p' registered\n", + name, arg); + +unlock: + rte_rwlock_write_unlock(&mem_event_rwlock); + return ret; +} + +int +eal_memalloc_mem_event_callback_unregister(const char *name, void *arg) +{ + struct mem_event_callback_entry *entry; + int ret, len; + + if (name == NULL) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_event_rwlock); + + entry = find_mem_event_callback(name, arg); + if (entry == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + TAILQ_REMOVE(&mem_event_callback_list, entry, next); + free(entry); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem event callback '%s:%p' unregistered\n", + name, arg); + +unlock: + rte_rwlock_write_unlock(&mem_event_rwlock); + return ret; +} + +void +eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start, + size_t len) +{ + struct mem_event_callback_entry *entry; + + rte_rwlock_read_lock(&mem_event_rwlock); + + TAILQ_FOREACH(entry, &mem_event_callback_list, next) { + RTE_LOG(DEBUG, EAL, "Calling mem event callback '%s:%p'\n", + entry->name, entry->arg); + entry->clb(event, start, len, entry->arg); + } + + rte_rwlock_read_unlock(&mem_event_rwlock); +} + +int +eal_memalloc_mem_alloc_validator_register(const char *name, + 
rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + struct mem_alloc_validator_entry *entry; + int ret, len; + if (name == NULL || clb == NULL || socket_id < 0) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_alloc_validator_rwlock); + + entry = find_mem_alloc_validator(name, socket_id); + if (entry != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + ret = -1; + goto unlock; + } + + /* callback successfully created and is valid, add it to the list */ + entry->clb = clb; + entry->socket_id = socket_id; + entry->limit = limit; + strlcpy(entry->name, name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + TAILQ_INSERT_TAIL(&mem_alloc_validator_list, entry, next); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i with limit %zu registered\n", + name, socket_id, limit); + +unlock: + rte_rwlock_write_unlock(&mem_alloc_validator_rwlock); + return ret; +} + +int +eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + struct mem_alloc_validator_entry *entry; + int ret, len; + + if (name == NULL || socket_id < 0) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_alloc_validator_rwlock); + + entry = find_mem_alloc_validator(name, socket_id); + if (entry == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + TAILQ_REMOVE(&mem_alloc_validator_list, entry, next); + free(entry); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i unregistered\n", + name, socket_id); + +unlock: + rte_rwlock_write_unlock(&mem_alloc_validator_rwlock); + return ret; +} + +int +eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len) +{ + struct mem_alloc_validator_entry *entry; + int ret = 0; + + rte_rwlock_read_lock(&mem_alloc_validator_rwlock); + + TAILQ_FOREACH(entry, &mem_alloc_validator_list, next) { + if (entry->socket_id != socket_id || entry->limit > new_len) + continue; + RTE_LOG(DEBUG, EAL, "Calling mem alloc validator '%s' on socket %i\n", + entry->name, entry->socket_id); + if (entry->clb(socket_id, entry->limit, new_len) < 0) + ret = -1; + } + + rte_rwlock_read_unlock(&mem_alloc_validator_rwlock); + + return ret; +} diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c index 852f3bb9..4f0688f9 100644 --- a/lib/librte_eal/common/eal_common_memory.c +++ b/lib/librte_eal/common/eal_common_memory.c @@ -2,82 +2,752 @@ * Copyright(c) 2010-2014 Intel Corporation */ +#include #include #include #include #include +#include #include #include #include #include +#include #include #include #include +#include #include +#include "eal_memalloc.h" #include "eal_private.h" #include "eal_internal_cfg.h" /* - * Return a pointer to a read-only table of struct rte_physmem_desc - * elements, containing the layout of all addressable physical - * memory. The last element of the table contains a NULL address. + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. 
Else, retry + * with a smaller zone: decrease *size by hugepage_sz until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of hugepage size. */ -const struct rte_memseg * -rte_eal_get_physmem_layout(void) + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" + +static uint64_t baseaddr_offset; +static uint64_t system_page_sz; + +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags) +{ + bool addr_is_hint, allow_shrink, unmap, no_align; + uint64_t map_sz; + void *mapped_addr, *aligned_addr; + + if (system_page_sz == 0) + system_page_sz = sysconf(_SC_PAGESIZE); + + mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); + + addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0; + allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0; + unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0; + + if (requested_addr == NULL && internal_config.base_virtaddr != 0) { + requested_addr = (void *) (internal_config.base_virtaddr + + (size_t)baseaddr_offset); + requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); + addr_is_hint = true; + } + + /* if requested address is not aligned by page size, or if requested + * address is NULL, add page size to requested length as we may get an + * address that's aligned by system page size, which can be smaller than + * our requested page size. additionally, we shouldn't try to align if + * system page size is the same as requested page size. + */ + no_align = (requested_addr != NULL && + ((uintptr_t)requested_addr & (page_sz - 1)) == 0) || + page_sz == system_page_sz; + + do { + map_sz = no_align ? *size : *size + page_sz; + if (map_sz > SIZE_MAX) { + RTE_LOG(ERR, EAL, "Map size too big\n"); + rte_errno = E2BIG; + return NULL; + } + + mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_READ, + mmap_flags, -1, 0); + if (mapped_addr == MAP_FAILED && allow_shrink) + *size -= page_sz; + } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0); + + /* align resulting address - if map failed, we will ignore the value + * anyway, so no need to add additional checks. + */ + aligned_addr = no_align ? mapped_addr : + RTE_PTR_ALIGN(mapped_addr, page_sz); + + if (*size == 0) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n", + strerror(errno)); + rte_errno = errno; + return NULL; + } else if (mapped_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", + strerror(errno)); + /* pass errno up the call chain */ + rte_errno = errno; + return NULL; + } else if (requested_addr != NULL && !addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n", + requested_addr, aligned_addr); + munmap(mapped_addr, map_sz); + rte_errno = EADDRNOTAVAIL; + return NULL; + } else if (requested_addr != NULL && addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n", + requested_addr, aligned_addr); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n"); + } + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + aligned_addr, *size); + + if (unmap) { + munmap(mapped_addr, map_sz); + } else if (!no_align) { + void *map_end, *aligned_end; + size_t before_len, after_len; + + /* when we reserve space with alignment, we add alignment to + * mapping size. 
On 32-bit, if 1GB alignment was requested, this + * would waste 1GB of address space, which is a luxury we cannot + * afford. so, if alignment was performed, check if any unneeded + * address space can be unmapped back. + */ + + map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz); + aligned_end = RTE_PTR_ADD(aligned_addr, *size); + + /* unmap space before aligned mmap address */ + before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr); + if (before_len > 0) + munmap(mapped_addr, before_len); + + /* unmap space after aligned end mmap address */ + after_len = RTE_PTR_DIFF(map_end, aligned_end); + if (after_len > 0) + munmap(aligned_end, after_len); + } + + baseaddr_offset += *size; + + return aligned_addr; +} + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +static int +free_memseg_list(struct rte_memseg_list *msl) +{ + if (rte_fbarray_destroy(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); + return -1; + } + memset(msl, 0, sizeof(*msl)); + return 0; +} + +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + uint64_t max_mem, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + uint64_t mem_amount; + int max_segs; + + mem_amount = get_mem_amount(page_sz, max_mem); + max_segs = mem_amount / page_sz; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, max_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + + return 0; +} + +static int __rte_unused +memseg_primary_init_32(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int active_sockets, hpi_idx, msl_idx = 0; + unsigned int socket_id, i; + struct rte_memseg_list *msl; + uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; + uint64_t max_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* this is a giant hack, but desperate times call for desperate + * measures. in legacy 32-bit mode, we cannot preallocate VA space, + * because having upwards of 2 gigabytes of VA space already mapped will + * interfere with our ability to map and sort hugepages. 
+ * + * therefore, in legacy 32-bit mode, we will be initializing memseg + * lists much later - in eal_memory.c, right after we unmap all the + * unneeded pages. this will not affect secondary processes, as those + * should be able to mmap the space without (too many) problems. + */ + if (internal_config.legacy_mem) + return 0; + + /* 32-bit mode is a very special case. we cannot know in advance where + * the user will want to allocate their memory, so we have to do some + * heuristics. + */ + active_sockets = 0; + total_requested_mem = 0; + if (internal_config.force_sockets) + for (i = 0; i < rte_socket_count(); i++) { + uint64_t mem; + + socket_id = rte_socket_id_by_idx(i); + mem = internal_config.socket_mem[socket_id]; + + if (mem == 0) + continue; + + active_sockets++; + total_requested_mem += mem; + } + else + total_requested_mem = internal_config.memory; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + if (total_requested_mem > max_mem) { + RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", + (unsigned int)(max_mem >> 20)); + return -1; + } + total_extra_mem = max_mem - total_requested_mem; + extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : + total_extra_mem / active_sockets; + + /* the allocation logic is a little bit convoluted, but here's how it + * works, in a nutshell: + * - if user hasn't specified on which sockets to allocate memory via + * --socket-mem, we allocate all of our memory on master core socket. + * - if user has specified sockets to allocate memory on, there may be + * some "unused" memory left (e.g. if user has specified --socket-mem + * such that not all memory adds up to 2 gigabytes), so add it to all + * sockets that are in use equally. + * + * page sizes are sorted by size in descending order, so we can safely + * assume that we dispense with bigger page sizes first. + */ + + /* create memseg lists */ + for (i = 0; i < rte_socket_count(); i++) { + int hp_sizes = (int) internal_config.num_hugepage_sizes; + uint64_t max_socket_mem, cur_socket_mem; + unsigned int master_lcore_socket; + struct rte_config *cfg = rte_eal_get_configuration(); + bool skip; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (socket_id > 0) + break; +#endif + + /* if we didn't specifically request memory on this socket */ + skip = active_sockets != 0 && + internal_config.socket_mem[socket_id] == 0; + /* ...or if we didn't specifically request memory on *any* + * socket, and this is not master lcore + */ + master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); + skip |= active_sockets == 0 && socket_id != master_lcore_socket; + + if (skip) { + RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", + socket_id); + continue; + } + + /* max amount of memory on this socket */ + max_socket_mem = (active_sockets != 0 ? 
+ internal_config.socket_mem[socket_id] : + internal_config.memory) + + extra_mem_per_socket; + cur_socket_mem = 0; + + for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { + uint64_t max_pagesz_mem, cur_pagesz_mem = 0; + uint64_t hugepage_sz; + struct hugepage_info *hpi; + int type_msl_idx, max_segs, total_segs = 0; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* check if pages are actually available */ + if (hpi->num_pages[socket_id] == 0) + continue; + + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + max_pagesz_mem = max_socket_mem - cur_socket_mem; + + /* make it multiple of page size */ + max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, + hugepage_sz); + + RTE_LOG(DEBUG, EAL, "Attempting to preallocate " + "%" PRIu64 "M on socket %i\n", + max_pagesz_mem >> 20, socket_id); + + type_msl_idx = 0; + while (cur_pagesz_mem < max_pagesz_mem && + total_segs < max_segs) { + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx]; + + if (alloc_memseg_list(msl, hugepage_sz, + max_pagesz_mem, socket_id, + type_msl_idx)) { + /* failing to allocate a memseg list is + * a serious error. + */ + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + if (alloc_va_space(msl)) { + /* if we couldn't allocate VA space, we + * can try with smaller page sizes. + */ + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); + /* deallocate memseg list */ + if (free_memseg_list(msl)) + return -1; + break; + } + + total_segs += msl->memseg_arr.len; + cur_pagesz_mem = total_segs * hugepage_sz; + type_msl_idx++; + msl_idx++; + } + cur_socket_mem += cur_pagesz_mem; + } + if (cur_socket_mem == 0) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", + socket_id); + return -1; + } + } + + return 0; +} + +static int __rte_unused +memseg_primary_init(void) { - return rte_eal_get_configuration()->mem_config->memseg; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, socket_id, hpi_idx, msl_idx = 0; + struct rte_memseg_list *msl; + uint64_t max_mem, total_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + total_mem = 0; + + /* create memseg lists */ + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + for (i = 0; i < (int) rte_socket_count(); i++) { + uint64_t max_type_mem, total_type_mem = 0; + int type_msl_idx, max_segs, total_segs = 0; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (socket_id > 0) + break; +#endif + + if (total_mem >= max_mem) + break; + + max_type_mem = RTE_MIN(max_mem - total_mem, + (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + + type_msl_idx = 0; + while (total_type_mem < max_type_mem && + total_segs < max_segs) { + uint64_t cur_max_mem; + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx++]; + + cur_max_mem = max_type_mem - total_type_mem; + if (alloc_memseg_list(msl, hugepage_sz, + cur_max_mem, socket_id, + 
type_msl_idx)) + return -1; + + total_segs += msl->memseg_arr.len; + total_type_mem = total_segs * hugepage_sz; + type_msl_idx++; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + return -1; + } + } + total_mem += total_type_mem; + } + } + return 0; } +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +static struct rte_memseg * +virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + const struct rte_fbarray *arr; + void *start, *end; + int ms_idx; + + /* a memseg list was specified, check if it's the right one */ + start = msl->base_va; + end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len); + + if (addr < start || addr >= end) + return NULL; + + /* now, calculate index */ + arr = &msl->memseg_arr; + ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz; + return rte_fbarray_get(arr, ms_idx); +} + +static struct rte_memseg_list * +virt2memseg_list(const void *addr) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + int msl_idx; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + void *start, *end; + msl = &mcfg->memsegs[msl_idx]; + + start = msl->base_va; + end = RTE_PTR_ADD(start, + (size_t)msl->page_sz * msl->memseg_arr.len); + if (addr >= start && addr < end) + break; + } + /* if we didn't find our memseg list */ + if (msl_idx == RTE_MAX_MEMSEG_LISTS) + return NULL; + return msl; +} + +__rte_experimental struct rte_memseg_list * +rte_mem_virt2memseg_list(const void *addr) +{ + return virt2memseg_list(addr); +} + +struct virtiova { + rte_iova_t iova; + void *virt; +}; +static int +find_virt(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} +static int +find_virt_legacy(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, size_t len, void *arg) +{ + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} + +__rte_experimental void * +rte_mem_iova2virt(rte_iova_t iova) +{ + struct virtiova vi; + + memset(&vi, 0, sizeof(vi)); + + vi.iova = iova; + /* for legacy mem, we can get away with scanning VA-contiguous segments, + * as we know they are PA-contiguous as well + */ + if (internal_config.legacy_mem) + rte_memseg_contig_walk(find_virt_legacy, &vi); + else + rte_memseg_walk(find_virt, &vi); + + return vi.virt; +} + +__rte_experimental struct rte_memseg * +rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + return virt2memseg(addr, msl != NULL ? 
msl : + rte_mem_virt2memseg_list(addr)); +} + +static int +physmem_size(const struct rte_memseg_list *msl, void *arg) +{ + uint64_t *total_len = arg; + + *total_len += msl->memseg_arr.count * msl->page_sz; + + return 0; +} /* get the total size of memory */ uint64_t rte_eal_get_physmem_size(void) { - const struct rte_mem_config *mcfg; - unsigned i = 0; uint64_t total_len = 0; - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; + rte_memseg_list_walk(physmem_size, &total_len); - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - if (mcfg->memseg[i].addr == NULL) - break; + return total_len; +} - total_len += mcfg->memseg[i].len; - } +static int +dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx, ms_idx; + FILE *f = arg; - return total_len; + msl_idx = msl - mcfg->memsegs; + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) + return -1; + + fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, " + "virt:%p, socket_id:%"PRId32", " + "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " + "nrank:%"PRIx32"\n", + msl_idx, ms_idx, + ms->iova, + ms->len, + ms->addr, + ms->socket_id, + ms->hugepage_sz, + ms->nchannel, + ms->nrank); + + return 0; } -/* Dump the physical memory layout on console */ -void -rte_dump_physmem_layout(FILE *f) +/* + * Defining here because declared in rte_memory.h, but the actual implementation + * is in eal_common_memalloc.c, like all other memalloc internals. + */ +int __rte_experimental +rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb, + void *arg) { - const struct rte_mem_config *mcfg; - unsigned i = 0; + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_register(name, clb, arg); +} - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; +int __rte_experimental +rte_mem_event_callback_unregister(const char *name, void *arg) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_unregister(name, arg); +} - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - if (mcfg->memseg[i].addr == NULL) - break; +int __rte_experimental +rte_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id, + limit); +} - fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, " - "virt:%p, socket_id:%"PRId32", " - "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " - "nrank:%"PRIx32"\n", i, - mcfg->memseg[i].iova, - mcfg->memseg[i].len, - mcfg->memseg[i].addr, - mcfg->memseg[i].socket_id, - mcfg->memseg[i].hugepage_sz, - mcfg->memseg[i].nchannel, - mcfg->memseg[i].nrank); +int __rte_experimental +rte_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + /* FreeBSD boots with 
legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; } + return eal_memalloc_mem_alloc_validator_unregister(name, socket_id); +} + +/* Dump the physical memory layout on console */ +void +rte_dump_physmem_layout(FILE *f) +{ + rte_memseg_walk(dump_memseg, f); } /* return the number of memory channels */ @@ -117,20 +787,162 @@ rte_mem_lock_page(const void *virt) return mlock((void *)aligned, page_size); } +int __rte_experimental +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + int n_segs; + size_t len; + + ms = rte_fbarray_get(arr, ms_idx); + + /* find how many more segments there are, starting with + * this one. + */ + n_segs = rte_fbarray_find_contig_used(arr, ms_idx); + len = n_segs * msl->page_sz; + + ret = func(msl, ms, len, arg); + if (ret < 0) { + ret = -1; + goto out; + } else if (ret > 0) { + ret = 1; + goto out; + } + ms_idx = rte_fbarray_find_next_used(arr, + ms_idx + n_segs); + } + } +out: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; +} + +int __rte_experimental +rte_memseg_walk(rte_memseg_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + ms = rte_fbarray_get(arr, ms_idx); + ret = func(msl, ms, arg); + if (ret < 0) { + ret = -1; + goto out; + } else if (ret > 0) { + ret = 1; + goto out; + } + ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1); + } + } +out: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; +} + +int __rte_experimental +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->base_va == NULL) + continue; + + ret = func(msl, arg); + if (ret < 0) { + ret = -1; + goto out; + } + if (ret > 0) { + ret = 1; + goto out; + } + } +out: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return ret; +} + /* init memory subsystem */ int rte_eal_memory_init(void) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int retval; RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n"); - const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? 
+ if (!mcfg) + return -1; + + /* lock mem hotplug here, to prevent races while we init */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + + retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? +#ifndef RTE_ARCH_64 + memseg_primary_init_32() : +#else + memseg_primary_init() : +#endif + memseg_secondary_init(); + + if (retval < 0) + goto fail; + + if (eal_memalloc_init() < 0) + goto fail; + + retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? rte_eal_hugepage_init() : rte_eal_hugepage_attach(); if (retval < 0) - return -1; + goto fail; if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0) - return -1; + goto fail; return 0; +fail: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return -1; } diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c index 1ab3ade2..faa3b061 100644 --- a/lib/librte_eal/common/eal_common_memzone.c +++ b/lib/librte_eal/common/eal_common_memzone.c @@ -28,42 +28,30 @@ static inline const struct rte_memzone * memzone_lookup_thread_unsafe(const char *name) { - const struct rte_mem_config *mcfg; + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; const struct rte_memzone *mz; - unsigned i = 0; + int i = 0; /* get pointer to global configuration */ mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; /* * the algorithm is not optimal (linear), but there are few * zones and this function should be called at init only */ - for (i = 0; i < RTE_MAX_MEMZONE; i++) { - mz = &mcfg->memzone[i]; - if (mz->addr != NULL && !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE)) - return &mcfg->memzone[i]; + i = rte_fbarray_find_next_used(arr, 0); + while (i >= 0) { + mz = rte_fbarray_get(arr, i); + if (mz->addr != NULL && + !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE)) + return mz; + i = rte_fbarray_find_next_used(arr, i + 1); } - return NULL; } -static inline struct rte_memzone * -get_next_free_memzone(void) -{ - struct rte_mem_config *mcfg; - unsigned i = 0; - - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; - - for (i = 0; i < RTE_MAX_MEMZONE; i++) { - if (mcfg->memzone[i].addr == NULL) - return &mcfg->memzone[i]; - } - - return NULL; -} /* This function will return the greatest free block if a heap has been * specified. 
If no heap has been specified, it will return the heap and @@ -98,18 +86,22 @@ find_heap_max_free_elem(int *s, unsigned align) static const struct rte_memzone * memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, - int socket_id, unsigned flags, unsigned align, unsigned bound) + int socket_id, unsigned int flags, unsigned int align, + unsigned int bound) { struct rte_memzone *mz; struct rte_mem_config *mcfg; + struct rte_fbarray *arr; size_t requested_len; - int socket, i; + int mz_idx; + bool contig; /* get pointer to global configuration */ mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; /* no more room in config */ - if (mcfg->memzone_cnt >= RTE_MAX_MEMZONE) { + if (arr->count >= arr->len) { RTE_LOG(ERR, EAL, "%s(): No more room in config\n", __func__); rte_errno = ENOSPC; return NULL; @@ -169,7 +161,17 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, if (!rte_eal_has_hugepages()) socket_id = SOCKET_ID_ANY; + contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0; + /* malloc only cares about size flags, remove contig flag from flags */ + flags &= ~RTE_MEMZONE_IOVA_CONTIG; + if (len == 0) { + /* len == 0 is only allowed for non-contiguous zones */ + if (contig) { + RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n"); + rte_errno = EINVAL; + return NULL; + } if (bound != 0) requested_len = bound; else { @@ -181,28 +183,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, } } - if (socket_id == SOCKET_ID_ANY) - socket = malloc_get_numa_socket(); - else - socket = socket_id; - /* allocate memory on heap */ - void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket], NULL, - requested_len, flags, align, bound); - - if ((mz_addr == NULL) && (socket_id == SOCKET_ID_ANY)) { - /* try other heaps */ - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { - if (socket == i) - continue; - - mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[i], - NULL, requested_len, flags, align, bound); - if (mz_addr != NULL) - break; - } - } - + void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags, + align, bound, contig); if (mz_addr == NULL) { rte_errno = ENOMEM; return NULL; @@ -211,33 +194,37 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, struct malloc_elem *elem = malloc_elem_from_data(mz_addr); /* fill the zone in config */ - mz = get_next_free_memzone(); + mz_idx = rte_fbarray_find_next_free(arr, 0); + + if (mz_idx < 0) { + mz = NULL; + } else { + rte_fbarray_set_used(arr, mz_idx); + mz = rte_fbarray_get(arr, mz_idx); + } if (mz == NULL) { - RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone but there is room " - "in config!\n", __func__); - malloc_elem_free(elem); + RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone\n", __func__); + malloc_heap_free(elem); rte_errno = ENOSPC; return NULL; } - mcfg->memzone_cnt++; snprintf(mz->name, sizeof(mz->name), "%s", name); mz->iova = rte_malloc_virt2iova(mz_addr); mz->addr = mz_addr; - mz->len = (requested_len == 0 ? elem->size : requested_len); - mz->hugepage_sz = elem->ms->hugepage_sz; - mz->socket_id = elem->ms->socket_id; + mz->len = (requested_len == 0 ? 
+ (elem->size - MALLOC_ELEM_OVERHEAD) : requested_len); + mz->hugepage_sz = elem->msl->page_sz; + mz->socket_id = elem->msl->socket_id; mz->flags = 0; - mz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg; return mz; } static const struct rte_memzone * -rte_memzone_reserve_thread_safe(const char *name, size_t len, - int socket_id, unsigned flags, unsigned align, - unsigned bound) +rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id, + unsigned int flags, unsigned int align, unsigned int bound) { struct rte_mem_config *mcfg; const struct rte_memzone *mz = NULL; @@ -296,34 +283,38 @@ int rte_memzone_free(const struct rte_memzone *mz) { struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + struct rte_memzone *found_mz; int ret = 0; - void *addr; + void *addr = NULL; unsigned idx; if (mz == NULL) return -EINVAL; mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; rte_rwlock_write_lock(&mcfg->mlock); - idx = ((uintptr_t)mz - (uintptr_t)mcfg->memzone); - idx = idx / sizeof(struct rte_memzone); + idx = rte_fbarray_find_idx(arr, mz); + found_mz = rte_fbarray_get(arr, idx); - addr = mcfg->memzone[idx].addr; - if (addr == NULL) + if (found_mz == NULL) { + ret = -EINVAL; + } else if (found_mz->addr == NULL) { + RTE_LOG(ERR, EAL, "Memzone is not allocated\n"); ret = -EINVAL; - else if (mcfg->memzone_cnt == 0) { - rte_panic("%s(): memzone address not NULL but memzone_cnt is 0!\n", - __func__); } else { - memset(&mcfg->memzone[idx], 0, sizeof(mcfg->memzone[idx])); - mcfg->memzone_cnt--; + addr = found_mz->addr; + memset(found_mz, 0, sizeof(*found_mz)); + rte_fbarray_set_free(arr, idx); } rte_rwlock_write_unlock(&mcfg->mlock); - rte_free(addr); + if (addr != NULL) + rte_free(addr); return ret; } @@ -348,31 +339,61 @@ rte_memzone_lookup(const char *name) return memzone; } +static void +dump_memzone(const struct rte_memzone *mz, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl = NULL; + void *cur_addr, *mz_end; + struct rte_memseg *ms; + int mz_idx, ms_idx; + size_t page_sz; + FILE *f = arg; + + mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz); + + fprintf(f, "Zone %u: name:<%s>, len:0x%zx, virt:%p, " + "socket_id:%"PRId32", flags:%"PRIx32"\n", + mz_idx, + mz->name, + mz->len, + mz->addr, + mz->socket_id, + mz->flags); + + /* go through each page occupied by this memzone */ + msl = rte_mem_virt2memseg_list(mz->addr); + if (!msl) { + RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n"); + return; + } + page_sz = (size_t)mz->hugepage_sz; + cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz); + mz_end = RTE_PTR_ADD(cur_addr, mz->len); + + fprintf(f, "physical segments used:\n"); + ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz; + ms = rte_fbarray_get(&msl->memseg_arr, ms_idx); + + do { + fprintf(f, " addr: %p iova: 0x%" PRIx64 " " + "len: 0x%zx " + "pagesz: 0x%zx\n", + cur_addr, ms->iova, ms->len, page_sz); + + /* advance VA to next page */ + cur_addr = RTE_PTR_ADD(cur_addr, page_sz); + + /* memzones occupy contiguous segments */ + ++ms; + } while (cur_addr < mz_end); +} + /* Dump all reserved memory zones on console */ void rte_memzone_dump(FILE *f) { - struct rte_mem_config *mcfg; - unsigned i = 0; - - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; - - rte_rwlock_read_lock(&mcfg->mlock); - /* dump all zones */ - for (i=0; i<RTE_MAX_MEMZONE; i++) { - if (mcfg->memzone[i].addr == NULL) - break; - fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx" - ",
virt:%p, socket_id:%"PRId32", flags:%"PRIx32"\n", i, - mcfg->memzone[i].name, - mcfg->memzone[i].iova, - mcfg->memzone[i].len, - mcfg->memzone[i].addr, - mcfg->memzone[i].socket_id, - mcfg->memzone[i].flags); - } - rte_rwlock_read_unlock(&mcfg->mlock); + rte_memzone_walk(dump_memzone, f); } /* @@ -382,30 +403,27 @@ int rte_eal_memzone_init(void) { struct rte_mem_config *mcfg; - const struct rte_memseg *memseg; /* get pointer to global configuration */ mcfg = rte_eal_get_configuration()->mem_config; - /* secondary processes don't need to initialise anything */ - if (rte_eal_process_type() == RTE_PROC_SECONDARY) - return 0; + rte_rwlock_write_lock(&mcfg->mlock); - memseg = rte_eal_get_physmem_layout(); - if (memseg == NULL) { - RTE_LOG(ERR, EAL, "%s(): Cannot get physical layout\n", __func__); + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + rte_fbarray_init(&mcfg->memzones, "memzone", + RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) { + RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n"); + return -1; + } else if (rte_eal_process_type() == RTE_PROC_SECONDARY && + rte_fbarray_attach(&mcfg->memzones)) { + RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n"); + rte_rwlock_write_unlock(&mcfg->mlock); + return -1; + } - rte_rwlock_write_lock(&mcfg->mlock); - - /* delete all zones */ - mcfg->memzone_cnt = 0; - memset(mcfg->memzone, 0, sizeof(mcfg->memzone)); - rte_rwlock_write_unlock(&mcfg->mlock); - return rte_eal_malloc_heap_init(); + return 0; } /* Walk all reserved memory zones */ @@ -413,14 +431,18 @@ void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *), void *arg) { struct rte_mem_config *mcfg; - unsigned i; + struct rte_fbarray *arr; + int i; mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; rte_rwlock_read_lock(&mcfg->mlock); - for (i=0; i<RTE_MAX_MEMZONE; i++) { - if (mcfg->memzone[i].addr != NULL) - (*func)(&mcfg->memzone[i], arg); + i = rte_fbarray_find_next_used(arr, 0); + while (i >= 0) { + struct rte_memzone *mz = rte_fbarray_get(arr, i); + (*func)(mz, arg); + i = rte_fbarray_find_next_used(arr, i + 1); } rte_rwlock_read_unlock(&mcfg->mlock); } diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c index 9f2f8d25..ecebb292 100644 --- a/lib/librte_eal/common/eal_common_options.c +++ b/lib/librte_eal/common/eal_common_options.c @@ -27,6 +27,7 @@ #include "eal_internal_cfg.h" #include "eal_options.h" #include "eal_filesystem.h" +#include "eal_private.h" #define BITS_PER_HEX 4 #define LCORE_OPT_LST 1 @@ -73,6 +74,8 @@ eal_long_options[] = { {OPT_VDEV, 1, NULL, OPT_VDEV_NUM }, {OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM }, {OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM }, + {OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM }, + {OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM}, {0, 0, NULL, 0 } }; @@ -151,7 +154,7 @@ eal_option_device_parse(void) TAILQ_FOREACH_SAFE(devopt, &devopt_list, next, tmp) { if (ret == 0) { - ret = rte_eal_devargs_add(devopt->type, devopt->arg); + ret = rte_devargs_add(devopt->type, devopt->arg); if (ret) RTE_LOG(ERR, EAL, "Unable to parse device '%s'\n", devopt->arg); @@ -177,8 +180,11 @@ eal_reset_internal_config(struct internal_config *internal_cfg) for (i = 0; i < RTE_MAX_NUMA_NODES; i++) internal_cfg->socket_mem[i] = 0; /* zero out hugedir descriptors */ - for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) { + memset(&internal_cfg->hugepage_info[i], 0, + sizeof(internal_cfg->hugepage_info[0])); internal_cfg->hugepage_info[i].lock_descriptor = -1; + }
internal_cfg->base_virtaddr = 0; internal_cfg->syslog_facility = LOG_DAEMON; @@ -194,6 +200,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg) internal_cfg->vmware_tsc_map = 0; internal_cfg->create_uio_dev = 0; internal_cfg->user_mbuf_pool_ops_name = NULL; + internal_cfg->init_complete = 0; } static int @@ -875,7 +882,7 @@ static int eal_parse_syslog(const char *facility, struct internal_config *conf) { int i; - static struct { + static const struct { const char *name; int value; } map[] = { @@ -911,43 +918,92 @@ eal_parse_syslog(const char *facility, struct internal_config *conf) } static int -eal_parse_log_level(const char *arg) +eal_parse_log_priority(const char *level) { - char *end, *str, *type, *level; + static const char * const levels[] = { + [RTE_LOG_EMERG] = "emergency", + [RTE_LOG_ALERT] = "alert", + [RTE_LOG_CRIT] = "critical", + [RTE_LOG_ERR] = "error", + [RTE_LOG_WARNING] = "warning", + [RTE_LOG_NOTICE] = "notice", + [RTE_LOG_INFO] = "info", + [RTE_LOG_DEBUG] = "debug", + }; + size_t len = strlen(level); unsigned long tmp; + char *end; + unsigned int i; - str = strdup(arg); - if (str == NULL) + if (len == 0) return -1; - if (strchr(str, ',') == NULL) { - type = NULL; - level = str; - } else { - type = strsep(&str, ","); - level = strsep(&str, ","); + /* look for named values, skip 0 which is not a valid level */ + for (i = 1; i < RTE_DIM(levels); i++) { + if (strncmp(levels[i], level, len) == 0) + return i; } + /* not a string, maybe it is numeric */ errno = 0; tmp = strtoul(level, &end, 0); /* check for errors */ - if ((errno != 0) || (level[0] == '\0') || - end == NULL || (*end != '\0')) - goto fail; + if (errno != 0 || end == NULL || *end != '\0' || + tmp >= UINT32_MAX) + return -1; - /* log_level is a uint32_t */ - if (tmp >= UINT32_MAX) - goto fail; + return tmp; +} + +static int +eal_parse_log_level(const char *arg) +{ + const char *pattern = NULL; + const char *regex = NULL; + char *str, *level; + int priority; - if (type == NULL) { - rte_log_set_global_level(tmp); - } else if (rte_log_set_level_regexp(type, tmp) < 0) { - printf("cannot set log level %s,%lu\n", - type, tmp); + str = strdup(arg); + if (str == NULL) + return -1; + + if ((level = strchr(str, ','))) { + regex = str; + *level++ = '\0'; + } else if ((level = strchr(str, ':'))) { + pattern = str; + *level++ = '\0'; + } else { + level = str; + } + + priority = eal_parse_log_priority(level); + if (priority < 0) { + fprintf(stderr, "invalid log priority: %s\n", level); goto fail; } + if (regex) { + if (rte_log_set_level_regexp(regex, priority) < 0) { + fprintf(stderr, "cannot set log level %s,%d\n", + pattern, priority); + goto fail; + } + if (rte_log_save_regexp(regex, priority) < 0) + goto fail; + } else if (pattern) { + if (rte_log_set_level_pattern(pattern, priority) < 0) { + fprintf(stderr, "cannot set log level %s:%d\n", + pattern, priority); + goto fail; + } + if (rte_log_save_pattern(pattern, priority) < 0) + goto fail; + } else { + rte_log_set_global_level(priority); + } + free(str); return 0; @@ -1089,6 +1145,8 @@ eal_parse_common_option(int opt, const char *optarg, case OPT_NO_HUGE_NUM: conf->no_hugetlbfs = 1; + /* no-huge is legacy mem */ + conf->legacy_mem = 1; break; case OPT_NO_PCI_NUM: @@ -1160,6 +1218,12 @@ eal_parse_common_option(int opt, const char *optarg, core_parsed = LCORE_OPT_MAP; break; + case OPT_LEGACY_MEM_NUM: + conf->legacy_mem = 1; + break; + case OPT_SINGLE_FILE_SEGMENTS_NUM: + conf->single_file_segments = 1; + break; /* don't know what to do, leave this to caller 
*/ default: @@ -1302,7 +1366,7 @@ eal_common_usage(void) " --"OPT_PROC_TYPE" Type of this process (primary|secondary|auto)\n" " --"OPT_SYSLOG" Set syslog facility\n" " --"OPT_LOG_LEVEL"= Set global log level\n" - " --"OPT_LOG_LEVEL"=,\n" + " --"OPT_LOG_LEVEL"=:\n" " Set specific log level\n" " -v Display version information on startup\n" " -h, --help This help\n" diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c index caa8774a..707d8ab3 100644 --- a/lib/librte_eal/common/eal_common_proc.c +++ b/lib/librte_eal/common/eal_common_proc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include "eal_private.h" #include "eal_filesystem.h" @@ -51,6 +53,7 @@ enum mp_type { MP_MSG, /* Share message with peers, will not block */ MP_REQ, /* Request for information, Will block for a reply */ MP_REP, /* Response to previously-received request */ + MP_IGN, /* Response telling requester to ignore this response */ }; struct mp_msg_internal { @@ -58,31 +61,58 @@ struct mp_msg_internal { struct rte_mp_msg msg; }; -struct sync_request { - TAILQ_ENTRY(sync_request) next; - int reply_received; +struct async_request_param { + rte_mp_async_reply_t clb; + struct rte_mp_reply user_reply; + struct timespec end; + int n_responses_processed; +}; + +struct pending_request { + TAILQ_ENTRY(pending_request) next; + enum { + REQUEST_TYPE_SYNC, + REQUEST_TYPE_ASYNC + } type; char dst[PATH_MAX]; struct rte_mp_msg *request; struct rte_mp_msg *reply; - pthread_cond_t cond; + int reply_received; + RTE_STD_C11 + union { + struct { + struct async_request_param *param; + } async; + struct { + pthread_cond_t cond; + } sync; + }; }; -TAILQ_HEAD(sync_request_list, sync_request); +TAILQ_HEAD(pending_request_list, pending_request); static struct { - struct sync_request_list requests; + struct pending_request_list requests; pthread_mutex_t lock; -} sync_requests = { - .requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests), - .lock = PTHREAD_MUTEX_INITIALIZER + pthread_cond_t async_cond; +} pending_requests = { + .requests = TAILQ_HEAD_INITIALIZER(pending_requests.requests), + .lock = PTHREAD_MUTEX_INITIALIZER, + .async_cond = PTHREAD_COND_INITIALIZER + /**< used in async requests only */ }; -static struct sync_request * -find_sync_request(const char *dst, const char *act_name) +/* forward declarations */ +static int +mp_send(struct rte_mp_msg *msg, const char *peer, int type); + + +static struct pending_request * +find_pending_request(const char *dst, const char *act_name) { - struct sync_request *r; + struct pending_request *r; - TAILQ_FOREACH(r, &sync_requests.requests, next) { + TAILQ_FOREACH(r, &pending_requests.requests, next) { if (!strcmp(r->dst, dst) && !strcmp(r->request->name, act_name)) break; @@ -91,6 +121,17 @@ find_sync_request(const char *dst, const char *act_name) return r; } +static void +create_socket_path(const char *name, char *buf, int len) +{ + const char *prefix = eal_mp_socket_path(); + + if (strlen(name) > 0) + snprintf(buf, len, "%s_%s", prefix, name); + else + strlcpy(buf, prefix, len); +} + int rte_eal_primary_proc_alive(const char *config_file_path) { @@ -159,7 +200,7 @@ rte_mp_action_register(const char *name, rte_mp_t action) rte_errno = ENOMEM; return -1; } - strcpy(entry->action_name, name); + strlcpy(entry->action_name, name, sizeof(entry->action_name)); entry->action = action; pthread_mutex_lock(&mp_mutex_action); @@ -241,23 +282,30 @@ read_msg(struct 
mp_msg_internal *m, struct sockaddr_un *s) static void process_msg(struct mp_msg_internal *m, struct sockaddr_un *s) { - struct sync_request *sync_req; + struct pending_request *pending_req; struct action_entry *entry; struct rte_mp_msg *msg = &m->msg; rte_mp_t action = NULL; RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name); - if (m->type == MP_REP) { - pthread_mutex_lock(&sync_requests.lock); - sync_req = find_sync_request(s->sun_path, msg->name); - if (sync_req) { - memcpy(sync_req->reply, msg, sizeof(*msg)); - sync_req->reply_received = 1; - pthread_cond_signal(&sync_req->cond); + if (m->type == MP_REP || m->type == MP_IGN) { + pthread_mutex_lock(&pending_requests.lock); + pending_req = find_pending_request(s->sun_path, msg->name); + if (pending_req) { + memcpy(pending_req->reply, msg, sizeof(*msg)); + /* -1 indicates that we've been asked to ignore */ + pending_req->reply_received = + m->type == MP_REP ? 1 : -1; + + if (pending_req->type == REQUEST_TYPE_SYNC) + pthread_cond_signal(&pending_req->sync.cond); + else if (pending_req->type == REQUEST_TYPE_ASYNC) + pthread_cond_signal( + &pending_requests.async_cond); } else RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name); - pthread_mutex_unlock(&sync_requests.lock); + pthread_mutex_unlock(&pending_requests.lock); return; } @@ -267,10 +315,25 @@ process_msg(struct mp_msg_internal *m, struct sockaddr_un *s) action = entry->action; pthread_mutex_unlock(&mp_mutex_action); - if (!action) - RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name); - else if (action(msg, s->sun_path) < 0) + if (!action) { + if (m->type == MP_REQ && !internal_config.init_complete) { + /* if this is a request, and init is not yet complete, + * and callback wasn't registered, we should tell the + * requester to ignore our existence because we're not + * yet ready to process this request. + */ + struct rte_mp_msg dummy; + + memset(&dummy, 0, sizeof(dummy)); + strlcpy(dummy.name, msg->name, sizeof(dummy.name)); + mp_send(&dummy, s->sun_path, MP_IGN); + } else { + RTE_LOG(ERR, EAL, "Cannot find action: %s\n", + msg->name); + } + } else if (action(msg, s->sun_path) < 0) { RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name); + } } static void * @@ -287,11 +350,227 @@ mp_handle(void *arg __rte_unused) return NULL; } +static int +timespec_cmp(const struct timespec *a, const struct timespec *b) +{ + if (a->tv_sec < b->tv_sec) + return -1; + if (a->tv_sec > b->tv_sec) + return 1; + if (a->tv_nsec < b->tv_nsec) + return -1; + if (a->tv_nsec > b->tv_nsec) + return 1; + return 0; +} + +enum async_action { + ACTION_NONE, /**< don't do anything */ + ACTION_FREE, /**< free the action entry, but don't trigger callback */ + ACTION_TRIGGER /**< trigger callback, then free action entry */ +}; + +static enum async_action +process_async_request(struct pending_request *sr, const struct timespec *now) +{ + struct async_request_param *param; + struct rte_mp_reply *reply; + bool timeout, received, last_msg; + + param = sr->async.param; + reply = &param->user_reply; + + /* did we timeout? */ + timeout = timespec_cmp(&param->end, now) <= 0; + + /* did we receive a response? */ + received = sr->reply_received != 0; + + /* if we didn't time out, and we didn't receive a response, ignore */ + if (!timeout && !received) + return ACTION_NONE; + + /* if we received a response, adjust relevant data and copy message.
*/ + if (sr->reply_received == 1 && sr->reply) { + struct rte_mp_msg *msg, *user_msgs, *tmp; + + msg = sr->reply; + user_msgs = reply->msgs; + + tmp = realloc(user_msgs, sizeof(*msg) * + (reply->nb_received + 1)); + if (!tmp) { + RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n", + sr->dst, sr->request->name); + /* this entry is going to be removed and its message + * dropped, but we don't want to leak memory, so + * continue. + */ + } else { + user_msgs = tmp; + reply->msgs = user_msgs; + memcpy(&user_msgs[reply->nb_received], + msg, sizeof(*msg)); + reply->nb_received++; + } + + /* mark this request as processed */ + param->n_responses_processed++; + } else if (sr->reply_received == -1) { + /* we were asked to ignore this process */ + reply->nb_sent--; + } else if (timeout) { + /* count it as processed response, but don't increment + * nb_received. + */ + param->n_responses_processed++; + } + + free(sr->reply); + + last_msg = param->n_responses_processed == reply->nb_sent; + + return last_msg ? ACTION_TRIGGER : ACTION_FREE; +} + +static void +trigger_async_action(struct pending_request *sr) +{ + struct async_request_param *param; + struct rte_mp_reply *reply; + + param = sr->async.param; + reply = &param->user_reply; + + param->clb(sr->request, reply); + + /* clean up */ + free(sr->async.param->user_reply.msgs); + free(sr->async.param); + free(sr->request); +} + +static struct pending_request * +check_trigger(struct timespec *ts) +{ + struct pending_request *next, *cur, *trigger = NULL; + + TAILQ_FOREACH_SAFE(cur, &pending_requests.requests, next, next) { + enum async_action action; + if (cur->type != REQUEST_TYPE_ASYNC) + continue; + + action = process_async_request(cur, ts); + if (action == ACTION_FREE) { + TAILQ_REMOVE(&pending_requests.requests, cur, next); + free(cur); + } else if (action == ACTION_TRIGGER) { + TAILQ_REMOVE(&pending_requests.requests, cur, next); + trigger = cur; + break; + } + } + return trigger; +} + +static void +wait_for_async_messages(void) +{ + struct pending_request *sr; + struct timespec timeout; + bool timedwait = false; + bool nowait = false; + int ret; + + /* scan through the list and see if there are any timeouts that + * are earlier than our current timeout. + */ + TAILQ_FOREACH(sr, &pending_requests.requests, next) { + if (sr->type != REQUEST_TYPE_ASYNC) + continue; + if (!timedwait || timespec_cmp(&sr->async.param->end, + &timeout) < 0) { + memcpy(&timeout, &sr->async.param->end, + sizeof(timeout)); + timedwait = true; + } + + /* sometimes, we don't even wait */ + if (sr->reply_received) { + nowait = true; + break; + } + } + + if (nowait) + return; + + do { + ret = timedwait ?
+ pthread_cond_timedwait( + &pending_requests.async_cond, + &pending_requests.lock, + &timeout) : + pthread_cond_wait( + &pending_requests.async_cond, + &pending_requests.lock); + } while (ret != 0 && ret != ETIMEDOUT); + + /* we've been woken up or timed out */ +} + +static void * +async_reply_handle(void *arg __rte_unused) +{ + struct timeval now; + struct timespec ts_now; + while (1) { + struct pending_request *trigger = NULL; + + pthread_mutex_lock(&pending_requests.lock); + + /* we exit this function holding the lock */ + wait_for_async_messages(); + + if (gettimeofday(&now, NULL) < 0) { + pthread_mutex_unlock(&pending_requests.lock); + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + break; + } + ts_now.tv_nsec = now.tv_usec * 1000; + ts_now.tv_sec = now.tv_sec; + + do { + trigger = check_trigger(&ts_now); + /* unlock request list */ + pthread_mutex_unlock(&pending_requests.lock); + + if (trigger) { + trigger_async_action(trigger); + free(trigger); + + /* we've triggered a callback, but there may be + * more, so lock the list and check again. + */ + pthread_mutex_lock(&pending_requests.lock); + } + } while (trigger); + } + + RTE_LOG(ERR, EAL, "ERROR: asynchronous requests disabled\n"); + + return NULL; +} + static int open_socket_fd(void) { + char peer_name[PATH_MAX] = {0}; struct sockaddr_un un; - const char *prefix = eal_mp_socket_path(); + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + snprintf(peer_name, sizeof(peer_name), + "%d_%"PRIx64, getpid(), rte_rdtsc()); mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0); if (mp_fd < 0) { @@ -301,13 +580,11 @@ open_socket_fd(void) memset(&un, 0, sizeof(un)); un.sun_family = AF_UNIX; - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix); - else { - snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d_%"PRIx64, - prefix, getpid(), rte_rdtsc()); - } + + create_socket_path(peer_name, un.sun_path, sizeof(un.sun_path)); + unlink(un.sun_path); /* May still exist since last run */ + if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) { RTE_LOG(ERR, EAL, "failed to bind %s: %s\n", un.sun_path, strerror(errno)); @@ -342,54 +619,73 @@ unlink_sockets(const char *filter) return 0; } -static void -unlink_socket_by_path(const char *path) -{ - char *filename; - char *fullpath = strdup(path); - - if (!fullpath) - return; - filename = basename(fullpath); - unlink_sockets(filename); - free(fullpath); - RTE_LOG(INFO, EAL, "Remove socket %s\n", path); -} - int rte_mp_channel_init(void) { - char thread_name[RTE_MAX_THREAD_NAME_LEN]; - char *path; - pthread_t tid; + char path[PATH_MAX]; + int dir_fd; + pthread_t mp_handle_tid, async_reply_handle_tid; + + /* create filter path */ + create_socket_path("*", path, sizeof(path)); + strlcpy(mp_filter, basename(path), sizeof(mp_filter)); - snprintf(mp_filter, PATH_MAX, ".%s_unix_*", - internal_config.hugefile_prefix); + /* path may have been modified, so recreate it */ + create_socket_path("*", path, sizeof(path)); + strlcpy(mp_dir_path, dirname(path), sizeof(mp_dir_path)); + + /* lock the directory */ + dir_fd = open(mp_dir_path, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "failed to open %s: %s\n", + mp_dir_path, strerror(errno)); + return -1; + } - path = strdup(eal_mp_socket_path()); - snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path)); - free(path); + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "failed to lock %s: %s\n", + mp_dir_path, strerror(errno)); + close(dir_fd); + return -1; + } if (rte_eal_process_type() == RTE_PROC_PRIMARY && - 
unlink_sockets(mp_filter)) { + unlink_sockets(mp_filter)) { RTE_LOG(ERR, EAL, "failed to unlink mp sockets\n"); + close(dir_fd); return -1; } - if (open_socket_fd() < 0) + if (open_socket_fd() < 0) { + close(dir_fd); + return -1; + } + + if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle", + NULL, mp_handle, NULL) < 0) { + RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", + strerror(errno)); + close(mp_fd); + close(dir_fd); + mp_fd = -1; return -1; + } - if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) { + if (rte_ctrl_thread_create(&async_reply_handle_tid, + "rte_mp_async", NULL, + async_reply_handle, NULL) < 0) { RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", strerror(errno)); close(mp_fd); + close(dir_fd); mp_fd = -1; return -1; } - /* try best to set thread name */ - snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle"); - rte_thread_setname(tid, thread_name); + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + close(dir_fd); + return 0; } @@ -416,7 +712,7 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg, int type) memset(&dst, 0, sizeof(dst)); dst.sun_family = AF_UNIX; - snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path); + strlcpy(dst.sun_path, dst_path, sizeof(dst.sun_path)); memset(&msgh, 0, sizeof(msgh)); memset(control, 0, sizeof(control)); @@ -444,13 +740,12 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg, int type) if (snd < 0) { rte_errno = errno; /* Check if it caused by peer process exits */ - if (errno == -ECONNREFUSED) { - /* We don't unlink the primary's socket here */ - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - unlink_socket_by_path(dst_path); + if (errno == ECONNREFUSED && + rte_eal_process_type() == RTE_PROC_PRIMARY) { + unlink(dst_path); return 0; } - if (errno == -ENOBUFS) { + if (errno == ENOBUFS) { RTE_LOG(ERR, EAL, "Peer cannot receive message %s\n", dst_path); return 0; @@ -466,7 +761,7 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg, int type) static int mp_send(struct rte_mp_msg *msg, const char *peer, int type) { - int ret = 0; + int dir_fd, ret = 0; DIR *mp_dir; struct dirent *ent; @@ -488,14 +783,32 @@ mp_send(struct rte_mp_msg *msg, const char *peer, int type) rte_errno = errno; return -1; } + + dir_fd = dirfd(mp_dir); + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + closedir(mp_dir); + return -1; + } + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + if (fnmatch(mp_filter, ent->d_name, 0) != 0) continue; - if (send_msg(ent->d_name, msg, type) < 0) + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + if (send_msg(path, msg, type) < 0) ret = -1; } + /* unlock the dir */ + flock(dir_fd, LOCK_UN); + /* dir_fd automatically closed on closedir */ closedir(mp_dir); return ret; } @@ -539,25 +852,75 @@ rte_mp_sendmsg(struct rte_mp_msg *msg) } static int -mp_request_one(const char *dst, struct rte_mp_msg *req, +mp_request_async(const char *dst, struct rte_mp_msg *req, + struct async_request_param *param) +{ + struct rte_mp_msg *reply_msg; + struct pending_request *pending_req, *exist; + int ret; + + pending_req = calloc(1, sizeof(*pending_req)); + reply_msg = calloc(1, sizeof(*reply_msg)); + if (pending_req == NULL || reply_msg == NULL) { + RTE_LOG(ERR, EAL, "Could not allocate space for sync request\n"); + rte_errno = ENOMEM; + ret = -1; + goto fail; + } + + pending_req->type = REQUEST_TYPE_ASYNC; + 
strlcpy(pending_req->dst, dst, sizeof(pending_req->dst)); + pending_req->request = req; + pending_req->reply = reply_msg; + pending_req->async.param = param; + + /* queue already locked by caller */ + + exist = find_pending_request(dst, req->name); + if (exist) { + RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name); + rte_errno = EEXIST; + ret = -1; + goto fail; + } + + ret = send_msg(dst, req, MP_REQ); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n", + dst, req->name); + ret = -1; + goto fail; + } else if (ret == 0) { + ret = 0; + goto fail; + } + TAILQ_INSERT_TAIL(&pending_requests.requests, pending_req, next); + + param->user_reply.nb_sent++; + + return 0; +fail: + free(pending_req); + free(reply_msg); + return ret; +} + +static int +mp_request_sync(const char *dst, struct rte_mp_msg *req, struct rte_mp_reply *reply, const struct timespec *ts) { int ret; - struct timeval now; struct rte_mp_msg msg, *tmp; - struct sync_request sync_req, *exist; - - sync_req.reply_received = 0; - strcpy(sync_req.dst, dst); - sync_req.request = req; - sync_req.reply = &msg; - pthread_cond_init(&sync_req.cond, NULL); - - pthread_mutex_lock(&sync_requests.lock); - exist = find_sync_request(dst, req->name); - if (!exist) - TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next); - pthread_mutex_unlock(&sync_requests.lock); + struct pending_request pending_req, *exist; + + pending_req.type = REQUEST_TYPE_SYNC; + pending_req.reply_received = 0; + strlcpy(pending_req.dst, dst, sizeof(pending_req.dst)); + pending_req.request = req; + pending_req.reply = &msg; + pthread_cond_init(&pending_req.sync.cond, NULL); + + exist = find_pending_request(dst, req->name); if (exist) { RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name); rte_errno = EEXIST; @@ -572,33 +935,31 @@ mp_request_one(const char *dst, struct rte_mp_msg *req, } else if (ret == 0) return 0; + TAILQ_INSERT_TAIL(&pending_requests.requests, &pending_req, next); + reply->nb_sent++; - pthread_mutex_lock(&sync_requests.lock); do { - pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts); - /* Check spurious wakeups */ - if (sync_req.reply_received == 1) - break; - /* Check if time is out */ - if (gettimeofday(&now, NULL) < 0) - break; - if (now.tv_sec < ts->tv_sec) - break; - else if (now.tv_sec == ts->tv_sec && - now.tv_usec * 1000 < ts->tv_nsec) - break; - } while (1); - /* We got the lock now */ - TAILQ_REMOVE(&sync_requests.requests, &sync_req, next); - pthread_mutex_unlock(&sync_requests.lock); + ret = pthread_cond_timedwait(&pending_req.sync.cond, + &pending_requests.lock, ts); + } while (ret != 0 && ret != ETIMEDOUT); + + TAILQ_REMOVE(&pending_requests.requests, &pending_req, next); - if (sync_req.reply_received == 0) { + if (pending_req.reply_received == 0) { RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n", dst, req->name); rte_errno = ETIMEDOUT; return -1; } + if (pending_req.reply_received == -1) { + RTE_LOG(DEBUG, EAL, "Asked to ignore response\n"); + /* not receiving this message is not an error, so decrement + * number of sent messages + */ + reply->nb_sent--; + return 0; + } tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1)); if (!tmp) { @@ -614,10 +975,10 @@ mp_request_one(const char *dst, struct rte_mp_msg *req, } int __rte_experimental -rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply, +rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, const struct timespec *ts) { - int ret = 0; + int dir_fd, ret = 0; DIR *mp_dir; struct 
dirent *ent; struct timeval now; @@ -642,8 +1003,12 @@ rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply, reply->msgs = NULL; /* for secondary process, send request to the primary process only */ - if (rte_eal_process_type() == RTE_PROC_SECONDARY) - return mp_request_one(eal_mp_socket_path(), req, reply, &end); + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + pthread_mutex_lock(&pending_requests.lock); + ret = mp_request_sync(eal_mp_socket_path(), req, reply, &end); + pthread_mutex_unlock(&pending_requests.lock); + return ret; + } /* for primary process, broadcast request, and collect reply 1 by 1 */ mp_dir = opendir(mp_dir_path); @@ -653,22 +1018,190 @@ rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply, return -1; } + dir_fd = dirfd(mp_dir); + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + closedir(mp_dir); + rte_errno = errno; + return -1; + } + + pthread_mutex_lock(&pending_requests.lock); while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + if (fnmatch(mp_filter, ent->d_name, 0) != 0) continue; - if (mp_request_one(ent->d_name, req, reply, &end)) + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + + /* unlocks the mutex while waiting for response, + * locks on receive + */ + if (mp_request_sync(path, req, reply, &end)) ret = -1; } + pthread_mutex_unlock(&pending_requests.lock); + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + /* dir_fd automatically closed on closedir */ closedir(mp_dir); return ret; } int __rte_experimental -rte_mp_reply(struct rte_mp_msg *msg, const char *peer) +rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, + rte_mp_async_reply_t clb) { + struct rte_mp_msg *copy; + struct pending_request *dummy; + struct async_request_param *param; + struct rte_mp_reply *reply; + int dir_fd, ret = 0; + DIR *mp_dir; + struct dirent *ent; + struct timeval now; + struct timespec *end; + bool dummy_used = false; + RTE_LOG(DEBUG, EAL, "request: %s\n", req->name); + + if (check_input(req) == false) + return -1; + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Faile to get current time\n"); + rte_errno = errno; + return -1; + } + copy = calloc(1, sizeof(*copy)); + dummy = calloc(1, sizeof(*dummy)); + param = calloc(1, sizeof(*param)); + if (copy == NULL || dummy == NULL || param == NULL) { + RTE_LOG(ERR, EAL, "Failed to allocate memory for async reply\n"); + rte_errno = ENOMEM; + goto fail; + } + + /* copy message */ + memcpy(copy, req, sizeof(*copy)); + + param->n_responses_processed = 0; + param->clb = clb; + end = ¶m->end; + reply = ¶m->user_reply; + + end->tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000; + end->tv_sec = now.tv_sec + ts->tv_sec + + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + + /* we have to lock the request queue here, as we will be adding a bunch + * of requests to the queue at once, and some of the replies may arrive + * before we add all of the requests to the queue. + */ + pthread_mutex_lock(&pending_requests.lock); + + /* we have to ensure that callback gets triggered even if we don't send + * anything, therefore earlier we have allocated a dummy request. fill + * it, and put it on the queue if we don't send any requests. 
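
For readers skimming this hunk: rte_mp_request_sync() blocks until every peer answers or the timeout expires, while the new rte_mp_request_async() queues the request and later invokes the supplied callback from the rte_mp_async control thread. The sketch below is illustrative only and not part of this patch; the rte_mp_msg field names and the rte_mp_async_reply_t prototype are assumed from the 18.05 public headers rather than shown in this diff.

/* Illustrative sketch only -- not part of this patch. Assumes the
 * rte_mp_msg layout and rte_mp_async_reply_t prototype declared in
 * rte_eal.h as of DPDK 18.05. */
#include <string.h>
#include <time.h>
#include <rte_eal.h>
#include <rte_log.h>

static int
ping_replied(const struct rte_mp_msg *request, const struct rte_mp_reply *reply)
{
	/* runs in the rte_mp_async thread once replies arrive or time out */
	RTE_LOG(INFO, USER1, "%s: %d of %d peers replied\n",
		request->name, reply->nb_received, reply->nb_sent);
	return 0;
}

static int
send_ping_async(void)
{
	struct rte_mp_msg msg;
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	memset(&msg, 0, sizeof(msg));
	strcpy(msg.name, "app_ping");	/* must fit RTE_MP_MAX_NAME_LEN */
	/* returns immediately; ping_replied() fires later */
	return rte_mp_request_async(&msg, &ts, ping_replied);
}

static int
send_ping_sync(struct rte_mp_reply *reply)
{
	struct rte_mp_msg msg;
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	memset(&msg, 0, sizeof(msg));
	strcpy(msg.name, "app_ping");
	/* blocks for up to ts; on success the caller frees reply->msgs */
	return rte_mp_request_sync(&msg, reply, &ts);
}
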
+ */ + dummy->type = REQUEST_TYPE_ASYNC; + dummy->request = copy; + dummy->reply = NULL; + dummy->async.param = param; + dummy->reply_received = 1; /* short-circuit the timeout */ + + /* for secondary process, send request to the primary process only */ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + ret = mp_request_async(eal_mp_socket_path(), copy, param); + + /* if we didn't send anything, put dummy request on the queue */ + if (ret == 0 && reply->nb_sent == 0) { + TAILQ_INSERT_TAIL(&pending_requests.requests, dummy, + next); + dummy_used = true; + } + + pthread_mutex_unlock(&pending_requests.lock); + + /* if we couldn't send anything, clean up */ + if (ret != 0) + goto fail; + return 0; + } + + /* for primary process, broadcast request */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path); + rte_errno = errno; + goto unlock_fail; + } + dir_fd = dirfd(mp_dir); + + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + goto closedir_fail; + } + + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + + if (mp_request_async(path, copy, param)) + ret = -1; + } + /* if we didn't send anything, put dummy request on the queue */ + if (ret == 0 && reply->nb_sent == 0) { + TAILQ_INSERT_HEAD(&pending_requests.requests, dummy, next); + dummy_used = true; + } + + /* trigger async request thread wake up */ + pthread_cond_signal(&pending_requests.async_cond); + + /* finally, unlock the queue */ + pthread_mutex_unlock(&pending_requests.lock); + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + + /* if dummy was unused, free it */ + if (!dummy_used) + free(dummy); + + return ret; +closedir_fail: + closedir(mp_dir); +unlock_fail: + pthread_mutex_unlock(&pending_requests.lock); +fail: + free(dummy); + free(param); + free(copy); + return -1; +} + +int __rte_experimental +rte_mp_reply(struct rte_mp_msg *msg, const char *peer) +{ RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name); if (check_input(msg) == false) diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c index 40902e49..42398630 100644 --- a/lib/librte_eal/common/eal_common_thread.c +++ b/lib/librte_eal/common/eal_common_thread.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,7 @@ #include #include +#include "eal_private.h" #include "eal_thread.h" RTE_DECLARE_PER_LCORE(unsigned , _socket_id); @@ -32,10 +34,7 @@ rte_lcore_has_role(unsigned int lcore_id, enum rte_lcore_role_t role) if (lcore_id >= RTE_MAX_LCORE) return -EINVAL; - if (cfg->lcore_role[lcore_id] == role) - return 0; - - return -EINVAL; + return cfg->lcore_role[lcore_id] == role; } int eal_cpuset_socket_id(rte_cpuset_t *cpusetp) @@ -140,3 +139,93 @@ exit: return ret; } + + +struct rte_thread_ctrl_params { + void *(*start_routine)(void *); + void *arg; + pthread_barrier_t configured; +}; + +static void *rte_thread_init(void *arg) +{ + int ret; + struct rte_thread_ctrl_params *params = arg; + void *(*start_routine)(void *) = params->start_routine; + void *routine_arg = params->arg; + + ret = pthread_barrier_wait(¶ms->configured); + if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + 
pthread_barrier_destroy(¶ms->configured); + free(params); + } + + return start_routine(routine_arg); +} + +__rte_experimental int +rte_ctrl_thread_create(pthread_t *thread, const char *name, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) +{ + struct rte_thread_ctrl_params *params; + unsigned int lcore_id; + rte_cpuset_t cpuset; + int cpu_found, ret; + + params = malloc(sizeof(*params)); + if (!params) + return -1; + + params->start_routine = start_routine; + params->arg = arg; + + pthread_barrier_init(¶ms->configured, NULL, 2); + + ret = pthread_create(thread, attr, rte_thread_init, (void *)params); + if (ret != 0) { + free(params); + return ret; + } + + if (name != NULL) { + ret = rte_thread_setname(*thread, name); + if (ret < 0) + goto fail; + } + + cpu_found = 0; + CPU_ZERO(&cpuset); + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (eal_cpu_detected(lcore_id) && + rte_lcore_has_role(lcore_id, ROLE_OFF)) { + CPU_SET(lcore_id, &cpuset); + cpu_found = 1; + } + } + /* if no detected cpu is off, use master core */ + if (!cpu_found) + CPU_SET(rte_get_master_lcore(), &cpuset); + + ret = pthread_setaffinity_np(*thread, sizeof(cpuset), &cpuset); + if (ret < 0) + goto fail; + + ret = pthread_barrier_wait(¶ms->configured); + if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + + return 0; + +fail: + if (PTHREAD_BARRIER_SERIAL_THREAD == + pthread_barrier_wait(¶ms->configured)) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + pthread_cancel(*thread); + pthread_join(*thread, NULL); + return ret; +} diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h index 4708dd54..364f38d1 100644 --- a/lib/librte_eal/common/eal_filesystem.h +++ b/lib/librte_eal/common/eal_filesystem.h @@ -22,13 +22,19 @@ #include #include "eal_internal_cfg.h" -static const char *default_config_dir = "/var/run"; +/* sets up platform-specific runtime data dir */ +int +eal_create_runtime_dir(void); + +/* returns runtime dir */ +const char * +eal_get_runtime_dir(void); static inline const char * eal_runtime_config_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - const char *directory = default_config_dir; + const char *directory = "/var/run"; const char *home_dir = getenv("HOME"); if (getuid() != 0 && home_dir != NULL) @@ -39,43 +45,50 @@ eal_runtime_config_path(void) } /** Path of primary/secondary communication unix socket file. */ -#define MP_SOCKET_PATH_FMT "%s/.%s_unix" +#define MP_SOCKET_FNAME "mp_socket" static inline const char * eal_mp_socket_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - const char *directory = default_config_dir; - const char *home_dir = getenv("HOME"); - if (getuid() != 0 && home_dir != NULL) - directory = home_dir; - snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT, - directory, internal_config.hugefile_prefix); + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + MP_SOCKET_FNAME); + return buffer; +} +#define FBARRAY_NAME_FMT "%s/fbarray_%s" +static inline const char * +eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) { + snprintf(buffer, buflen, FBARRAY_NAME_FMT, eal_get_runtime_dir(), name); return buffer; } /** Path of hugepage info file. 
*/ -#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info" - +#define HUGEPAGE_INFO_FNAME "hugepage_info" static inline const char * eal_hugepage_info_path(void) { static char buffer[PATH_MAX]; /* static so auto-zeroed */ - const char *directory = default_config_dir; - const char *home_dir = getenv("HOME"); - if (getuid() != 0 && home_dir != NULL) - directory = home_dir; - snprintf(buffer, sizeof(buffer) - 1, HUGEPAGE_INFO_FMT, directory, - internal_config.hugefile_prefix); + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + HUGEPAGE_INFO_FNAME); + return buffer; +} + +/** Path of hugepage data file. */ +#define HUGEPAGE_DATA_FNAME "hugepage_data" +static inline const char * +eal_hugepage_data_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(), + HUGEPAGE_DATA_FNAME); return buffer; } /** String format for hugepage map files. */ #define HUGEFILE_FMT "%s/%smap_%d" -#define TEMP_HUGEFILE_FMT "%s/%smap_temp_%d" - static inline const char * eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id) { @@ -85,6 +98,17 @@ eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id return buffer; } +/** String format for hugepage map lock files. */ +#define HUGEFILE_LOCK_FMT "%s/map_%d.lock" +static inline const char * +eal_get_hugefile_lock_path(char *buffer, size_t buflen, int f_id) +{ + snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, eal_get_runtime_dir(), + f_id); + buffer[buflen - 1] = '\0'; + return buffer; +} + /** define the default filename prefix for the %s values above */ #define HUGEFILE_PREFIX_DEFAULT "rte" diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h index 1d519bbb..4582f19c 100644 --- a/lib/librte_eal/common/eal_hugepages.h +++ b/lib/librte_eal/common/eal_hugepages.h @@ -22,14 +22,19 @@ struct hugepage_file { size_t size; /**< the page size */ int socket_id; /**< NUMA socket ID */ int file_id; /**< the '%d' in HUGEFILE_FMT */ - int memseg_id; /**< the memory segment to which page belongs */ char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */ }; /** - * Read the information from linux on what hugepages are available - * for the EAL to use + * Read the information on what hugepages are available for the EAL to use, + * clearing out any unused ones. */ int eal_hugepage_info_init(void); +/** + * Read whatever information primary process has shared about hugepages into + * secondary process. 
+ */ +int eal_hugepage_info_read(void); + #endif /* EAL_HUGEPAGES_H */ diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h index 1169fcc3..c4cbf3ac 100644 --- a/lib/librte_eal/common/eal_internal_cfg.h +++ b/lib/librte_eal/common/eal_internal_cfg.h @@ -21,9 +21,9 @@ */ struct hugepage_info { uint64_t hugepage_sz; /**< size of a huge page */ - const char *hugedir; /**< dir where hugetlbfs is mounted */ + char hugedir[PATH_MAX]; /**< dir where hugetlbfs is mounted */ uint32_t num_pages[RTE_MAX_NUMA_NODES]; - /**< number of hugepages of that size on each socket */ + /**< number of hugepages of that size on each socket */ int lock_descriptor; /**< file descriptor for hugepage dir */ }; @@ -47,6 +47,14 @@ struct internal_config { volatile unsigned force_sockets; volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */ uintptr_t base_virtaddr; /**< base address to try and reserve memory from */ + volatile unsigned legacy_mem; + /**< true to enable legacy memory behavior (no dynamic allocation, + * IOVA-contiguous segments). + */ + volatile unsigned single_file_segments; + /**< true if storing all pages within single files (per-page-size, + * per-node) non-legacy mode only. + */ volatile int syslog_facility; /**< facility passed to openlog() */ /** default interrupt mode for VFIO */ volatile enum rte_intr_mode vfio_intr_mode; @@ -56,6 +64,8 @@ struct internal_config { /**< user defined mbuf pool ops name */ unsigned num_hugepage_sizes; /**< how many sizes on this system */ struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES]; + volatile unsigned int init_complete; + /**< indicates whether EAL has completed initialization */ }; extern struct internal_config internal_config; /**< Global EAL configuration. */ diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h new file mode 100644 index 00000000..36bb1a02 --- /dev/null +++ b/lib/librte_eal/common/eal_memalloc.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef EAL_MEMALLOC_H +#define EAL_MEMALLOC_H + +#include + +#include +#include + +/* + * Allocate segment of specified page size. + */ +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket); + +/* + * Allocate `n_segs` segments. + * + * Note: `ms` can be NULL. + * + * Note: it is possible to request best-effort allocation by setting `exact` to + * `false`, in which case allocator will return however many pages it managed to + * allocate successfully. + */ +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact); + +/* + * Deallocate segment + */ +int +eal_memalloc_free_seg(struct rte_memseg *ms); + +/* + * Deallocate `n_segs` segments. Returns 0 on successful deallocation of all + * segments, returns -1 on error. Any segments that could have been deallocated, + * will be deallocated even in case of error. + */ +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs); + +/* + * Check if memory pointed to by `start` and of `length` that resides in + * memseg list `msl` is IOVA-contiguous. 
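
To put the new allocator interface in context: eal_memalloc.h is EAL-internal, and the bulk calls are what the malloc heap uses to grow and shrink at runtime when the legacy_mem / single_file_segments knobs added above are left off. A hedged sketch of a hypothetical internal caller, for illustration only:

/* Hypothetical internal caller (illustration only): request up to four
 * 2 MB pages on socket 0, accepting fewer since exact == false. */
#include <stdbool.h>
#include <rte_memory.h>
#include "eal_memalloc.h"

static int
grow_heap_example(void)
{
	struct rte_memseg *pages[4];
	int got;

	got = eal_memalloc_alloc_seg_bulk(pages, 4, RTE_PGSIZE_2M,
			0 /* socket */, false /* best effort */);
	if (got <= 0)
		return -1;

	/* ... hand the new segments to the malloc heap ... */

	/* give them back when the heap shrinks again */
	return eal_memalloc_free_seg_bulk(pages, got);
}
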
+ */ +bool +eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start, + size_t len); + +/* synchronize local memory map to primary process */ +int +eal_memalloc_sync_with_primary(void); + +int +eal_memalloc_mem_event_callback_register(const char *name, + rte_mem_event_callback_t clb, void *arg); + +int +eal_memalloc_mem_event_callback_unregister(const char *name, void *arg); + +void +eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start, + size_t len); + +int +eal_memalloc_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit); + +int +eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id); + +int +eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len); + +int +eal_memalloc_init(void); + +#endif /* EAL_MEMALLOC_H */ diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h index e86c7114..211ae06a 100644 --- a/lib/librte_eal/common/eal_options.h +++ b/lib/librte_eal/common/eal_options.h @@ -55,6 +55,10 @@ enum { OPT_VFIO_INTR_NUM, #define OPT_VMWARE_TSC_MAP "vmware-tsc-map" OPT_VMWARE_TSC_MAP_NUM, +#define OPT_LEGACY_MEM "legacy-mem" + OPT_LEGACY_MEM_NUM, +#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments" + OPT_SINGLE_FILE_SEGMENTS_NUM, OPT_LONG_MAX_NUM }; diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h index 0b287700..bdadc4d5 100644 --- a/lib/librte_eal/common/eal_private.h +++ b/lib/librte_eal/common/eal_private.h @@ -9,6 +9,8 @@ #include #include +#include + /** * Initialize the memzone subsystem (private to eal). * @@ -80,6 +82,12 @@ int rte_eal_timer_init(void); */ int rte_eal_log_init(const char *id, int facility); +/** + * Save the log regexp for later + */ +int rte_log_save_regexp(const char *type, int priority); +int rte_log_save_pattern(const char *pattern, int priority); + /** * Init tail queues for non-EAL library structures. This is to allow * the rings, mempools, etc. lists to be shared among multiple processes @@ -126,6 +134,39 @@ int rte_eal_alarm_init(void); */ int rte_eal_check_module(const char *module_name); +/** + * Get virtual area of specified size from the OS. + * + * This function is private to the EAL. + * + * @param requested_addr + * Address where to request address space. + * @param size + * Size of requested area. + * @param page_sz + * Page size on which to align requested virtual area. + * @param flags + * EAL_VIRTUAL_AREA_* flags. + * @param mmap_flags + * Extra flags passed directly to mmap(). + * + * @return + * Virtual area address if successful. + * NULL if unsuccessful. + */ + +#define EAL_VIRTUAL_AREA_ADDR_IS_HINT (1 << 0) +/**< don't fail if cannot get exact requested address. */ +#define EAL_VIRTUAL_AREA_ALLOW_SHRINK (1 << 1) +/**< try getting smaller sized (decrement by page size) virtual areas if cannot + * get area of requested size. + */ +#define EAL_VIRTUAL_AREA_UNMAP (1 << 2) +/**< immediately unmap reserved virtual area. */ +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags); + /** * Get cpu core_id. * @@ -205,4 +246,16 @@ struct rte_bus *rte_bus_find_by_device_name(const char *str); int rte_mp_channel_init(void); +/** + * Internal Executes all the user application registered callbacks for + * the specific device. It is for DPDK internal user only. User + * application should not call it directly. + * + * @param device_name + * The device name. + * @param event + * the device event type. 
+ */ +void dev_callback_process(char *device_name, enum rte_dev_event_type event); + #endif /* _EAL_PRIVATE_H_ */ diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic.h b/lib/librte_eal/common/include/arch/arm/rte_atomic.h index f3f3b6e3..40e14e56 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_atomic.h +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic.h @@ -1,33 +1,5 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_ATOMIC_ARM_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_32.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_32.h index d2b7fa20..859562e5 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_32.h @@ -1,33 +1,5 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_ATOMIC_ARM32_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_byteorder.h b/lib/librte_eal/common/include/arch/arm/rte_byteorder.h index 8af0a39a..9ec4a975 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_byteorder.h +++ b/lib/librte_eal/common/include/arch/arm/rte_byteorder.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_BYTEORDER_ARM_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_cpuflags.h b/lib/librte_eal/common/include/arch/arm/rte_cpuflags.h index b8f62889..022e7da5 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_cpuflags.h +++ b/lib/librte_eal/common/include/arch/arm/rte_cpuflags.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_CPUFLAGS_ARM_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_cpuflags_32.h b/lib/librte_eal/common/include/arch/arm/rte_cpuflags_32.h index eb02d9b9..b5347be1 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_cpuflags_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_cpuflags_32.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. 
*/ #ifndef _RTE_CPUFLAGS_ARM32_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_cycles.h b/lib/librte_eal/common/include/arch/arm/rte_cycles.h index a8009a06..e8ffa894 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_cycles.h +++ b/lib/librte_eal/common/include/arch/arm/rte_cycles.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_CYCLES_ARM_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h index 9c1be71e..c4f974fe 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_CYCLES_ARM32_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy.h index 1d562c3f..47dea9a8 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_MEMCPY_ARM_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h index e4dafda1..eb02c3b4 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_MEMCPY_ARM32_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_prefetch.h b/lib/librte_eal/common/include/arch/arm/rte_prefetch.h index aa37de57..27870c2a 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_prefetch.h +++ b/lib/librte_eal/common/include/arch/arm/rte_prefetch.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. 
*/ #ifndef _RTE_PREFETCH_ARM_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h b/lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h index 43cde172..e53420a0 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h +++ b/lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_PREFETCH_ARM32_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_rwlock.h b/lib/librte_eal/common/include/arch/arm/rte_rwlock.h index 664bec88..18bb37b0 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_rwlock.h +++ b/lib/librte_eal/common/include/arch/arm/rte_rwlock.h @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: BSD-3-Clause + */ /* copied from ppc_64 */ #ifndef _RTE_RWLOCK_ARM_H_ diff --git a/lib/librte_eal/common/include/arch/arm/rte_spinlock.h b/lib/librte_eal/common/include/arch/arm/rte_spinlock.h index 396a42e8..1a6916b6 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_spinlock.h +++ b/lib/librte_eal/common/include/arch/arm/rte_spinlock.h @@ -1,33 +1,5 @@ -/* - * BSD LICENSE - * - * Copyright(c) 2015 RehiveTech. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of RehiveTech nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015 RehiveTech. All rights reserved. */ #ifndef _RTE_SPINLOCK_ARM_H_ diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h b/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h index 39fce7b9..ce38350b 100644 --- a/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h +++ b/lib/librte_eal/common/include/arch/ppc_64/rte_atomic.h @@ -55,7 +55,7 @@ extern "C" { * Guarantees that the LOAD and STORE operations generated before the * barrier occur before the LOAD and STORE operations generated after. */ -#define rte_mb() {asm volatile("sync" : : : "memory"); } +#define rte_mb() asm volatile("sync" : : : "memory") /** * Write memory barrier. @@ -136,6 +136,12 @@ static inline int rte_atomic16_dec_and_test(rte_atomic16_t *v) return __atomic_sub_fetch(&v->cnt, 1, __ATOMIC_ACQUIRE) == 0; } +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ + return __atomic_exchange_2(dst, val, __ATOMIC_SEQ_CST); +} + /*------------------------- 32 bit atomic operations -------------------------*/ static inline int @@ -237,6 +243,13 @@ static inline int rte_atomic32_dec_and_test(rte_atomic32_t *v) return ret == 0; } + +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ + return __atomic_exchange_4(dst, val, __ATOMIC_SEQ_CST); +} + /*------------------------- 64 bit atomic operations -------------------------*/ static inline int @@ -431,7 +444,6 @@ static inline int rte_atomic64_test_and_set(rte_atomic64_t *v) { return rte_atomic64_cmpset((volatile uint64_t *)&v->cnt, 0, 1); } - /** * Atomically set a 64-bit counter to 0. 
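
The exchange helpers added here (and their generic counterparts later in this patch) return the value that was previously stored, which makes single-word hand-offs trivial. A small illustrative use, not part of the patch:

#include <stdint.h>
#include <rte_atomic.h>

/* Publish a new command word and learn the old one in a single atomic
 * step -- e.g. a one-slot "latest command" mailbox shared with another
 * lcore. */
static uint32_t
post_command(volatile uint32_t *mailbox, uint32_t cmd)
{
	return rte_atomic32_exchange(mailbox, cmd);
}
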
* @@ -442,6 +454,13 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v) { v->cnt = 0; } + +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val) +{ + return __atomic_exchange_4(dst, val, __ATOMIC_SEQ_CST); +} + #endif #ifdef __cplusplus diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_rwlock.h b/lib/librte_eal/common/include/arch/ppc_64/rte_rwlock.h index de8af19e..9fadc040 100644 --- a/lib/librte_eal/common/include/arch/ppc_64/rte_rwlock.h +++ b/lib/librte_eal/common/include/arch/ppc_64/rte_rwlock.h @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: BSD-3-Clause + */ #ifndef _RTE_RWLOCK_PPC_64_H_ #define _RTE_RWLOCK_PPC_64_H_ diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/lib/librte_eal/common/include/arch/x86/rte_atomic.h index 5cfd3832..148398f5 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_atomic.h +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic.h @@ -104,6 +104,18 @@ rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) return res; } +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ + asm volatile( + MPLOCKED + "xchgw %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + static inline int rte_atomic16_test_and_set(rte_atomic16_t *v) { return rte_atomic16_cmpset((volatile uint16_t *)&v->cnt, 0, 1); @@ -178,6 +190,18 @@ rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) return res; } +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ + asm volatile( + MPLOCKED + "xchgl %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + static inline int rte_atomic32_test_and_set(rte_atomic32_t *v) { return rte_atomic32_cmpset((volatile uint32_t *)&v->cnt, 0, 1); diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_32.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_32.h index fb3abf18..a932f354 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_32.h +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_32.h @@ -98,6 +98,18 @@ rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) return res; } +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dest, uint64_t val) +{ + uint64_t old; + + do { + old = *dest; + } while (rte_atomic64_cmpset(dest, old, val) == 0); + + return old; +} + static inline void rte_atomic64_init(rte_atomic64_t *v) { diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h index 1a53a766..fd2ec9c5 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h @@ -71,6 +71,18 @@ rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) return res; } +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val) +{ + asm volatile( + MPLOCKED + "xchgq %0, %1;" + : "=r" (val), "=m" (*dst) + : "0" (val), "m" (*dst) + : "memory"); /* no-clobber list */ + return val; +} + static inline void rte_atomic64_init(rte_atomic64_t *v) { diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h index cc140ecc..7b758094 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -52,7 +52,7 @@ rte_memcpy(void *dst, const void 
*src, size_t n); * Copy 16 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t *src) { __m128i xmm0; @@ -65,7 +65,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src) * Copy 32 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { __m256i ymm0; @@ -78,7 +78,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src) * Copy 64 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { __m512i zmm0; @@ -91,7 +91,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src) * Copy 128 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t *src) { rte_mov64(dst + 0 * 64, src + 0 * 64); @@ -102,7 +102,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src) * Copy 256 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { rte_mov64(dst + 0 * 64, src + 0 * 64); @@ -293,7 +293,7 @@ COPY_BLOCK_128_BACK63: * Copy 16 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t *src) { __m128i xmm0; @@ -306,7 +306,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src) * Copy 32 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { __m256i ymm0; @@ -319,7 +319,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src) * Copy 64 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); @@ -486,7 +486,7 @@ COPY_BLOCK_128_BACK31: * Copy 16 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t *src) { __m128i xmm0; @@ -499,7 +499,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src) * Copy 32 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); @@ -510,7 +510,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src) * Copy 64 bytes from one location to another, * locations should not overlap. 
*/ -static inline void +static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); @@ -574,7 +574,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src) */ #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \ __extension__ ({ \ - int tmp; \ + size_t tmp; \ while (len >= 128 + 16 - offset) { \ xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \ len -= 128; \ diff --git a/lib/librte_eal/common/include/arch/x86/rte_spinlock.h b/lib/librte_eal/common/include/arch/x86/rte_spinlock.h index 4b16887e..60321da0 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_spinlock.h +++ b/lib/librte_eal/common/include/arch/x86/rte_spinlock.h @@ -76,10 +76,12 @@ static inline int rte_tm_supported(void) static inline int rte_try_tm(volatile int *lock) { + int retries; + if (!rte_rtm_supported) return 0; - int retries = RTE_RTM_MAX_RETRIES; + retries = RTE_RTM_MAX_RETRIES; while (likely(retries--)) { diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h index 50e1b8a4..b99ba468 100644 --- a/lib/librte_eal/common/include/generic/rte_atomic.h +++ b/lib/librte_eal/common/include/generic/rte_atomic.h @@ -190,6 +190,36 @@ rte_atomic16_cmpset(volatile uint16_t *dst, uint16_t exp, uint16_t src) } #endif +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. + * @return + * The original value at that location + */ +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint16_t +rte_atomic16_exchange(volatile uint16_t *dst, uint16_t val) +{ +#if defined(RTE_ARCH_ARM64) && defined(RTE_TOOLCHAIN_CLANG) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_2(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + /** * The atomic counter structure. */ @@ -443,6 +473,36 @@ rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src) } #endif +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. + * @return + * The original value at that location + */ +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint32_t +rte_atomic32_exchange(volatile uint32_t *dst, uint32_t val) +{ +#if defined(RTE_ARCH_ARM64) && defined(RTE_TOOLCHAIN_CLANG) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_4(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + /** * The atomic counter structure. */ @@ -695,6 +755,36 @@ rte_atomic64_cmpset(volatile uint64_t *dst, uint64_t exp, uint64_t src) } #endif +/** + * Atomic exchange. + * + * (atomic) equivalent to: + * ret = *dst + * *dst = val; + * return ret; + * + * @param dst + * The destination location into which the value will be written. + * @param val + * The new value. 
+ * @return + * The original value at that location + */ +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val); + +#ifdef RTE_FORCE_INTRINSICS +static inline uint64_t +rte_atomic64_exchange(volatile uint64_t *dst, uint64_t val) +{ +#if defined(RTE_ARCH_ARM64) && defined(RTE_TOOLCHAIN_CLANG) + return __atomic_exchange_n(dst, val, __ATOMIC_SEQ_CST); +#else + return __atomic_exchange_8(dst, val, __ATOMIC_SEQ_CST); +#endif +} +#endif + /** * The atomic counter structure. */ diff --git a/lib/librte_eal/common/include/generic/rte_byteorder.h b/lib/librte_eal/common/include/generic/rte_byteorder.h index 9bed85cc..7d9a1463 100644 --- a/lib/librte_eal/common/include/generic/rte_byteorder.h +++ b/lib/librte_eal/common/include/generic/rte_byteorder.h @@ -123,7 +123,7 @@ typedef uint64_t rte_le64_t; /**< 64-bit little-endian value. */ static inline uint16_t rte_constant_bswap16(uint16_t x) { - return RTE_STATIC_BSWAP16(x); + return (uint16_t)RTE_STATIC_BSWAP16(x); } /* @@ -135,7 +135,7 @@ rte_constant_bswap16(uint16_t x) static inline uint32_t rte_constant_bswap32(uint32_t x) { - return RTE_STATIC_BSWAP32(x); + return (uint32_t)RTE_STATIC_BSWAP32(x); } /* @@ -147,7 +147,7 @@ rte_constant_bswap32(uint32_t x) static inline uint64_t rte_constant_bswap64(uint64_t x) { - return RTE_STATIC_BSWAP64(x); + return (uint64_t)RTE_STATIC_BSWAP64(x); } diff --git a/lib/librte_eal/common/include/generic/rte_cpuflags.h b/lib/librte_eal/common/include/generic/rte_cpuflags.h index 8d31687d..156ea002 100644 --- a/lib/librte_eal/common/include/generic/rte_cpuflags.h +++ b/lib/librte_eal/common/include/generic/rte_cpuflags.h @@ -64,4 +64,25 @@ rte_cpu_check_supported(void); int rte_cpu_is_supported(void); +/** + * This function attempts to retrieve a value from the auxiliary vector. + * If it is unsuccessful, the result will be 0, and errno will be set. + * + * @return A value from the auxiliary vector. When the value is 0, check + * errno to determine if an error occurred. + */ +unsigned long +rte_cpu_getauxval(unsigned long type); + +/** + * This function retrieves a value from the auxiliary vector, and compares it + * as a string against the value retrieved. + * + * @return The result of calling strcmp() against the value retrieved from + * the auxiliary vector. When the value is 0 (meaning a match is found), + * check errno to determine if an error occurred. + */ +int +rte_cpu_strcmp_auxval(unsigned long type, const char *str); + #endif /* _RTE_CPUFLAGS_H_ */ diff --git a/lib/librte_eal/common/include/generic/rte_rwlock.h b/lib/librte_eal/common/include/generic/rte_rwlock.h index 899e9bc4..5751a0e6 100644 --- a/lib/librte_eal/common/include/generic/rte_rwlock.h +++ b/lib/librte_eal/common/include/generic/rte_rwlock.h @@ -71,7 +71,7 @@ rte_rwlock_read_lock(rte_rwlock_t *rwl) continue; } success = rte_atomic32_cmpset((volatile uint32_t *)&rwl->cnt, - x, x + 1); + (uint32_t)x, (uint32_t)(x + 1)); } } @@ -107,7 +107,7 @@ rte_rwlock_write_lock(rte_rwlock_t *rwl) continue; } success = rte_atomic32_cmpset((volatile uint32_t *)&rwl->cnt, - 0, -1); + 0, (uint32_t)-1); } } diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h index 6fb08341..eb9eded4 100644 --- a/lib/librte_eal/common/include/rte_bus.h +++ b/lib/librte_eal/common/include/rte_bus.h @@ -325,7 +325,7 @@ enum rte_iova_mode rte_bus_get_iommu_class(void); * The constructor has higher priority than PMD constructors. 
*/ #define RTE_REGISTER_BUS(nm, bus) \ -RTE_INIT_PRIO(businitfn_ ##nm, 110); \ +RTE_INIT_PRIO(businitfn_ ##nm, BUS); \ static void businitfn_ ##nm(void) \ {\ (bus).name = RTE_STR(nm);\ diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h index c7803e41..434adfd4 100644 --- a/lib/librte_eal/common/include/rte_common.h +++ b/lib/librte_eal/common/include/rte_common.h @@ -81,16 +81,12 @@ typedef uint16_t unaligned_uint16_t; */ #define RTE_SET_USED(x) (void)(x) -/** - * Run function before main() with low priority. - * - * The constructor will be run after prioritized constructors. - * - * @param func - * Constructor function. - */ -#define RTE_INIT(func) \ -static void __attribute__((constructor, used)) func(void) +#define RTE_PRIORITY_LOG 101 +#define RTE_PRIORITY_BUS 110 +#define RTE_PRIORITY_LAST 65535 + +#define RTE_PRIO(prio) \ + RTE_PRIORITY_ ## prio /** * Run function before main() with high priority. @@ -102,7 +98,18 @@ static void __attribute__((constructor, used)) func(void) * Lowest number is the first to run. */ #define RTE_INIT_PRIO(func, prio) \ -static void __attribute__((constructor(prio), used)) func(void) +static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void) + +/** + * Run function before main() with low priority. + * + * The constructor will be run after prioritized constructors. + * + * @param func + * Constructor function. + */ +#define RTE_INIT(func) \ + RTE_INIT_PRIO(func, LAST) /** * Force a function to be inlined @@ -117,7 +124,7 @@ static void __attribute__((constructor(prio), used)) func(void) /*********** Macros for pointer arithmetic ********/ /** - * add a byte-value offset from a pointer + * add a byte-value offset to a pointer */ #define RTE_PTR_ADD(ptr, x) ((void*)((uintptr_t)(ptr) + (x))) @@ -190,6 +197,22 @@ static void __attribute__((constructor(prio), used)) func(void) */ #define RTE_ALIGN(val, align) RTE_ALIGN_CEIL(val, align) +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no lower + * than the first parameter. + */ +#define RTE_ALIGN_MUL_CEIL(v, mul) \ + (((v + (typeof(v))(mul) - 1) / ((typeof(v))(mul))) * (typeof(v))(mul)) + +/** + * Macro to align a value to the multiple of given value. The resultant + * value will be of the same type as the first parameter and will be no higher + * than the first parameter. + */ +#define RTE_ALIGN_MUL_FLOOR(v, mul) \ + ((v / ((typeof(v))(mul))) * (typeof(v))(mul)) + /** * Checks if a pointer is aligned to a given power-of-two value * @@ -223,6 +246,51 @@ extern int RTE_BUILD_BUG_ON_detected_error; } while(0) #endif +/** + * Combines 32b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param x + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. + */ +static inline uint32_t +rte_combine32ms1b(register uint32_t x) +{ + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + + return x; +} + +/** + * Combines 64b inputs most significant set bits into the least + * significant bits to construct a value with the same MSBs as x + * but all 1's under it. + * + * @param v + * The integer whose MSBs need to be combined with its LSBs + * @return + * The combined value. 
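RTE_ALIGN_MUL_CEIL() and RTE_ALIGN_MUL_FLOOR() added above round a value to a multiple of an arbitrary (not necessarily power-of-two) value. A small worked sketch of the expected results:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <rte_common.h>

int
main(void)
{
	uint64_t len = 1000;

	/* Round up and down to a multiple of 384 (not a power of two). */
	printf("%" PRIu64 "\n", RTE_ALIGN_MUL_CEIL(len, 384));  /* 1152 */
	printf("%" PRIu64 "\n", RTE_ALIGN_MUL_FLOOR(len, 384)); /* 768 */
	return 0;
}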
+ */ +static inline uint64_t +rte_combine64ms1b(register uint64_t v) +{ + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + + return v; +} + /*********** Macros to work with powers of 2 ********/ /** @@ -250,15 +318,28 @@ static inline uint32_t rte_align32pow2(uint32_t x) { x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; + x = rte_combine32ms1b(x); return x + 1; } +/** + * Aligns input parameter to the previous power of 2 + * + * @param x + * The integer value to algin + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint32_t +rte_align32prevpow2(uint32_t x) +{ + x = rte_combine32ms1b(x); + + return x - (x >> 1); +} + /** * Aligns 64b input parameter to the next power of 2 * @@ -272,16 +353,28 @@ static inline uint64_t rte_align64pow2(uint64_t v) { v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v |= v >> 32; + v = rte_combine64ms1b(v); return v + 1; } +/** + * Aligns 64b input parameter to the previous power of 2 + * + * @param v + * The 64b value to align + * + * @return + * Input parameter aligned to the previous power of 2 + */ +static inline uint64_t +rte_align64prevpow2(uint64_t v) +{ + v = rte_combine64ms1b(v); + + return v - (v >> 1); +} + /*********** Macros for calculating min and max **********/ /** @@ -320,7 +413,7 @@ rte_align64pow2(uint64_t v) static inline uint32_t rte_bsf32(uint32_t v) { - return __builtin_ctz(v); + return (uint32_t)__builtin_ctz(v); } /** diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h index b688f1ef..3879ff3c 100644 --- a/lib/librte_eal/common/include/rte_dev.h +++ b/lib/librte_eal/common/include/rte_dev.h @@ -24,6 +24,25 @@ extern "C" { #include #include +/** + * The device event type. + */ +enum rte_dev_event_type { + RTE_DEV_EVENT_ADD, /**< device being added */ + RTE_DEV_EVENT_REMOVE, /**< device being removed */ + RTE_DEV_EVENT_MAX /**< max value of this enum */ +}; + +struct rte_dev_event { + enum rte_dev_event_type type; /**< device event type */ + int subsystem; /**< subsystem id */ + char *devname; /**< device name */ +}; + +typedef void (*rte_dev_event_cb_fn)(char *device_name, + enum rte_dev_event_type event, + void *cb_arg); + __attribute__((format(printf, 2, 0))) static inline void rte_pmd_debug_trace(const char *func_name, const char *fmt, ...) @@ -32,15 +51,18 @@ rte_pmd_debug_trace(const char *func_name, const char *fmt, ...) va_start(ap, fmt); - char buffer[vsnprintf(NULL, 0, fmt, ap) + 1]; + { + char buffer[vsnprintf(NULL, 0, fmt, ap) + 1]; - va_end(ap); + va_end(ap); - va_start(ap, fmt); - vsnprintf(buffer, sizeof(buffer), fmt, ap); - va_end(ap); + va_start(ap, fmt); + vsnprintf(buffer, sizeof(buffer), fmt, ap); + va_end(ap); - rte_log(RTE_LOG_ERR, RTE_LOGTYPE_PMD, "%s: %s", func_name, buffer); + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_PMD, "%s: %s", + func_name, buffer); + } } /* @@ -267,4 +289,78 @@ __attribute__((used)) = str } #endif +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * It registers the callback for the specific device. + * Multiple callbacks cal be registered at the same time. + * + * @param device_name + * The device name, that is the param name of the struct rte_device, + * null value means for all devices. + * @param cb_fn + * callback address. + * @param cb_arg + * address of parameter for callback. + * + * @return + * - On success, zero. + * - On failure, a negative value. 
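A minimal sketch of registering a hotplug callback with the device event API declared in this hunk; the callback and function names are illustrative:

#include <stdio.h>
#include <rte_dev.h>

/* Matches the rte_dev_event_cb_fn typedef above. */
static void
hotplug_cb(char *device_name, enum rte_dev_event_type event, void *cb_arg)
{
	(void)cb_arg;
	printf("device %s: %s\n", device_name,
		event == RTE_DEV_EVENT_ADD ? "added" : "removed");
}

static int
setup_hotplug_monitoring(void)
{
	/* A NULL device name registers the callback for all devices. */
	if (rte_dev_event_callback_register(NULL, hotplug_cb, NULL) < 0)
		return -1;
	return rte_dev_event_monitor_start();
}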
+ */ +int __rte_experimental +rte_dev_event_callback_register(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * It unregisters the callback according to the specified device. + * + * @param device_name + * The device name, that is the param name of the struct rte_device, + * null value means for all devices and their callbacks. + * @param cb_fn + * callback address. + * @param cb_arg + * address of parameter for callback, (void *)-1 means to remove all + * registered which has the same callback address. + * + * @return + * - On success, return the number of callback entities removed. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_event_callback_unregister(const char *device_name, + rte_dev_event_cb_fn cb_fn, + void *cb_arg); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Start the device event monitoring. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_event_monitor_start(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Stop the device event monitoring. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int __rte_experimental +rte_dev_event_monitor_stop(void); + #endif /* _RTE_DEV_H_ */ diff --git a/lib/librte_eal/common/include/rte_devargs.h b/lib/librte_eal/common/include/rte_devargs.h index 84e5e23c..58fbd90a 100644 --- a/lib/librte_eal/common/include/rte_devargs.h +++ b/lib/librte_eal/common/include/rte_devargs.h @@ -59,13 +59,8 @@ struct rte_devargs { char *args; }; -/** user device double-linked queue type definition */ -TAILQ_HEAD(rte_devargs_list, rte_devargs); - -/** Global list of user devices */ -extern struct rte_devargs_list devargs_list; - /** + * @deprecated * Parse a devargs string. * * For PCI devices, the format of arguments string is "PCI_ADDR" or @@ -90,6 +85,7 @@ extern struct rte_devargs_list devargs_list; * - 0 on success * - A negative value on error */ +__rte_deprecated int rte_eal_parse_devargs_str(const char *devargs_str, char **drvname, char **drvargs); @@ -100,18 +96,37 @@ int rte_eal_parse_devargs_str(const char *devargs_str, * in argument. Store which bus will handle the device, its name * and the eventual device parameters. * - * @param dev - * The device declaration string. + * The device string is built with a printf-like syntax. + * + * The syntax is: + * + * bus:device_identifier,arg1=val1,arg2=val2 + * + * where "bus:" is the bus name followed by any character separator. + * The bus name is optional. If no bus name is specified, each bus + * will attempt to recognize the device identifier. The first one + * to succeed will be used. + * + * Examples: + * + * pci:0000:05.00.0,arg=val + * 05.00.0,arg=val + * vdev:net_ring0 + * * @param da * The devargs structure holding the device information. + * @param format + * Format string describing a device. * * @return * - 0 on success. * - Negative errno on error. */ -int __rte_experimental -rte_eal_devargs_parse(const char *dev, - struct rte_devargs *da); +__rte_experimental +int +rte_devargs_parse(struct rte_devargs *da, + const char *format, ...) +__attribute__((format(printf, 2, 0))); /** * Insert an rte_devargs in the global list. @@ -123,21 +138,30 @@ rte_eal_devargs_parse(const char *dev, * - 0 on success * - Negative on error. 
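rte_devargs_parse() takes a printf-like format that expands to the "bus:device_identifier,key=value" string described above. A minimal sketch, with an illustrative PCI address and argument:

#include <string.h>
#include <rte_devargs.h>

static int
parse_example(void)
{
	struct rte_devargs da;

	memset(&da, 0, sizeof(da));
	/* Illustrative device string using the documented syntax. */
	return rte_devargs_parse(&da, "%s", "pci:0000:05:00.0,arg=val");
}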
*/ -int __rte_experimental -rte_eal_devargs_insert(struct rte_devargs *da); +__rte_experimental +int +rte_devargs_insert(struct rte_devargs *da); /** * Add a device to the user device list + * See rte_devargs_parse() for details. * - * For PCI devices, the format of arguments string is "PCI_ADDR" or - * "PCI_ADDR,key=val,key2=val2,...". Examples: "08:00.1", "0000:5:00.0", - * "04:00.0,arg=val". + * @param devtype + * The type of the device. + * @param devargs_str + * The arguments as given by the user. * - * For virtual devices, the format of arguments string is "DRIVER_NAME*" - * or "DRIVER_NAME*,key=val,key2=val2,...". Examples: "net_ring", - * "net_ring0", "net_pmdAnything,arg=0:arg2=1". The validity of the - * driver name is not checked by this function, it is done when probing - * the drivers. + * @return + * - 0 on success + * - A negative value on error + */ +__rte_experimental +int rte_devargs_add(enum rte_devtype devtype, const char *devargs_str); + +/** + * @deprecated + * Add a device to the user device list + * See rte_devargs_parse() for details. * * @param devtype * The type of the device. @@ -148,6 +172,7 @@ rte_eal_devargs_insert(struct rte_devargs *da); * - 0 on success * - A negative value on error */ +__rte_deprecated int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); /** @@ -166,8 +191,9 @@ int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str); * <0 on error. * >0 if the devargs was not within the user device list. */ -int __rte_experimental rte_eal_devargs_remove(const char *busname, - const char *devname); +__rte_experimental +int rte_devargs_remove(const char *busname, + const char *devname); /** * Count the number of user devices of a specified type @@ -178,6 +204,21 @@ int __rte_experimental rte_eal_devargs_remove(const char *busname, * @return * The number of devices. */ +__rte_experimental +unsigned int +rte_devargs_type_count(enum rte_devtype devtype); + +/** + * @deprecated + * Count the number of user devices of a specified type + * + * @param devtype + * The type of the devices to counted. + * + * @return + * The number of devices. + */ +__rte_deprecated unsigned int rte_eal_devargs_type_count(enum rte_devtype devtype); @@ -187,8 +228,47 @@ rte_eal_devargs_type_count(enum rte_devtype devtype); * @param f * A pointer to a file for output */ +__rte_experimental +void rte_devargs_dump(FILE *f); + +/** + * @deprecated + * This function dumps the list of user device and their arguments. + * + * @param f + * A pointer to a file for output + */ +__rte_deprecated void rte_eal_devargs_dump(FILE *f); +/** + * Find next rte_devargs matching the provided bus name. + * + * @param busname + * Limit the iteration to devargs related to buses + * matching this name. + * Will return any next rte_devargs if NULL. + * + * @param start + * Starting iteration point. The iteration will start at + * the first rte_devargs if NULL. + * + * @return + * Next rte_devargs entry matching the requested bus, + * NULL if there is none. + */ +__rte_experimental +struct rte_devargs * +rte_devargs_next(const char *busname, const struct rte_devargs *start); + +/** + * Iterate over all rte_devargs for a specific bus. 
+ */ +#define RTE_EAL_DEVARGS_FOREACH(busname, da) \ + for (da = rte_devargs_next(busname, NULL); \ + da != NULL; \ + da = rte_devargs_next(busname, da)) \ + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h index 044474e6..8de5d69e 100644 --- a/lib/librte_eal/common/include/rte_eal.h +++ b/lib/librte_eal/common/include/rte_eal.h @@ -57,6 +57,8 @@ enum rte_proc_type_t { struct rte_config { uint32_t master_lcore; /**< Id of the master lcore */ uint32_t lcore_count; /**< Number of available logical cores. */ + uint32_t numa_node_count; /**< Number of detected NUMA nodes. */ + uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; /**< List of detected NUMA nodes. */ uint32_t service_lcore_count;/**< Number of available service cores. */ enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */ @@ -229,6 +231,16 @@ struct rte_mp_reply { */ typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer); +/** + * Asynchronous reply function typedef used by other components. + * + * As we create socket channel for primary/secondary communication, use + * this function typedef to register action for coming responses to asynchronous + * requests. + */ +typedef int (*rte_mp_async_reply_t)(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); + /** * @warning * @b EXPERIMENTAL: this API may change without prior notice @@ -314,9 +326,35 @@ rte_mp_sendmsg(struct rte_mp_msg *msg); * - On failure, return -1, and the reason will be stored in rte_errno. */ int __rte_experimental -rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply, +rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, const struct timespec *ts); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Send a request to the peer process and expect a reply in a separate callback. + * + * This function sends a request message to the peer process, and will not + * block. Instead, reply will be received in a separate callback. + * + * @param req + * The req argument contains the customized request message. + * + * @param ts + * The ts argument specifies how long we can wait for the peer(s) to reply. + * + * @param clb + * The callback to trigger when all responses for this request have arrived. + * + * @return + * - On success, return 0. + * - On failure, return -1, and the reason will be stored in rte_errno. + */ +int __rte_experimental +rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, + rte_mp_async_reply_t clb); + /** * @warning * @b EXPERIMENTAL: this API may change without prior notice @@ -464,11 +502,13 @@ const char * __rte_experimental rte_eal_mbuf_user_pool_ops(void); /** + * @deprecated * Get default pool ops name for mbuf * * @return * returns default pool ops name. 
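A sketch of an asynchronous multi-process request using rte_mp_request_async() from this hunk; the action name is illustrative and assumes the peer registered a matching handler with rte_mp_action_register(), and the rte_mp_msg field layout is taken from rte_eal.h rather than shown here:

#include <string.h>
#include <time.h>
#include <rte_eal.h>

/* Reply callback: invoked once all peers have answered or the timeout hits. */
static int
handle_reply(const struct rte_mp_msg *request, const struct rte_mp_reply *reply)
{
	(void)request;
	return reply->nb_received == reply->nb_sent ? 0 : -1;
}

static int
send_async_request(void)
{
	struct rte_mp_msg msg;
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };

	memset(&msg, 0, sizeof(msg));
	strcpy(msg.name, "example_action"); /* illustrative action name */
	return rte_mp_request_async(&msg, &ts, handle_reply);
}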
*/ +__rte_deprecated const char * rte_eal_mbuf_default_mempool_ops(void); diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h index 3f792a97..6eb49327 100644 --- a/lib/librte_eal/common/include/rte_eal_interrupts.h +++ b/lib/librte_eal/common/include/rte_eal_interrupts.h @@ -34,6 +34,7 @@ enum rte_intr_handle_type { RTE_INTR_HANDLE_ALARM, /**< alarm handle */ RTE_INTR_HANDLE_EXT, /**< external handler */ RTE_INTR_HANDLE_VDEV, /**< virtual device */ + RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */ RTE_INTR_HANDLE_MAX /**< count of elements */ }; diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h index 29fa0b60..aff0688d 100644 --- a/lib/librte_eal/common/include/rte_eal_memconfig.h +++ b/lib/librte_eal/common/include/rte_eal_memconfig.h @@ -12,11 +12,30 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { #endif +/** + * memseg list is a special case as we need to store a bunch of other data + * together with the array itself. + */ +struct rte_memseg_list { + RTE_STD_C11 + union { + void *base_va; + /**< Base virtual address for this memseg list. */ + uint64_t addr_64; + /**< Makes sure addr is always 64-bits */ + }; + int socket_id; /**< Socket ID for all memsegs in this list. */ + uint64_t page_sz; /**< Page size for all memsegs in this list. */ + volatile uint32_t version; /**< version number for multiprocess sync. */ + struct rte_fbarray memseg_arr; +}; + /** * the structure for the memory configuration for the RTE. * Used by the rte_config structure. It is separated out, as for multi-process @@ -40,11 +59,14 @@ struct rte_mem_config { rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */ rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */ - uint32_t memzone_cnt; /**< Number of allocated memzones */ + rte_rwlock_t memory_hotplug_lock; + /**< indicates whether memory hotplug request is in progress. */ /* memory segments and zones */ - struct rte_memseg memseg[RTE_MAX_MEMSEG]; /**< Physmem descriptors. */ - struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */ + struct rte_fbarray memzones; /**< Memzone descriptors. */ + + struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS]; + /**< list of dynamic arrays holding memsegs */ struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */ diff --git a/lib/librte_eal/common/include/rte_fbarray.h b/lib/librte_eal/common/include/rte_fbarray.h new file mode 100644 index 00000000..3e61fffe --- /dev/null +++ b/lib/librte_eal/common/include/rte_fbarray.h @@ -0,0 +1,356 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef RTE_FBARRAY_H +#define RTE_FBARRAY_H + +/** + * @file + * + * File-backed shared indexed array for DPDK. 
+ * + * Basic workflow is expected to be the following: + * 1) Allocate array either using ``rte_fbarray_init()`` or + * ``rte_fbarray_attach()`` (depending on whether it's shared between + * multiple DPDK processes) + * 2) find free spots using ``rte_fbarray_find_next_free()`` + * 3) get pointer to data in the free spot using ``rte_fbarray_get()``, and + * copy data into the pointer (element size is fixed) + * 4) mark entry as used using ``rte_fbarray_set_used()`` + * + * Calls to ``rte_fbarray_init()`` and ``rte_fbarray_destroy()`` will have + * consequences for all processes, while calls to ``rte_fbarray_attach()`` and + * ``rte_fbarray_detach()`` will only have consequences within a single process. + * Therefore, it is safe to call ``rte_fbarray_attach()`` or + * ``rte_fbarray_detach()`` while another process is using ``rte_fbarray``, + * provided no other thread within the same process will try to use + * ``rte_fbarray`` before attaching or after detaching. It is not safe to call + * ``rte_fbarray_init()`` or ``rte_fbarray_destroy()`` while another thread or + * another process is using ``rte_fbarray``. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include +#include + +#define RTE_FBARRAY_NAME_LEN 64 + +struct rte_fbarray { + char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */ + unsigned int count; /**< number of entries stored */ + unsigned int len; /**< current length of the array */ + unsigned int elt_sz; /**< size of each element */ + void *data; /**< data pointer */ + rte_rwlock_t rwlock; /**< multiprocess lock */ +}; + +/** + * Set up ``rte_fbarray`` structure and allocate underlying resources. + * + * Call this function to correctly set up ``rte_fbarray`` and allocate + * underlying files that will be backing the data in the current process. Note + * that in order to use and share ``rte_fbarray`` between multiple processes, + * data pointed to by ``arr`` pointer must itself be allocated in shared memory. + * + * @param arr + * Valid pointer to allocated ``rte_fbarray`` structure. + * + * @param name + * Unique name to be assigned to this array. + * + * @param len + * Number of elements initially available in the array. + * + * @param elt_sz + * Size of each element. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len, + unsigned int elt_sz); + + +/** + * Attach to a file backing an already allocated and correctly set up + * ``rte_fbarray`` structure. + * + * Call this function to attach to file that will be backing the data in the + * current process. The structure must have been previously correctly set up + * with a call to ``rte_fbarray_init()``. Calls to ``rte_fbarray_attach()`` are + * usually meant to be performed in a multiprocessing scenario, with data + * pointed to by ``arr`` pointer allocated in shared memory. + * + * @param arr + * Valid pointer to allocated and correctly set up rte_fbarray structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_attach(struct rte_fbarray *arr); + + +/** + * Deallocate resources for an already allocated and correctly set up + * ``rte_fbarray`` structure, and remove the underlying file. + * + * Call this function to deallocate all resources associated with an + * ``rte_fbarray`` structure within the current process. 
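A sketch following the four-step workflow described at the top of this header; the array name and int payload are illustrative, and the ``arr`` structure must itself live in shared memory if the array is shared between processes:

#include <string.h>
#include <rte_fbarray.h>

static int
setup_array(struct rte_fbarray *arr)
{
	/* Step 1: allocate an array of 256 int-sized elements. */
	return rte_fbarray_init(arr, "example_array", 256, sizeof(int));
}

static int
store_value(struct rte_fbarray *arr, int value)
{
	int idx;
	int *slot;

	/* Step 2: find a free spot. */
	idx = rte_fbarray_find_next_free(arr, 0);
	if (idx < 0)
		return -1;
	/* Step 3: get a pointer to the slot and copy the data in. */
	slot = rte_fbarray_get(arr, idx);
	if (slot == NULL)
		return -1;
	*slot = value;
	/* Step 4: mark the entry as used. */
	return rte_fbarray_set_used(arr, idx);
}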
This will also + * zero-fill data pointed to by ``arr`` pointer and remove the underlying file + * backing the data, so it is expected that by the time this function is called, + * all other processes have detached from this ``rte_fbarray``. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_destroy(struct rte_fbarray *arr); + + +/** + * Deallocate resources for an already allocated and correctly set up + * ``rte_fbarray`` structure. + * + * Call this function to deallocate all resources associated with an + * ``rte_fbarray`` structure within current process. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_detach(struct rte_fbarray *arr); + + +/** + * Get pointer to element residing at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Index of an element to get a pointer to. + * + * @return + * - non-NULL pointer on success. + * - NULL on failure, with ``rte_errno`` indicating reason for failure. + */ +void * __rte_experimental +rte_fbarray_get(const struct rte_fbarray *arr, unsigned int idx); + + +/** + * Find index of a specified element within the array. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param elt + * Pointer to element to find index to. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt); + + +/** + * Mark specified element as used. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to mark as used. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_set_used(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Mark specified element as free. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to mark as free. + * + * @return + * - 0 on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_set_free(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Check whether element at specified index is marked as used. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param idx + * Element index to check as used. + * + * @return + * - 1 if element is used. + * - 0 if element is unused. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_is_used(struct rte_fbarray *arr, unsigned int idx); + + +/** + * Find index of next free element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. 
+ * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of next used element, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_next_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Find index of next chunk of ``n`` free elements, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of free elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_next_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find index of next chunk of ``n`` used elements, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @param n + * Number of used elements to look for. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_next_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n); + + +/** + * Find how many more free entries there are, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_contig_free(struct rte_fbarray *arr, + unsigned int start); + + +/** + * Find how many more used entries there are, starting at specified index. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param start + * Element index to start search from. + * + * @return + * - non-negative integer on success. + * - -1 on failure, with ``rte_errno`` indicating reason for failure. + */ +int __rte_experimental +rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start); + + +/** + * Dump ``rte_fbarray`` metadata. + * + * @param arr + * Valid pointer to allocated and correctly set up ``rte_fbarray`` structure. + * + * @param f + * File object to dump information into. + */ +void __rte_experimental +rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f); + +#ifdef __cplusplus +} +#endif + +#endif /* RTE_FBARRAY_H */ diff --git a/lib/librte_eal/common/include/rte_hypervisor.h b/lib/librte_eal/common/include/rte_hypervisor.h index 8d8aac74..5fe719c1 100644 --- a/lib/librte_eal/common/include/rte_hypervisor.h +++ b/lib/librte_eal/common/include/rte_hypervisor.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2017 Mellanox Technologies, Ltd. 
+ * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef RTE_HYPERVISOR_H diff --git a/lib/librte_eal/common/include/rte_lcore.h b/lib/librte_eal/common/include/rte_lcore.h index 04722203..6e09d918 100644 --- a/lib/librte_eal/common/include/rte_lcore.h +++ b/lib/librte_eal/common/include/rte_lcore.h @@ -119,7 +119,7 @@ rte_lcore_index(int lcore_id) if (lcore_id >= RTE_MAX_LCORE) return -1; if (lcore_id < 0) - lcore_id = rte_lcore_id(); + lcore_id = (int)rte_lcore_id(); return lcore_config[lcore_id].core_index; } @@ -131,6 +131,36 @@ rte_lcore_index(int lcore_id) */ unsigned rte_socket_id(void); +/** + * Return number of physical sockets detected on the system. + * + * Note that number of nodes may not be correspondent to their physical id's: + * for example, a system may report two socket id's, but the actual socket id's + * may be 0 and 8. + * + * @return + * the number of physical sockets as recognized by EAL + */ +unsigned int __rte_experimental +rte_socket_count(void); + +/** + * Return socket id with a particular index. + * + * This will return socket id at a particular position in list of all detected + * physical socket id's. For example, on a machine with sockets [0, 8], passing + * 1 as a parameter will return 8. + * + * @param idx + * index of physical socket id to return + * + * @return + * - physical socket id as recognized by EAL + * - -1 on error, with errno set to EINVAL + */ +int __rte_experimental +rte_socket_id_by_idx(unsigned int idx); + /** * Get the ID of the physical socket of the specified lcore * @@ -246,6 +276,32 @@ void rte_thread_get_affinity(rte_cpuset_t *cpusetp); */ int rte_thread_setname(pthread_t id, const char *name); +/** + * Create a control thread. + * + * Wrapper to pthread_create(), pthread_setname_np() and + * pthread_setaffinity_np(). The dataplane and service lcores are + * excluded from the affinity of the new thread. + * + * @param thread + * Filled with the thread id of the new created thread. + * @param name + * The name of the control thread (max 16 characters including '\0'). + * @param attr + * Attributes for the new thread. + * @param start_routine + * Function to be executed by the new thread. + * @param arg + * Argument passed to start_routine. + * @return + * On success, returns 0; on error, it returns a negative value + * corresponding to the error number. + */ +__rte_experimental int +rte_ctrl_thread_create(pthread_t *thread, const char *name, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg); + /** * Test if the core supplied has a specific role * @@ -255,7 +311,7 @@ int rte_thread_setname(pthread_t id, const char *name); * @param role * The role to be checked against. * @return - * On success, return 0; otherwise return a negative value. + * Boolean value: positive if test is true; otherwise returns 0. */ int rte_lcore_has_role(unsigned int lcore_id, enum rte_lcore_role_t role); diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h index 9029c785..2f789cb9 100644 --- a/lib/librte_eal/common/include/rte_log.h +++ b/lib/librte_eal/common/include/rte_log.h @@ -20,6 +20,7 @@ extern "C" { #include #include #include +#include #include #include @@ -129,16 +130,28 @@ uint32_t rte_log_get_global_level(void); int rte_log_get_level(uint32_t logtype); /** - * Set the log level for a given type. + * Set the log level for a given type based on shell pattern. * * @param pattern - * The regexp identifying the log type. + * The match pattern identifying the log type. 
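A small sketch using the new socket enumeration helpers above; as the note says, physical IDs need not be contiguous, so the index-to-ID mapping matters:

#include <stdio.h>
#include <rte_lcore.h>

/* Print the physical socket IDs detected by EAL (e.g. 0 and 8). */
static void
dump_sockets(void)
{
	unsigned int i, n = rte_socket_count();

	for (i = 0; i < n; i++)
		printf("socket index %u -> physical id %d\n",
			i, rte_socket_id_by_idx(i));
}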
+ * @param level + * The level to be set. + * @return + * 0 on success, a negative value if level is invalid. + */ +int rte_log_set_level_pattern(const char *pattern, uint32_t level); + +/** + * Set the log level for a given type based on regular expression. + * + * @param regex + * The regular expression identifying the log type. * @param level * The level to be set. * @return * 0 on success, a negative value if level is invalid. */ -int rte_log_set_level_regexp(const char *pattern, uint32_t level); +int rte_log_set_level_regexp(const char *regex, uint32_t level); /** * Set the log level for a given type. @@ -194,6 +207,27 @@ int rte_log_cur_msg_logtype(void); */ int rte_log_register(const char *name); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Register a dynamic log type and try to pick its level from EAL options + * + * rte_log_register() is called inside. If successful, the function tries + * to search for matching regexp in the list of EAL log level options and + * pick the level from the last matching entry. If nothing can be applied + * from the list, the level will be set to the user-defined default value. + * + * @param name + * Name for the log type to be registered + * @param level_def + * Fallback level to be set if the global list has no matching options + * @return + * - >=0: the newly registered log type + * - <0: rte_log_register() error value + */ +int rte_log_register_type_and_pick_level(const char *name, uint32_t level_def); + /** * Dump log information. * diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h index f02a8ba1..a9fb7e45 100644 --- a/lib/librte_eal/common/include/rte_malloc.h +++ b/lib/librte_eal/common/include/rte_malloc.h @@ -13,6 +13,7 @@ #include #include +#include #include #ifdef __cplusplus @@ -277,6 +278,15 @@ rte_malloc_get_socket_stats(int socket, void rte_malloc_dump_stats(FILE *f, const char *type); +/** + * Dump contents of all malloc heaps to a file. + * + * @param f + * A pointer to a file for output + */ +void __rte_experimental +rte_malloc_dump_heaps(FILE *f); + /** * Set the maximum amount of allocated memory for this type. * diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h index ba99ed90..d43fa909 100644 --- a/lib/librte_eal/common/include/rte_malloc_heap.h +++ b/lib/librte_eal/common/include/rte_malloc_heap.h @@ -13,12 +13,18 @@ /* Number of free lists per heap, grouped by size. */ #define RTE_HEAP_NUM_FREELISTS 13 +/* dummy definition, for pointers */ +struct malloc_elem; + /** * Structure to hold malloc heap */ struct malloc_heap { rte_spinlock_t lock; LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS]; + struct malloc_elem *volatile first; + struct malloc_elem *volatile last; + unsigned alloc_count; size_t total_size; } __rte_cache_aligned; diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index 302f865b..aab9f6fe 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -20,8 +20,12 @@ extern "C" { #endif #include +#include #include +/* forward declaration for pointers */ +struct rte_memseg_list; + __extension__ enum rte_page_sizes { RTE_PGSIZE_4K = 1ULL << 12, @@ -79,6 +83,8 @@ typedef uint64_t rte_iova_t; /** * Physical memory segment descriptor. */ +#define RTE_MEMSEG_FLAG_DO_NOT_FREE (1 << 0) +/**< Prevent this segment from being freed back to the OS. 
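A one-line sketch of the shell-pattern variant above; the "pmd.*" pattern is illustrative and is meant to match dynamically registered PMD log types:

#include <rte_log.h>

/* Raise every log type whose name matches the shell pattern to debug level. */
static int
enable_pmd_debug(void)
{
	return rte_log_set_level_pattern("pmd.*", RTE_LOG_DEBUG);
}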
*/ struct rte_memseg { RTE_STD_C11 union { @@ -95,6 +101,7 @@ struct rte_memseg { int32_t socket_id; /**< NUMA socket ID. */ uint32_t nchannel; /**< Number of channels. */ uint32_t nrank; /**< Number of ranks. */ + uint32_t flags; /**< Memseg-specific flags */ } __rte_packed; /** @@ -130,25 +137,138 @@ phys_addr_t rte_mem_virt2phy(const void *virt); rte_iova_t rte_mem_virt2iova(const void *virt); /** - * Get the layout of the available physical memory. + * Get virtual memory address corresponding to iova address. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param iova + * The iova address. + * @return + * Virtual address corresponding to iova address (or NULL if address does not + * exist within DPDK memory map). + */ +__rte_experimental void * +rte_mem_iova2virt(rte_iova_t iova); + +/** + * Get memseg to which a particular virtual address belongs. + * + * @param virt + * The virtual address. + * @param msl + * The memseg list in which to look up based on ``virt`` address + * (can be NULL). + * @return + * Memseg pointer on success, or NULL on error. + */ +__rte_experimental struct rte_memseg * +rte_mem_virt2memseg(const void *virt, const struct rte_memseg_list *msl); + +/** + * Get memseg list corresponding to virtual memory address. + * + * @param virt + * The virtual address. + * @return + * Memseg list to which this virtual address belongs to. + */ +__rte_experimental struct rte_memseg_list * +rte_mem_virt2memseg_list(const void *virt); + +/** + * Memseg walk function prototype. + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_walk_t)(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg); + +/** + * Memseg contig walk function prototype. This will trigger a callback on every + * VA-contiguous are starting at memseg ``ms``, so total valid VA space at each + * callback call will be [``ms->addr``, ``ms->addr + len``). + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg); + +/** + * Memseg list walk function prototype. This will trigger a callback on every + * allocated memseg list. + * + * Returning 0 will continue walk + * Returning 1 will stop the walk + * Returning -1 will stop the walk and report error + */ +typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl, + void *arg); + +/** + * Walk list of all memsegs. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_walk(rte_memseg_walk_t func, void *arg); + +/** + * Walk each VA-contiguous area. + * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. 
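A sketch of the memseg walk API above, summing the length of every segment. It assumes the ``len`` field of struct rte_memseg, and must not be called from a memory event callback since the walk read-locks the hotplug subsystem:

#include <stddef.h>
#include <rte_memory.h>

/* Walk callback; returning 0 continues the walk (see the prototype above). */
static int
sum_cb(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
	void *arg)
{
	size_t *total = arg;

	(void)msl;
	*total += ms->len;
	return 0;
}

static size_t
total_memseg_len(void)
{
	size_t total = 0;

	if (rte_memseg_walk(sum_cb, &total) < 0)
		return 0;
	return total;
}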
+ * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator + * @return + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error + */ +int __rte_experimental +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg); + +/** + * Walk each allocated memseg list. * - * It can be useful for an application to have the full physical - * memory layout to decide the size of a memory zone to reserve. This - * table is stored in rte_config (see rte_eal_get_configuration()). + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. * + * @param func + * Iterator function + * @param arg + * Argument passed to iterator * @return - * - On success, return a pointer to a read-only table of struct - * rte_physmem_desc elements, containing the layout of all - * addressable physical memory. The last element of the table - * contains a NULL address. - * - On error, return NULL. This should not happen since it is a fatal - * error that will probably cause the entire system to panic. + * 0 if walked over the entire list + * 1 if stopped by the user + * -1 if user function reported error */ -const struct rte_memseg *rte_eal_get_physmem_layout(void); +int __rte_experimental +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg); /** * Dump the physical memory layout to a file. * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * * @param f * A pointer to a file for output */ @@ -157,6 +277,9 @@ void rte_dump_physmem_layout(FILE *f); /** * Get the total amount of available physical memory. * + * @note This function read-locks the memory hotplug subsystem, and thus cannot + * be used within memory-related callback functions. + * * @return * The total amount of available physical memory in bytes. */ @@ -191,6 +314,137 @@ unsigned rte_memory_get_nrank(void); */ int rte_eal_using_phys_addrs(void); + +/** + * Enum indicating which kind of memory event has happened. Used by callbacks to + * distinguish between memory allocations and deallocations. + */ +enum rte_mem_event { + RTE_MEM_EVENT_ALLOC = 0, /**< Allocation event. */ + RTE_MEM_EVENT_FREE, /**< Deallocation event. */ +}; +#define RTE_MEM_EVENT_CALLBACK_NAME_LEN 64 +/**< maximum length of callback name */ + +/** + * Function typedef used to register callbacks for memory events. + */ +typedef void (*rte_mem_event_callback_t)(enum rte_mem_event event_type, + const void *addr, size_t len, void *arg); + +/** + * Function used to register callbacks for memory events. + * + * @note callbacks will happen while memory hotplug subsystem is write-locked, + * therefore some functions (e.g. `rte_memseg_walk()`) will cause a + * deadlock when called from within such callbacks. + * + * @note mem event callbacks not being supported is an expected error condition, + * so user code needs to handle this situation. In these cases, return + * value will be -1, and rte_errno will be set to ENOTSUP. + * + * @param name + * Name associated with specified callback to be added to the list. + * + * @param clb + * Callback function pointer. + * + * @param arg + * Argument to pass to the callback. + * + * @return + * 0 on successful callback register + * -1 on unsuccessful callback register, with rte_errno value indicating + * reason for failure. 
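A sketch of registering a memory event callback as documented above; the callback name is illustrative, and ENOTSUP is treated as the expected "not supported" condition:

#include <stdio.h>
#include <errno.h>
#include <rte_errno.h>
#include <rte_memory.h>

/* Runs with the hotplug subsystem write-locked, so no rte_memseg_walk() here. */
static void
mem_event_cb(enum rte_mem_event event_type, const void *addr, size_t len,
	void *arg)
{
	(void)arg;
	printf("%s: %p, %zu bytes\n",
		event_type == RTE_MEM_EVENT_ALLOC ? "alloc" : "free",
		addr, len);
}

static int
register_mem_event_cb(void)
{
	int ret = rte_mem_event_callback_register("example_cb",
			mem_event_cb, NULL);

	/* Callbacks not being supported is an expected error condition. */
	if (ret < 0 && rte_errno == ENOTSUP)
		return 0;
	return ret;
}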
+ */ +int __rte_experimental +rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb, + void *arg); + +/** + * Function used to unregister callbacks for memory events. + * + * @param name + * Name associated with specified callback to be removed from the list. + * + * @param arg + * Argument to look for among callbacks with specified callback name. + * + * @return + * 0 on successful callback unregister + * -1 on unsuccessful callback unregister, with rte_errno value indicating + * reason for failure. + */ +int __rte_experimental +rte_mem_event_callback_unregister(const char *name, void *arg); + + +#define RTE_MEM_ALLOC_VALIDATOR_NAME_LEN 64 +/**< maximum length of alloc validator name */ +/** + * Function typedef used to register memory allocation validation callbacks. + * + * Returning 0 will allow allocation attempt to continue. Returning -1 will + * prevent allocation from succeeding. + */ +typedef int (*rte_mem_alloc_validator_t)(int socket_id, + size_t cur_limit, size_t new_len); + +/** + * @brief Register validator callback for memory allocations. + * + * Callbacks registered by this function will be called right before memory + * allocator is about to trigger allocation of more pages from the system if + * said allocation will bring total memory usage above specified limit on + * specified socket. User will be able to cancel pending allocation if callback + * returns -1. + * + * @note callbacks will happen while memory hotplug subsystem is write-locked, + * therefore some functions (e.g. `rte_memseg_walk()`) will cause a + * deadlock when called from within such callbacks. + * + * @note validator callbacks not being supported is an expected error condition, + * so user code needs to handle this situation. In these cases, return + * value will be -1, and rte_errno will be set to ENOTSUP. + * + * @param name + * Name associated with specified callback to be added to the list. + * + * @param clb + * Callback function pointer. + * + * @param socket_id + * Socket ID on which to watch for allocations. + * + * @param limit + * Limit above which to trigger callbacks. + * + * @return + * 0 on successful callback register + * -1 on unsuccessful callback register, with rte_errno value indicating + * reason for failure. + */ +int __rte_experimental +rte_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit); + +/** + * @brief Unregister validator callback for memory allocations. + * + * @param name + * Name associated with specified callback to be removed from the list. + * + * @param socket_id + * Socket ID on which to watch for allocations. + * + * @return + * 0 on successful callback unregister + * -1 on unsuccessful callback unregister, with rte_errno value indicating + * reason for failure. + */ +int __rte_experimental +rte_mem_alloc_validator_unregister(const char *name, int socket_id); + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h index 2bfb2731..ef370fa6 100644 --- a/lib/librte_eal/common/include/rte_memzone.h +++ b/lib/librte_eal/common/include/rte_memzone.h @@ -23,6 +23,7 @@ */ #include +#include #include #include @@ -39,6 +40,7 @@ extern "C" { #define RTE_MEMZONE_512MB 0x00040000 /**< Use 512MB pages. */ #define RTE_MEMZONE_4GB 0x00080000 /**< Use 4GB pages. */ #define RTE_MEMZONE_SIZE_HINT_ONLY 0x00000004 /**< Use available page size */ +#define RTE_MEMZONE_IOVA_CONTIG 0x00100000 /**< Ask for IOVA-contiguous memzone. 
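A sketch of the allocation validator described above; the callback only runs once the registered limit would be exceeded, so returning -1 simply cancels such allocations (the name and 1 GiB limit are illustrative):

#include <stddef.h>
#include <rte_memory.h>

static int
limit_validator(int socket_id, size_t cur_limit, size_t new_len)
{
	/* Called only when the registered limit would be exceeded;
	 * returning -1 cancels the pending allocation. */
	(void)socket_id;
	(void)cur_limit;
	(void)new_len;
	return -1;
}

static int
register_limit(void)
{
	/* Trigger the callback once socket 0 usage would exceed 1 GiB. */
	return rte_mem_alloc_validator_register("limit_cb", limit_validator,
			0, (size_t)1 << 30);
}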
*/ /** * A structure describing a memzone, which is a contiguous portion of @@ -66,7 +68,6 @@ struct rte_memzone { int32_t socket_id; /**< NUMA socket ID. */ uint32_t flags; /**< Characteristics of this memzone. */ - uint32_t memseg_id; /**< Memseg it belongs. */ } __attribute__((__packed__)); /** @@ -76,6 +77,13 @@ struct rte_memzone { * correctly filled memzone descriptor. If the allocation cannot be * done, return NULL. * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently + * supported. + * * @param name * The name of the memzone. If it already exists, the function will * fail and return NULL. @@ -102,6 +110,9 @@ struct rte_memzone { * If this flag is not set, the function * will return error on an unavailable size * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. * @return * A pointer to a correctly-filled read-only memzone descriptor, or NULL * on error. @@ -126,6 +137,13 @@ const struct rte_memzone *rte_memzone_reserve(const char *name, * descriptor. If the allocation cannot be done or if the alignment * is not a power of 2, returns NULL. * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently + * supported. + * * @param name * The name of the memzone. If it already exists, the function will * fail and return NULL. @@ -152,6 +170,9 @@ const struct rte_memzone *rte_memzone_reserve(const char *name, * If this flag is not set, the function * will return error on an unavailable size * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. * @param align * Alignment for resulting memzone. Must be a power of 2. * @return @@ -181,6 +202,13 @@ const struct rte_memzone *rte_memzone_reserve_aligned(const char *name, * boundary. That implies that requested length should be less or equal * then boundary. * + * @note Reserving memzones with len set to 0 will only attempt to allocate + * memzones from memory that is already available. It will not trigger any + * new allocations. + * + * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently + * supported. + * * @param name * The name of the memzone. If it already exists, the function will * fail and return NULL. @@ -207,6 +235,9 @@ const struct rte_memzone *rte_memzone_reserve_aligned(const char *name, * If this flag is not set, the function * will return error on an unavailable size * request. + * - RTE_MEMZONE_IOVA_CONTIG - Ensure reserved memzone is IOVA-contiguous. + * This option should be used when allocating + * memory intended for hardware rings etc. * @param align * Alignment for resulting memzone. Must be a power of 2. 
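A sketch of reserving an IOVA-contiguous memzone with the new flag, as suggested above for hardware rings; the zone name and size are illustrative:

#include <rte_memzone.h>
#include <rte_lcore.h>

/* Reserve a 2 MiB IOVA-contiguous zone on the caller's NUMA socket. */
static const struct rte_memzone *
reserve_ring_zone(void)
{
	return rte_memzone_reserve("example_ring_mz", 2 * 1024 * 1024,
			(int)rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG);
}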
* @param bound diff --git a/lib/librte_eal/common/include/rte_pci_dev_feature_defs.h b/lib/librte_eal/common/include/rte_pci_dev_feature_defs.h index 08222510..e12c2208 100644 --- a/lib/librte_eal/common/include/rte_pci_dev_feature_defs.h +++ b/lib/librte_eal/common/include/rte_pci_dev_feature_defs.h @@ -1,59 +1,5 @@ -/*- - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * The full GNU General Public License is included in this distribution - * in the file called LICENSE.GPL. - * - * Contact Information: - * Intel Corporation - * - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0) + * Copyright(c) 2010-2014 Intel Corporation */ #ifndef _RTE_PCI_DEV_DEFS_H_ diff --git a/lib/librte_eal/common/include/rte_pci_dev_features.h b/lib/librte_eal/common/include/rte_pci_dev_features.h index 67b986a6..6104123d 100644 --- a/lib/librte_eal/common/include/rte_pci_dev_features.h +++ b/lib/librte_eal/common/include/rte_pci_dev_features.h @@ -1,59 +1,5 @@ -/*- - * This file is provided under a dual BSD/GPLv2 license. 
When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * The full GNU General Public License is included in this distribution - * in the file called LICENSE.GPL. - * - * Contact Information: - * Intel Corporation - * - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+/* SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0) + * Copyright(c) 2010-2014 Intel Corporation */ #ifndef _RTE_PCI_DEV_FEATURES_H diff --git a/lib/librte_eal/common/include/rte_random.h b/lib/librte_eal/common/include/rte_random.h index 63bb2808..b2ca1c20 100644 --- a/lib/librte_eal/common/include/rte_random.h +++ b/lib/librte_eal/common/include/rte_random.h @@ -31,7 +31,7 @@ extern "C" { static inline void rte_srand(uint64_t seedval) { - srand48((long unsigned int)seedval); + srand48((long)seedval); } /** @@ -48,9 +48,9 @@ static inline uint64_t rte_rand(void) { uint64_t val; - val = lrand48(); + val = (uint64_t)lrand48(); val <<= 32; - val += lrand48(); + val += (uint64_t)lrand48(); return val; } diff --git a/lib/librte_eal/common/include/rte_service.h b/lib/librte_eal/common/include/rte_service.h index 211eb376..aea4d91b 100644 --- a/lib/librte_eal/common/include/rte_service.h +++ b/lib/librte_eal/common/include/rte_service.h @@ -47,9 +47,6 @@ extern "C" { #define RTE_SERVICE_CAP_MT_SAFE (1 << 0) /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Return the number of services registered. * * The number of services registered can be passed to *rte_service_get_by_id*, @@ -57,12 +54,9 @@ extern "C" { * * @return The number of services registered. */ -uint32_t __rte_experimental rte_service_get_count(void); +uint32_t rte_service_get_count(void); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Return the id of a service by name. * * This function provides the id of the service using the service name as @@ -84,24 +78,17 @@ uint32_t __rte_experimental rte_service_get_count(void); * @retval -EINVAL Null *service_id* pointer provided * @retval -ENODEV No such service registered */ -int32_t __rte_experimental rte_service_get_by_name(const char *name, - uint32_t *service_id); +int32_t rte_service_get_by_name(const char *name, uint32_t *service_id); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Return the name of the service. * * @return A pointer to the name of the service. The returned pointer remains * in ownership of the service, and the application must not free it. */ -const char __rte_experimental *rte_service_get_name(uint32_t id); +const char *rte_service_get_name(uint32_t id); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Check if a service has a specific capability. * * This function returns if *service* has implements *capability*. @@ -109,13 +96,9 @@ const char __rte_experimental *rte_service_get_name(uint32_t id); * @retval 1 Capability supported by this service instance * @retval 0 Capability not supported by this service instance */ -int32_t __rte_experimental rte_service_probe_capability(uint32_t id, - uint32_t capability); +int32_t rte_service_probe_capability(uint32_t id, uint32_t capability); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Map or unmap a lcore to a service. * * Each core can be added or removed from running a specific service. This @@ -134,13 +117,10 @@ int32_t __rte_experimental rte_service_probe_capability(uint32_t id, * @retval 0 lcore map updated successfully * @retval -EINVAL An invalid service or lcore was provided. 
*/ -int32_t __rte_experimental rte_service_map_lcore_set(uint32_t service_id, - uint32_t lcore, uint32_t enable); +int32_t rte_service_map_lcore_set(uint32_t service_id, uint32_t lcore, + uint32_t enable); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Retrieve the mapping of an lcore to a service. * * @param service_id the service to apply the lcore to @@ -150,13 +130,9 @@ int32_t __rte_experimental rte_service_map_lcore_set(uint32_t service_id, * @retval 0 lcore is not mapped to service * @retval -EINVAL An invalid service or lcore was provided. */ -int32_t __rte_experimental rte_service_map_lcore_get(uint32_t service_id, - uint32_t lcore); +int32_t rte_service_map_lcore_get(uint32_t service_id, uint32_t lcore); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Set the runstate of the service. * * Each service is either running or stopped. Setting a non-zero runstate @@ -168,12 +144,9 @@ int32_t __rte_experimental rte_service_map_lcore_get(uint32_t service_id, * @retval 0 The service was successfully started * @retval -EINVAL Invalid service id */ -int32_t __rte_experimental rte_service_runstate_set(uint32_t id, uint32_t runstate); +int32_t rte_service_runstate_set(uint32_t id, uint32_t runstate); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Get the runstate for the service with *id*. See *rte_service_runstate_set* * for details of runstates. A service can call this function to ensure that * the application has indicated that it will receive CPU cycles. Either a @@ -186,12 +159,9 @@ int32_t __rte_experimental rte_service_runstate_set(uint32_t id, uint32_t runsta * @retval 0 Service is stopped * @retval -EINVAL Invalid service id */ -int32_t __rte_experimental rte_service_runstate_get(uint32_t id); +int32_t rte_service_runstate_get(uint32_t id); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Enable or disable the check for a service-core being mapped to the service. * An application can disable the check when takes the responsibility to run a * service itself using *rte_service_run_iter_on_app_lcore*. @@ -202,13 +172,9 @@ int32_t __rte_experimental rte_service_runstate_get(uint32_t id); * @retval 0 Success * @retval -EINVAL Invalid service ID */ -int32_t __rte_experimental rte_service_set_runstate_mapped_check(uint32_t id, - int32_t enable); +int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enable); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * This function runs a service callback from a non-service lcore. * * This function is designed to enable gradual porting to service cores, and @@ -241,13 +207,10 @@ int32_t __rte_experimental rte_service_set_runstate_mapped_check(uint32_t id, * @retval -ENOEXEC Service is not in a run-able state * @retval -EINVAL Invalid service id */ -int32_t __rte_experimental rte_service_run_iter_on_app_lcore(uint32_t id, +int32_t rte_service_run_iter_on_app_lcore(uint32_t id, uint32_t serialize_multithread_unsafe); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Start a service core. * * Starting a core makes the core begin polling. Any services assigned to it @@ -259,12 +222,9 @@ int32_t __rte_experimental rte_service_run_iter_on_app_lcore(uint32_t id, * @retval -EINVAL Failed to start core. The *lcore_id* passed in is not * currently assigned to be a service core. 
*/ -int32_t __rte_experimental rte_service_lcore_start(uint32_t lcore_id); +int32_t rte_service_lcore_start(uint32_t lcore_id); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Stop a service core. * * Stopping a core makes the core become idle, but remains assigned as a @@ -278,12 +238,9 @@ int32_t __rte_experimental rte_service_lcore_start(uint32_t lcore_id); * The application must stop the service first, and then stop the * lcore. */ -int32_t __rte_experimental rte_service_lcore_stop(uint32_t lcore_id); +int32_t rte_service_lcore_stop(uint32_t lcore_id); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Adds lcore to the list of service cores. * * This functions can be used at runtime in order to modify the service core @@ -294,12 +251,9 @@ int32_t __rte_experimental rte_service_lcore_stop(uint32_t lcore_id); * @retval -EALREADY lcore is already added to the service core list * @retval -EINVAL Invalid lcore provided */ -int32_t __rte_experimental rte_service_lcore_add(uint32_t lcore); +int32_t rte_service_lcore_add(uint32_t lcore); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Removes lcore from the list of service cores. * * This can fail if the core is not stopped, see *rte_service_core_stop*. @@ -308,12 +262,9 @@ int32_t __rte_experimental rte_service_lcore_add(uint32_t lcore); * @retval -EBUSY Lcore is not stopped, stop service core before removing. * @retval -EINVAL failed to add lcore to service core mask. */ -int32_t __rte_experimental rte_service_lcore_del(uint32_t lcore); +int32_t rte_service_lcore_del(uint32_t lcore); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Retrieve the number of service cores currently available. * * This function returns the integer count of service cores available. The @@ -325,24 +276,18 @@ int32_t __rte_experimental rte_service_lcore_del(uint32_t lcore); * * @return The number of service cores currently configured. */ -int32_t __rte_experimental rte_service_lcore_count(void); +int32_t rte_service_lcore_count(void); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Resets all service core mappings. This does not remove the service cores * from duty, just unmaps all services / cores, and stops() the service cores. * The runstate of services is not modified. * * @retval 0 Success */ -int32_t __rte_experimental rte_service_lcore_reset_all(void); +int32_t rte_service_lcore_reset_all(void); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Enable or disable statistics collection for *service*. * * This function enables per core, per-service cycle count collection. @@ -351,13 +296,9 @@ int32_t __rte_experimental rte_service_lcore_reset_all(void); * @retval 0 Success * @retval -EINVAL Invalid service pointer passed */ -int32_t __rte_experimental rte_service_set_stats_enable(uint32_t id, - int32_t enable); +int32_t rte_service_set_stats_enable(uint32_t id, int32_t enable); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Retrieve the list of currently enabled service cores. * * This function fills in an application supplied array, with each element @@ -373,12 +314,9 @@ int32_t __rte_experimental rte_service_set_stats_enable(uint32_t id, * service core list. No items have been populated, call this function * with a size of at least *rte_service_core_count* items. 
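As a usage sketch for the now-stable service-core calls above (the service name and lcore id are placeholders, and error handling is reduced to early exits):

#include <rte_service.h>

static int
start_service_on_lcore_example(const char *service_name, uint32_t lcore_id)
{
	uint32_t service_id;

	if (rte_service_get_by_name(service_name, &service_id) != 0)
		return -1;                        /* no such service registered */
	if (rte_service_lcore_add(lcore_id) < 0)
		return -1;                        /* -EALREADY could be tolerated here */
	if (rte_service_map_lcore_set(service_id, lcore_id, 1) != 0)
		return -1;                        /* map the service to the core */
	if (rte_service_runstate_set(service_id, 1) != 0)
		return -1;                        /* mark the service runnable */
	return rte_service_lcore_start(lcore_id); /* core starts polling its services */
}
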
*/ -int32_t __rte_experimental rte_service_lcore_list(uint32_t array[], uint32_t n); +int32_t rte_service_lcore_list(uint32_t array[], uint32_t n); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Get the numer of services running on the supplied lcore. * * @param lcore Id of the service core. @@ -386,19 +324,16 @@ int32_t __rte_experimental rte_service_lcore_list(uint32_t array[], uint32_t n); * @retval -EINVAL Invalid lcore provided * @retval -ENOTSUP The provided lcore is not a service core. */ -int32_t __rte_experimental rte_service_lcore_count_services(uint32_t lcore); +int32_t rte_service_lcore_count_services(uint32_t lcore); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Dumps any information available about the service. When id is UINT32_MAX, * this function dumps info for all services. * * @retval 0 Statistics have been successfully dumped * @retval -EINVAL Invalid service id provided */ -int32_t __rte_experimental rte_service_dump(FILE *f, uint32_t id); +int32_t rte_service_dump(FILE *f, uint32_t id); /** * Returns the number of cycles that this service has consumed @@ -411,28 +346,22 @@ int32_t __rte_experimental rte_service_dump(FILE *f, uint32_t id); #define RTE_SERVICE_ATTR_CALL_COUNT 1 /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Get an attribute from a service. * * @retval 0 Success, the attribute value has been written to *attr_value*. * -EINVAL Invalid id, attr_id or attr_value was NULL. */ -int32_t __rte_experimental rte_service_attr_get(uint32_t id, uint32_t attr_id, +int32_t rte_service_attr_get(uint32_t id, uint32_t attr_id, uint32_t *attr_value); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Reset all attribute values of a service. * * @param id The service to reset all statistics of * @retval 0 Successfully reset attributes * -EINVAL Invalid service id provided */ -int32_t __rte_experimental rte_service_attr_reset_all(uint32_t id); +int32_t rte_service_attr_reset_all(uint32_t id); #ifdef __cplusplus } diff --git a/lib/librte_eal/common/include/rte_service_component.h b/lib/librte_eal/common/include/rte_service_component.h index 9ba4aa29..c12adbc2 100644 --- a/lib/librte_eal/common/include/rte_service_component.h +++ b/lib/librte_eal/common/include/rte_service_component.h @@ -13,17 +13,11 @@ #include /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Signature of callback function to run a service. */ typedef int32_t (*rte_service_func)(void *args); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * The specification of a service. * * This struct contains metadata about the service itself, the callback @@ -47,9 +41,6 @@ struct rte_service_spec { }; /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Register a new service. * * A service represents a component that the requires CPU time periodically to @@ -73,14 +64,10 @@ struct rte_service_spec { * -EINVAL Attempted to register an invalid service (eg, no callback * set) */ -int32_t __rte_experimental -rte_service_component_register(const struct rte_service_spec *spec, - uint32_t *service_id); +int32_t rte_service_component_register(const struct rte_service_spec *spec, + uint32_t *service_id); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Unregister a service component. 
* * The service being removed must be stopped before calling this function. @@ -89,12 +76,9 @@ rte_service_component_register(const struct rte_service_spec *spec, * @retval -EBUSY The service is currently running, stop the service before * calling unregister. No action has been taken. */ -int32_t __rte_experimental rte_service_component_unregister(uint32_t id); +int32_t rte_service_component_unregister(uint32_t id); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Private function to allow EAL to initialized default mappings. * * This function iterates all the services, and maps then to the available @@ -107,12 +91,9 @@ int32_t __rte_experimental rte_service_component_unregister(uint32_t id); * @retval -ENODEV Error in enabling service lcore on a service * @retval -ENOEXEC Error when starting services */ -int32_t __rte_experimental rte_service_start_with_defaults(void); +int32_t rte_service_start_with_defaults(void); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Set the backend runstate of a component. * * This function allows services to be registered at startup, but not yet @@ -124,13 +105,9 @@ int32_t __rte_experimental rte_service_start_with_defaults(void); * * @retval 0 Success */ -int32_t __rte_experimental rte_service_component_runstate_set(uint32_t id, - uint32_t runstate); +int32_t rte_service_component_runstate_set(uint32_t id, uint32_t runstate); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * Initialize the service library. * * In order to use the service library, it must be initialized. EAL initializes @@ -142,14 +119,11 @@ int32_t __rte_experimental rte_service_component_runstate_set(uint32_t id, int32_t rte_service_init(void); /** - * @warning - * @b EXPERIMENTAL: this API may change without prior notice - * * @internal Free up the memory that has been initialized. * This routine is to be invoked prior to process termination. * * @retval None */ -void __rte_experimental rte_service_finalize(void); +void rte_service_finalize(void); #endif /* _RTE_SERVICE_PRIVATE_H_ */ diff --git a/lib/librte_eal/common/include/rte_string_fns.h b/lib/librte_eal/common/include/rte_string_fns.h index e97047a4..97597a14 100644 --- a/lib/librte_eal/common/include/rte_string_fns.h +++ b/lib/librte_eal/common/include/rte_string_fns.h @@ -15,6 +15,8 @@ extern "C" { #endif +#include + /** * Takes string "string" parameter and splits it at character "delim" * up to maxtokens-1 times - to give "maxtokens" resulting tokens. 
Like @@ -45,6 +47,35 @@ int rte_strsplit(char *string, int stringlen, char **tokens, int maxtokens, char delim); +/** + * @internal + * DPDK-specific version of strlcpy for systems without + * libc or libbsd copies of the function + */ +static inline size_t +rte_strlcpy(char *dst, const char *src, size_t size) +{ + return (size_t)snprintf(dst, size, "%s", src); +} + +/* pull in a strlcpy function */ +#ifdef RTE_EXEC_ENV_BSDAPP +#include +#ifndef __BSD_VISIBLE /* non-standard functions are hidden */ +#define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) +#endif + + +#else /* non-BSD platforms */ +#ifdef RTE_USE_LIBBSD +#include + +#else /* no BSD header files, create own */ +#define strlcpy(dst, src, size) rte_strlcpy(dst, src, size) + +#endif /* RTE_USE_LIBBSD */ +#endif /* BSDAPP */ + #ifdef __cplusplus } #endif diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h index 54e97706..6e2a2362 100644 --- a/lib/librte_eal/common/include/rte_version.h +++ b/lib/librte_eal/common/include/rte_version.h @@ -32,12 +32,12 @@ extern "C" { /** * Minor version/month number i.e. the mm in yy.mm.z */ -#define RTE_VER_MONTH 02 +#define RTE_VER_MONTH 05 /** * Patch level number i.e. the z in yy.mm.z */ -#define RTE_VER_MINOR 1 +#define RTE_VER_MINOR 0 /** * Extra string to be appended to version number diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h index e981a622..f90972fa 100644 --- a/lib/librte_eal/common/include/rte_vfio.h +++ b/lib/librte_eal/common/include/rte_vfio.h @@ -5,6 +5,15 @@ #ifndef _RTE_VFIO_H_ #define _RTE_VFIO_H_ +/** + * @file + * RTE VFIO. This library provides various VFIO related utility functions. + */ + +#ifdef __cplusplus +extern "C" { +#endif + /* * determine if VFIO is present on the system */ @@ -28,6 +37,20 @@ #define VFIO_NOIOMMU_MODE \ "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode" +/* NOIOMMU is defined from kernel version 4.5 onwards */ +#ifdef VFIO_NOIOMMU_IOMMU +#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU +#else +#define RTE_VFIO_NOIOMMU 8 +#endif + +#else /* not VFIO_PRESENT */ + +/* we don't need an actual definition, only pointer is used */ +struct vfio_device_info; + +#endif /* VFIO_PRESENT */ + /** * Setup vfio_cfg for the device identified by its address. * It discovers the configured I/O MMU groups or sets a new one for the device. @@ -119,10 +142,244 @@ int rte_vfio_is_enabled(const char *modname); */ int rte_vfio_noiommu_is_enabled(void); -/* remove group fd from internal VFIO group fd array */ +/** + * Remove group fd from internal VFIO group fd array/ + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param vfio_group_fd + * VFIO Grouup FD. + * + * @return + * 0 on success. + * <0 on failure. + */ int rte_vfio_clear_group(int vfio_group_fd); -#endif /* VFIO_PRESENT */ +/** + * Map memory region for use with VFIO. + * + * @note Require at least one device to be attached at the time of + * mapping. DMA maps done via this API will only apply to default + * container and will not apply to any of the containers created + * via rte_vfio_container_create(). + * + * @param vaddr + * Starting virtual address of memory to be mapped. + * + * @param iova + * Starting IOVA address of memory to be mapped. + * + * @param len + * Length of memory segment being mapped. + * + * @return + * 0 if success. + * -1 on error. 
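The strlcpy() fallback added in rte_string_fns.h above means DPDK code can rely on BSD-style copy-and-truncate semantics whether or not libc/libbsd provide the function; a small sketch (the buffer size is arbitrary):

#include <stdio.h>
#include <rte_string_fns.h>

static void
copy_devname_example(const char *src)
{
	char buf[16];

	/* strlcpy() always NUL-terminates; the return value is the length of
	 * src, so a value >= sizeof(buf) signals truncation. */
	if (strlcpy(buf, src, sizeof(buf)) >= sizeof(buf))
		printf("name '%s' truncated to '%s'\n", src, buf);
}
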
+ */ +int __rte_experimental +rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len); + + +/** + * Unmap memory region from VFIO. + * + * @param vaddr + * Starting virtual address of memory to be unmapped. + * + * @param iova + * Starting IOVA address of memory to be unmapped. + * + * @param len + * Length of memory segment being unmapped. + * + * @return + * 0 if success. + * -1 on error. + */ + +int __rte_experimental +rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len); +/** + * Parse IOMMU group number for a device + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param sysfs_base + * sysfs path prefix. + * + * @param dev_addr + * device location. + * + * @param iommu_group_num + * iommu group number + * + * @return + * >0 on success + * 0 for non-existent group or VFIO + * <0 for errors + */ +int __rte_experimental +rte_vfio_get_group_num(const char *sysfs_base, + const char *dev_addr, int *iommu_group_num); + +/** + * Open VFIO container fd or get an existing one + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @return + * > 0 container fd + * < 0 for errors + */ +int __rte_experimental +rte_vfio_get_container_fd(void); + +/** + * Open VFIO group fd or get an existing one + * + * This function is only relevant to linux and will return + * an error on BSD. + * + * @param iommu_group_num + * iommu group number + * + * @return + * > 0 group fd + * < 0 for errors + */ +int __rte_experimental +rte_vfio_get_group_fd(int iommu_group_num); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Create a new container for device binding. + * + * @note Any newly allocated DPDK memory will not be mapped into these + * containers by default, user needs to manage DMA mappings for + * any container created by this API. + * + * @return + * the container fd if successful + * <0 if failed + */ +int __rte_experimental +rte_vfio_container_create(void); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Destroy the container, unbind all vfio groups within it. + * + * @param container_fd + * the container fd to destroy + * + * @return + * 0 if successful + * <0 if failed + */ +int __rte_experimental +rte_vfio_container_destroy(int container_fd); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Bind a IOMMU group to a container. + * + * @param container_fd + * the container's fd + * + * @param iommu_group_num + * the iommu group number to bind to container + * + * @return + * group fd if successful + * <0 if failed + */ +int __rte_experimental +rte_vfio_container_group_bind(int container_fd, int iommu_group_num); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Unbind a IOMMU group from a container. + * + * @param container_fd + * the container fd of container + * + * @param iommu_group_num + * the iommu group number to delete from container + * + * @return + * 0 if successful + * <0 if failed + */ +int __rte_experimental +rte_vfio_container_group_unbind(int container_fd, int iommu_group_num); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Perform DMA mapping for devices in a container. + * + * @param container_fd + * the specified container fd + * + * @param vaddr + * Starting virtual address of memory to be mapped. 
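To illustrate the container API declared here, a minimal sketch that creates a container, binds an IOMMU group and maps one buffer for DMA; the group number, buffer and IOVA are placeholders, the device is assumed to be bound to a VFIO driver already, and these calls are experimental:

#include <stdint.h>
#include <rte_vfio.h>

static int
vfio_container_map_example(int iommu_group_num, void *buf, uint64_t iova,
		uint64_t len)
{
	int container_fd;

	container_fd = rte_vfio_container_create();
	if (container_fd < 0)
		return -1;
	if (rte_vfio_container_group_bind(container_fd, iommu_group_num) < 0) {
		rte_vfio_container_destroy(container_fd);
		return -1;
	}
	/* user-created containers get no DMA mappings by default, so the
	 * application must map its buffers explicitly */
	if (rte_vfio_container_dma_map(container_fd,
			(uint64_t)(uintptr_t)buf, iova, len) < 0) {
		rte_vfio_container_destroy(container_fd);
		return -1;
	}
	return container_fd;
}
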
+ * + * @param iova + * Starting IOVA address of memory to be mapped. + * + * @param len + * Length of memory segment being mapped. + * + * @return + * 0 if successful + * <0 if failed + */ +int __rte_experimental +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, + uint64_t iova, uint64_t len); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Perform DMA unmapping for devices in a container. + * + * @param container_fd + * the specified container fd + * + * @param vaddr + * Starting virtual address of memory to be unmapped. + * + * @param iova + * Starting IOVA address of memory to be unmapped. + * + * @param len + * Length of memory segment being unmapped. + * + * @return + * 0 if successful + * <0 if failed + */ +int __rte_experimental +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, + uint64_t iova, uint64_t len); + +#ifdef __cplusplus +} +#endif #endif /* _RTE_VFIO_H_ */ diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c index 0cadc8af..9bfe9b9b 100644 --- a/lib/librte_eal/common/malloc_elem.c +++ b/lib/librte_eal/common/malloc_elem.c @@ -1,10 +1,12 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2010-2014 Intel Corporation */ +#include #include #include #include #include +#include #include #include @@ -16,21 +18,21 @@ #include #include +#include "eal_memalloc.h" #include "malloc_elem.h" #include "malloc_heap.h" -#define MIN_DATA_SIZE (RTE_CACHE_LINE_SIZE) - /* * Initialize a general malloc_elem header structure */ void -malloc_elem_init(struct malloc_elem *elem, - struct malloc_heap *heap, const struct rte_memseg *ms, size_t size) +malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap, + struct rte_memseg_list *msl, size_t size) { elem->heap = heap; - elem->ms = ms; + elem->msl = msl; elem->prev = NULL; + elem->next = NULL; memset(&elem->free_list, 0, sizeof(elem->free_list)); elem->state = ELEM_FREE; elem->size = size; @@ -39,15 +41,74 @@ malloc_elem_init(struct malloc_elem *elem, set_trailer(elem); } +void +malloc_elem_insert(struct malloc_elem *elem) +{ + struct malloc_elem *prev_elem, *next_elem; + struct malloc_heap *heap = elem->heap; + + /* first and last elements must be both NULL or both non-NULL */ + if ((heap->first == NULL) != (heap->last == NULL)) { + RTE_LOG(ERR, EAL, "Heap is probably corrupt\n"); + return; + } + + if (heap->first == NULL && heap->last == NULL) { + /* if empty heap */ + heap->first = elem; + heap->last = elem; + prev_elem = NULL; + next_elem = NULL; + } else if (elem < heap->first) { + /* if lower than start */ + prev_elem = NULL; + next_elem = heap->first; + heap->first = elem; + } else if (elem > heap->last) { + /* if higher than end */ + prev_elem = heap->last; + next_elem = NULL; + heap->last = elem; + } else { + /* the new memory is somewhere inbetween start and end */ + uint64_t dist_from_start, dist_from_end; + + dist_from_end = RTE_PTR_DIFF(heap->last, elem); + dist_from_start = RTE_PTR_DIFF(elem, heap->first); + + /* check which is closer, and find closest list entries */ + if (dist_from_start < dist_from_end) { + prev_elem = heap->first; + while (prev_elem->next < elem) + prev_elem = prev_elem->next; + next_elem = prev_elem->next; + } else { + next_elem = heap->last; + while (next_elem->prev > elem) + next_elem = next_elem->prev; + prev_elem = next_elem->prev; + } + } + + /* insert new element */ + elem->prev = prev_elem; + elem->next = next_elem; + if (prev_elem) + prev_elem->next = elem; + if 
(next_elem) + next_elem->prev = elem; +} + /* - * Initialize a dummy malloc_elem header for the end-of-memseg marker + * Attempt to find enough physically contiguous memory in this block to store + * our data. Assume that element has at least enough space to fit in the data, + * so we just check the page addresses. */ -void -malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev) +static bool +elem_check_phys_contig(const struct rte_memseg_list *msl, + void *start, size_t size) { - malloc_elem_init(elem, prev->heap, prev->ms, 0); - elem->prev = prev; - elem->state = ELEM_BUSY; /* mark busy so its never merged */ + return eal_memalloc_is_contig(msl, start, size); } /* @@ -57,27 +118,59 @@ malloc_elem_mkend(struct malloc_elem *elem, struct malloc_elem *prev) */ static void * elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align, - size_t bound) + size_t bound, bool contig) { - const size_t bmask = ~(bound - 1); - uintptr_t end_pt = (uintptr_t)elem + - elem->size - MALLOC_ELEM_TRAILER_LEN; - uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align); - uintptr_t new_elem_start; - - /* check boundary */ - if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) { - end_pt = RTE_ALIGN_FLOOR(end_pt, bound); - new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align); - end_pt = new_data_start + size; - if (((end_pt - 1) & bmask) != (new_data_start & bmask)) - return NULL; - } + size_t elem_size = elem->size; + + /* + * we're allocating from the end, so adjust the size of element by + * alignment size. + */ + while (elem_size >= size) { + const size_t bmask = ~(bound - 1); + uintptr_t end_pt = (uintptr_t)elem + + elem_size - MALLOC_ELEM_TRAILER_LEN; + uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), + align); + uintptr_t new_elem_start; + + /* check boundary */ + if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) { + end_pt = RTE_ALIGN_FLOOR(end_pt, bound); + new_data_start = RTE_ALIGN_FLOOR((end_pt - size), + align); + end_pt = new_data_start + size; + + if (((end_pt - 1) & bmask) != (new_data_start & bmask)) + return NULL; + } - new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN; + new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN; - /* if the new start point is before the exist start, it won't fit */ - return (new_elem_start < (uintptr_t)elem) ? NULL : (void *)new_elem_start; + /* if the new start point is before the exist start, + * it won't fit + */ + if (new_elem_start < (uintptr_t)elem) + return NULL; + + if (contig) { + size_t new_data_size = end_pt - new_data_start; + + /* + * if physical contiguousness was requested and we + * couldn't fit all data into one physically contiguous + * block, try again with lower addresses. 
+ */ + if (!elem_check_phys_contig(elem->msl, + (void *)new_data_start, + new_data_size)) { + elem_size -= align; + continue; + } + } + return (void *)new_elem_start; + } + return NULL; } /* @@ -86,9 +179,9 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align, */ int malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align, - size_t bound) + size_t bound, bool contig) { - return elem_start_pt(elem, size, align, bound) != NULL; + return elem_start_pt(elem, size, align, bound, contig) != NULL; } /* @@ -98,17 +191,57 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align, static void split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt) { - struct malloc_elem *next_elem = RTE_PTR_ADD(elem, elem->size); + struct malloc_elem *next_elem = elem->next; const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem; const size_t new_elem_size = elem->size - old_elem_size; - malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size); + malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size); split_pt->prev = elem; - next_elem->prev = split_pt; + split_pt->next = next_elem; + if (next_elem) + next_elem->prev = split_pt; + else + elem->heap->last = split_pt; + elem->next = split_pt; elem->size = old_elem_size; set_trailer(elem); } +/* + * our malloc heap is a doubly linked list, so doubly remove our element. + */ +static void __rte_unused +remove_elem(struct malloc_elem *elem) +{ + struct malloc_elem *next, *prev; + next = elem->next; + prev = elem->prev; + + if (next) + next->prev = prev; + else + elem->heap->last = prev; + if (prev) + prev->next = next; + else + elem->heap->first = next; + + elem->prev = NULL; + elem->next = NULL; +} + +static int +next_elem_is_adjacent(struct malloc_elem *elem) +{ + return elem->next == RTE_PTR_ADD(elem, elem->size); +} + +static int +prev_elem_is_adjacent(struct malloc_elem *elem) +{ + return elem == RTE_PTR_ADD(elem->prev, elem->prev->size); +} + /* * Given an element size, compute its freelist index. * We free an element into the freelist containing similarly-sized elements. @@ -162,8 +295,8 @@ malloc_elem_free_list_insert(struct malloc_elem *elem) /* * Remove the specified element from its heap's free list. 
*/ -static void -elem_free_list_remove(struct malloc_elem *elem) +void +malloc_elem_free_list_remove(struct malloc_elem *elem) { LIST_REMOVE(elem, free_list); } @@ -176,14 +309,15 @@ elem_free_list_remove(struct malloc_elem *elem) */ struct malloc_elem * malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align, - size_t bound) + size_t bound, bool contig) { - struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound); + struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound, + contig); const size_t old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem; const size_t trailer_size = elem->size - old_elem_size - size - MALLOC_ELEM_OVERHEAD; - elem_free_list_remove(elem); + malloc_elem_free_list_remove(elem); if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { /* split it, too much free space after elem */ @@ -192,6 +326,9 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align, split_elem(elem, new_free_elem); malloc_elem_free_list_insert(new_free_elem); + + if (elem == elem->heap->last) + elem->heap->last = new_free_elem; } if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { @@ -230,9 +367,62 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align, static inline void join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2) { - struct malloc_elem *next = RTE_PTR_ADD(elem2, elem2->size); + struct malloc_elem *next = elem2->next; elem1->size += elem2->size; - next->prev = elem1; + if (next) + next->prev = elem1; + else + elem1->heap->last = elem1; + elem1->next = next; +} + +struct malloc_elem * +malloc_elem_join_adjacent_free(struct malloc_elem *elem) +{ + /* + * check if next element exists, is adjacent and is free, if so join + * with it, need to remove from free list. + */ + if (elem->next != NULL && elem->next->state == ELEM_FREE && + next_elem_is_adjacent(elem)) { + void *erase; + + /* we will want to erase the trailer and header */ + erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN); + + /* remove from free list, join to this one */ + malloc_elem_free_list_remove(elem->next); + join_elem(elem, elem->next); + + /* erase header and trailer */ + memset(erase, 0, MALLOC_ELEM_OVERHEAD); + } + + /* + * check if prev element exists, is adjacent and is free, if so join + * with it, need to remove from free list. + */ + if (elem->prev != NULL && elem->prev->state == ELEM_FREE && + prev_elem_is_adjacent(elem)) { + struct malloc_elem *new_elem; + void *erase; + + /* we will want to erase trailer and header */ + erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN); + + /* remove from free list, join to this one */ + malloc_elem_free_list_remove(elem->prev); + + new_elem = elem->prev; + join_elem(new_elem, elem); + + /* erase header and trailer */ + memset(erase, 0, MALLOC_ELEM_OVERHEAD); + + elem = new_elem; + } + + return elem; } /* @@ -240,43 +430,74 @@ join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2) * blocks either immediately before or immediately after newly freed block * are also free, the blocks are merged together. 
*/ -int +struct malloc_elem * malloc_elem_free(struct malloc_elem *elem) { - if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) - return -1; + void *ptr; + size_t data_len; - rte_spinlock_lock(&(elem->heap->lock)); - size_t sz = elem->size - sizeof(*elem) - MALLOC_ELEM_TRAILER_LEN; - uint8_t *ptr = (uint8_t *)&elem[1]; - struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size); - if (next->state == ELEM_FREE){ - /* remove from free list, join to this one */ - elem_free_list_remove(next); - join_elem(elem, next); - sz += (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN); - } + ptr = RTE_PTR_ADD(elem, sizeof(*elem)); + data_len = elem->size - MALLOC_ELEM_OVERHEAD; + + elem = malloc_elem_join_adjacent_free(elem); - /* check if previous element is free, if so join with it and return, - * need to re-insert in free list, as that element's size is changing - */ - if (elem->prev != NULL && elem->prev->state == ELEM_FREE) { - elem_free_list_remove(elem->prev); - join_elem(elem->prev, elem); - sz += (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN); - ptr -= (sizeof(*elem) + MALLOC_ELEM_TRAILER_LEN); - elem = elem->prev; - } malloc_elem_free_list_insert(elem); + elem->pad = 0; + /* decrease heap's count of allocated elements */ elem->heap->alloc_count--; - memset(ptr, 0, sz); + memset(ptr, 0, data_len); - rte_spinlock_unlock(&(elem->heap->lock)); + return elem; +} - return 0; +/* assume all checks were already done */ +void +malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len) +{ + struct malloc_elem *hide_start, *hide_end, *prev, *next; + size_t len_before, len_after; + + hide_start = start; + hide_end = RTE_PTR_ADD(start, len); + + prev = elem->prev; + next = elem->next; + + /* we cannot do anything with non-adjacent elements */ + if (next && next_elem_is_adjacent(elem)) { + len_after = RTE_PTR_DIFF(next, hide_end); + if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split after */ + split_elem(elem, hide_end); + + malloc_elem_free_list_insert(hide_end); + } else if (len_after > 0) { + RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n"); + return; + } + } + + /* we cannot do anything with non-adjacent elements */ + if (prev && prev_elem_is_adjacent(elem)) { + len_before = RTE_PTR_DIFF(hide_start, elem); + if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split before */ + split_elem(elem, hide_start); + + prev = elem; + elem = hide_start; + + malloc_elem_free_list_insert(prev); + } else if (len_before > 0) { + RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n"); + return; + } + } + + remove_elem(elem); } /* @@ -287,22 +508,23 @@ int malloc_elem_resize(struct malloc_elem *elem, size_t size) { const size_t new_size = size + elem->pad + MALLOC_ELEM_OVERHEAD; + /* if we request a smaller size, then always return ok */ if (elem->size >= new_size) return 0; - struct malloc_elem *next = RTE_PTR_ADD(elem, elem->size); - rte_spinlock_lock(&elem->heap->lock); - if (next ->state != ELEM_FREE) - goto err_return; - if (elem->size + next->size < new_size) - goto err_return; + /* check if there is a next element, it's free and adjacent */ + if (!elem->next || elem->next->state != ELEM_FREE || + !next_elem_is_adjacent(elem)) + return -1; + if (elem->size + elem->next->size < new_size) + return -1; /* we now know the element fits, so remove from free list, * join the two */ - elem_free_list_remove(next); - join_elem(elem, next); + malloc_elem_free_list_remove(elem->next); + join_elem(elem, elem->next); if (elem->size - new_size >= 
MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) { /* now we have a big block together. Lets cut it down a bit, by splitting */ @@ -311,10 +533,28 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size) split_elem(elem, split_pt); malloc_elem_free_list_insert(split_pt); } - rte_spinlock_unlock(&elem->heap->lock); return 0; +} -err_return: - rte_spinlock_unlock(&elem->heap->lock); - return -1; +static inline const char * +elem_state_to_str(enum elem_state state) +{ + switch (state) { + case ELEM_PAD: + return "PAD"; + case ELEM_BUSY: + return "BUSY"; + case ELEM_FREE: + return "FREE"; + } + return "ERROR"; +} + +void +malloc_elem_dump(const struct malloc_elem *elem, FILE *f) +{ + fprintf(f, "Malloc element at %p (%s)\n", elem, + elem_state_to_str(elem->state)); + fprintf(f, " len: 0x%zx pad: 0x%" PRIx32 "\n", elem->size, elem->pad); + fprintf(f, " prev: %p next: %p\n", elem->prev, elem->next); } diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h index f4c1c7a9..7331af9c 100644 --- a/lib/librte_eal/common/malloc_elem.h +++ b/lib/librte_eal/common/malloc_elem.h @@ -5,7 +5,11 @@ #ifndef MALLOC_ELEM_H_ #define MALLOC_ELEM_H_ -#include +#include + +#include + +#define MIN_DATA_SIZE (RTE_CACHE_LINE_SIZE) /* dummy definition of struct so we can use pointers to it in malloc_elem struct */ struct malloc_heap; @@ -18,9 +22,13 @@ enum elem_state { struct malloc_elem { struct malloc_heap *heap; - struct malloc_elem *volatile prev; /* points to prev elem in memseg */ - LIST_ENTRY(malloc_elem) free_list; /* list of free elements in heap */ - const struct rte_memseg *ms; + struct malloc_elem *volatile prev; + /**< points to prev elem in memseg */ + struct malloc_elem *volatile next; + /**< points to next elem in memseg */ + LIST_ENTRY(malloc_elem) free_list; + /**< list of free elements in heap */ + struct rte_memseg_list *msl; volatile enum elem_state state; uint32_t pad; size_t size; @@ -107,15 +115,11 @@ malloc_elem_from_data(const void *data) void malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap, - const struct rte_memseg *ms, + struct rte_memseg_list *msl, size_t size); -/* - * initialise a dummy malloc_elem header for the end-of-memseg marker - */ void -malloc_elem_mkend(struct malloc_elem *elem, - struct malloc_elem *prev_free); +malloc_elem_insert(struct malloc_elem *elem); /* * return true if the current malloc_elem can hold a block of data @@ -123,7 +127,7 @@ malloc_elem_mkend(struct malloc_elem *elem, */ int malloc_elem_can_hold(struct malloc_elem *elem, size_t size, - unsigned align, size_t bound); + unsigned int align, size_t bound, bool contig); /* * reserve a block of data in an existing malloc_elem. If the malloc_elem @@ -131,16 +135,19 @@ malloc_elem_can_hold(struct malloc_elem *elem, size_t size, */ struct malloc_elem * malloc_elem_alloc(struct malloc_elem *elem, size_t size, - unsigned align, size_t bound); + unsigned int align, size_t bound, bool contig); /* * free a malloc_elem block by adding it to the free list. If the * blocks either immediately before or immediately after newly freed block * are also free, the blocks are merged together. */ -int +struct malloc_elem * malloc_elem_free(struct malloc_elem *elem); +struct malloc_elem * +malloc_elem_join_adjacent_free(struct malloc_elem *elem); + /* * attempt to resize a malloc_elem by expanding into any free space * immediately after it in memory. 
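The free lists referenced throughout this file group elements by size, so a lookup only has to scan a handful of buckets of similarly-sized blocks. The following is an illustrative log2-bucketing sketch only; the constants (4 KiB minimum bucket, four lists per power-of-two step, 13 lists) are assumptions for illustration, not the heap's actual index function:

#include <stddef.h>

#define EXAMPLE_MINSIZE_LOG2   12  /* buckets start at 4 KiB */
#define EXAMPLE_LOG2_INCREMENT 4   /* buckets per power-of-two range */
#define EXAMPLE_NUM_FREELISTS  13

static size_t
example_free_list_index(size_t size)
{
	size_t log2, index;

	if (size <= (1UL << EXAMPLE_MINSIZE_LOG2))
		return 0;

	/* index grows with log2(size), clamped to the number of lists */
	log2 = sizeof(size) * 8 - __builtin_clzl(size);
	index = (log2 - EXAMPLE_MINSIZE_LOG2 + EXAMPLE_LOG2_INCREMENT - 1) /
			EXAMPLE_LOG2_INCREMENT;
	return index < EXAMPLE_NUM_FREELISTS ? index : EXAMPLE_NUM_FREELISTS - 1;
}
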
@@ -148,6 +155,18 @@ malloc_elem_free(struct malloc_elem *elem); int malloc_elem_resize(struct malloc_elem *elem, size_t size); +void +malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len); + +void +malloc_elem_free_list_remove(struct malloc_elem *elem); + +/* + * dump contents of malloc elem to a file. + */ +void +malloc_elem_dump(const struct malloc_elem *elem, FILE *f); + /* * Given an element size, compute its freelist index. */ diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c index 7aafc880..d6cf3af8 100644 --- a/lib/librte_eal/common/malloc_heap.c +++ b/lib/librte_eal/common/malloc_heap.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -20,9 +21,13 @@ #include #include #include +#include +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" #include "malloc_elem.h" #include "malloc_heap.h" +#include "malloc_mp.h" static unsigned check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) @@ -62,26 +67,51 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) } /* - * Expand the heap with a memseg. - * This reserves the zone and sets a dummy malloc_elem header at the end - * to prevent overflow. The rest of the zone is added to free list as a single - * large free block + * Expand the heap with a memory area. */ -static void -malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms) +static struct malloc_elem * +malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl, + void *start, size_t len) { - /* allocate the memory block headers, one at end, one at start */ - struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr; - struct malloc_elem *end_elem = RTE_PTR_ADD(ms->addr, - ms->len - MALLOC_ELEM_OVERHEAD); - end_elem = RTE_PTR_ALIGN_FLOOR(end_elem, RTE_CACHE_LINE_SIZE); - const size_t elem_size = (uintptr_t)end_elem - (uintptr_t)start_elem; + struct malloc_elem *elem = start; + + malloc_elem_init(elem, heap, msl, len); + + malloc_elem_insert(elem); + + elem = malloc_elem_join_adjacent_free(elem); - malloc_elem_init(start_elem, heap, ms, elem_size); - malloc_elem_mkend(end_elem, start_elem); - malloc_elem_free_list_insert(start_elem); + malloc_elem_free_list_insert(elem); - heap->total_size += elem_size; + return elem; +} + +static int +malloc_add_seg(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *found_msl; + struct malloc_heap *heap; + int msl_idx; + + heap = &mcfg->malloc_heaps[msl->socket_id]; + + /* msl is const, so find it */ + msl_idx = msl - mcfg->memsegs; + + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + found_msl = &mcfg->memsegs[msl_idx]; + + malloc_heap_add_memory(heap, found_msl, ms->addr, len); + + heap->total_size += len; + + RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20, + msl->socket_id); + return 0; } /* @@ -92,7 +122,7 @@ malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms) */ static struct malloc_elem * find_suitable_element(struct malloc_heap *heap, size_t size, - unsigned flags, size_t align, size_t bound) + unsigned int flags, size_t align, size_t bound, bool contig) { size_t idx; struct malloc_elem *elem, *alt_elem = NULL; @@ -101,8 +131,10 @@ find_suitable_element(struct malloc_heap *heap, size_t size, idx < RTE_HEAP_NUM_FREELISTS; idx++) { for (elem = LIST_FIRST(&heap->free_head[idx]); !!elem; elem = 
LIST_NEXT(elem, free_list)) { - if (malloc_elem_can_hold(elem, size, align, bound)) { - if (check_hugepage_sz(flags, elem->ms->hugepage_sz)) + if (malloc_elem_can_hold(elem, size, align, bound, + contig)) { + if (check_hugepage_sz(flags, + elem->msl->page_sz)) return elem; if (alt_elem == NULL) alt_elem = elem; @@ -122,29 +154,639 @@ find_suitable_element(struct malloc_heap *heap, size_t size, * scan fails. Once the new memseg is added, it re-scans and should return * the new element after releasing the lock. */ -void * -malloc_heap_alloc(struct malloc_heap *heap, - const char *type __attribute__((unused)), size_t size, unsigned flags, - size_t align, size_t bound) +static void * +heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size, + unsigned int flags, size_t align, size_t bound, bool contig) { struct malloc_elem *elem; size = RTE_CACHE_LINE_ROUNDUP(size); align = RTE_CACHE_LINE_ROUNDUP(align); - rte_spinlock_lock(&heap->lock); - - elem = find_suitable_element(heap, size, flags, align, bound); + elem = find_suitable_element(heap, size, flags, align, bound, contig); if (elem != NULL) { - elem = malloc_elem_alloc(elem, size, align, bound); + elem = malloc_elem_alloc(elem, size, align, bound, contig); + /* increase heap's count of allocated elements */ heap->alloc_count++; } - rte_spinlock_unlock(&heap->lock); return elem == NULL ? NULL : (void *)(&elem[1]); } +/* this function is exposed in malloc_mp.h */ +void +rollback_expand_heap(struct rte_memseg **ms, int n_segs, + struct malloc_elem *elem, void *map_addr, size_t map_len) +{ + if (elem != NULL) { + malloc_elem_free_list_remove(elem); + malloc_elem_hide_region(elem, map_addr, map_len); + } + + eal_memalloc_free_seg_bulk(ms, n_segs); +} + +/* this function is exposed in malloc_mp.h */ +struct malloc_elem * +alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig, struct rte_memseg **ms, int n_segs) +{ + struct rte_memseg_list *msl; + struct malloc_elem *elem = NULL; + size_t alloc_sz; + int allocd_pages; + void *ret, *map_addr; + + alloc_sz = (size_t)pg_sz * n_segs; + + /* first, check if we're allowed to allocate this memory */ + if (eal_memalloc_mem_alloc_validate(socket, + heap->total_size + alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n"); + return NULL; + } + + allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz, + socket, true); + + /* make sure we've allocated our pages... 
*/ + if (allocd_pages < 0) + return NULL; + + map_addr = ms[0]->addr; + msl = rte_mem_virt2memseg_list(map_addr); + + /* check if we wanted contiguous memory but didn't get it */ + if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n", + __func__); + goto fail; + } + + /* add newly minted memsegs to malloc heap */ + elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz); + + /* try once more, as now we have allocated new memory */ + ret = find_suitable_element(heap, elt_size, flags, align, bound, + contig); + + if (ret == NULL) + goto fail; + + return elem; + +fail: + rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz); + return NULL; +} + +static int +try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz, + size_t elt_size, int socket, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct malloc_elem *elem; + struct rte_memseg **ms; + void *map_addr; + size_t alloc_sz; + int n_segs; + bool callback_triggered = false; + + alloc_sz = RTE_ALIGN_CEIL(align + elt_size + + MALLOC_ELEM_TRAILER_LEN, pg_sz); + n_segs = alloc_sz / pg_sz; + + /* we can't know in advance how many pages we'll need, so we malloc */ + ms = malloc(sizeof(*ms) * n_segs); + + memset(ms, 0, sizeof(*ms) * n_segs); + + if (ms == NULL) + return -1; + + elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align, + bound, contig, ms, n_segs); + + if (elem == NULL) + goto free_ms; + + map_addr = ms[0]->addr; + + /* notify user about changes in memory map */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz); + + /* notify other processes that this has happened */ + if (request_sync()) { + /* we couldn't ensure all processes have mapped memory, + * so free it back and notify everyone that it's been + * freed back. + * + * technically, we could've avoided adding memory addresses to + * the map, but that would've led to inconsistent behavior + * between primary and secondary processes, as those get + * callbacks during sync. therefore, force primary process to + * do alloc-and-rollback syncs as well. 
+ */ + callback_triggered = true; + goto free_elem; + } + heap->total_size += alloc_sz; + + RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n", + socket, alloc_sz >> 20ULL); + + free(ms); + + return 0; + +free_elem: + if (callback_triggered) + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + map_addr, alloc_sz); + + rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz); + + request_sync(); +free_ms: + free(ms); + + return -1; +} + +static int +try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz, + size_t elt_size, int socket, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct malloc_mp_req req; + int req_result; + + memset(&req, 0, sizeof(req)); + + req.t = REQ_TYPE_ALLOC; + req.alloc_req.align = align; + req.alloc_req.bound = bound; + req.alloc_req.contig = contig; + req.alloc_req.flags = flags; + req.alloc_req.elt_size = elt_size; + req.alloc_req.page_sz = pg_sz; + req.alloc_req.socket = socket; + req.alloc_req.heap = heap; /* it's in shared memory */ + + req_result = request_to_primary(&req); + + if (req_result != 0) + return -1; + + if (req.result != REQ_RESULT_SUCCESS) + return -1; + + return 0; +} + +static int +try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int ret; + + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket, + flags, align, bound, contig); + } else { + ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket, + flags, align, bound, contig); + } + + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); + return ret; +} + +static int +compare_pagesz(const void *a, const void *b) +{ + const struct rte_memseg_list * const*mpa = a; + const struct rte_memseg_list * const*mpb = b; + const struct rte_memseg_list *msla = *mpa; + const struct rte_memseg_list *mslb = *mpb; + uint64_t pg_sz_a = msla->page_sz; + uint64_t pg_sz_b = mslb->page_sz; + + if (pg_sz_a < pg_sz_b) + return -1; + if (pg_sz_a > pg_sz_b) + return 1; + return 0; +} + +static int +alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS]; + struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS]; + uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS]; + uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS]; + uint64_t prev_pg_sz; + int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz; + bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0; + unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + void *ret; + + memset(requested_msls, 0, sizeof(requested_msls)); + memset(other_msls, 0, sizeof(other_msls)); + memset(requested_pg_sz, 0, sizeof(requested_pg_sz)); + memset(other_pg_sz, 0, sizeof(other_pg_sz)); + + /* + * go through memseg list and take note of all the page sizes available, + * and if any of them were specifically requested by the user. 
+ */ + n_requested_msls = 0; + n_other_msls = 0; + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->socket_id != socket) + continue; + + if (msl->base_va == NULL) + continue; + + /* if pages of specific size were requested */ + if (size_flags != 0 && check_hugepage_sz(size_flags, + msl->page_sz)) + requested_msls[n_requested_msls++] = msl; + else if (size_flags == 0 || size_hint) + other_msls[n_other_msls++] = msl; + } + + /* sort the lists, smallest first */ + qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]), + compare_pagesz); + qsort(other_msls, n_other_msls, sizeof(other_msls[0]), + compare_pagesz); + + /* now, extract page sizes we are supposed to try */ + prev_pg_sz = 0; + n_requested_pg_sz = 0; + for (i = 0; i < n_requested_msls; i++) { + uint64_t pg_sz = requested_msls[i]->page_sz; + + if (prev_pg_sz != pg_sz) { + requested_pg_sz[n_requested_pg_sz++] = pg_sz; + prev_pg_sz = pg_sz; + } + } + prev_pg_sz = 0; + n_other_pg_sz = 0; + for (i = 0; i < n_other_msls; i++) { + uint64_t pg_sz = other_msls[i]->page_sz; + + if (prev_pg_sz != pg_sz) { + other_pg_sz[n_other_pg_sz++] = pg_sz; + prev_pg_sz = pg_sz; + } + } + + /* finally, try allocating memory of specified page sizes, starting from + * the smallest sizes + */ + for (i = 0; i < n_requested_pg_sz; i++) { + uint64_t pg_sz = requested_pg_sz[i]; + + /* + * do not pass the size hint here, as user expects other page + * sizes first, before resorting to best effort allocation. + */ + if (!try_expand_heap(heap, pg_sz, size, socket, size_flags, + align, bound, contig)) + return 0; + } + if (n_other_pg_sz == 0) + return -1; + + /* now, check if we can reserve anything with size hint */ + ret = find_suitable_element(heap, size, flags, align, bound, contig); + if (ret != NULL) + return 0; + + /* + * we still couldn't reserve memory, so try expanding heap with other + * page sizes, if there are any + */ + for (i = 0; i < n_other_pg_sz; i++) { + uint64_t pg_sz = other_pg_sz[i]; + + if (!try_expand_heap(heap, pg_sz, size, socket, flags, + align, bound, contig)) + return 0; + } + return -1; +} + +/* this will try lower page sizes first */ +static void * +heap_alloc_on_socket(const char *type, size_t size, int socket, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = &mcfg->malloc_heaps[socket]; + unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + void *ret; + + rte_spinlock_lock(&(heap->lock)); + + align = align == 0 ? 1 : align; + + /* for legacy mode, try once and with all flags */ + if (internal_config.legacy_mem) { + ret = heap_alloc(heap, type, size, flags, align, bound, contig); + goto alloc_unlock; + } + + /* + * we do not pass the size hint here, because even if allocation fails, + * we may still be able to allocate memory from appropriate page sizes, + * we just need to request more memory first. 
+ */ + ret = heap_alloc(heap, type, size, size_flags, align, bound, contig); + if (ret != NULL) + goto alloc_unlock; + + if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound, + contig)) { + ret = heap_alloc(heap, type, size, flags, align, bound, contig); + + /* this should have succeeded */ + if (ret == NULL) + RTE_LOG(ERR, EAL, "Error allocating from heap\n"); + } +alloc_unlock: + rte_spinlock_unlock(&(heap->lock)); + return ret; +} + +void * +malloc_heap_alloc(const char *type, size_t size, int socket_arg, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + int socket, i, cur_socket; + void *ret; + + /* return NULL if size is 0 or alignment is not power-of-2 */ + if (size == 0 || (align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + if (socket_arg == SOCKET_ID_ANY) + socket = malloc_get_numa_socket(); + else + socket = socket_arg; + + /* Check socket parameter */ + if (socket >= RTE_MAX_NUMA_NODES) + return NULL; + + ret = heap_alloc_on_socket(type, size, socket, flags, align, bound, + contig); + if (ret != NULL || socket_arg != SOCKET_ID_ANY) + return ret; + + /* try other heaps */ + for (i = 0; i < (int) rte_socket_count(); i++) { + cur_socket = rte_socket_id_by_idx(i); + if (cur_socket == socket) + continue; + ret = heap_alloc_on_socket(type, size, cur_socket, flags, + align, bound, contig); + if (ret != NULL) + return ret; + } + return NULL; +} + +/* this function is exposed in malloc_mp.h */ +int +malloc_heap_free_pages(void *aligned_start, size_t aligned_len) +{ + int n_segs, seg_idx, max_seg_idx; + struct rte_memseg_list *msl; + size_t page_sz; + + msl = rte_mem_virt2memseg_list(aligned_start); + if (msl == NULL) + return -1; + + page_sz = (size_t)msl->page_sz; + n_segs = aligned_len / page_sz; + seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz; + max_seg_idx = seg_idx + n_segs; + + for (; seg_idx < max_seg_idx; seg_idx++) { + struct rte_memseg *ms; + + ms = rte_fbarray_get(&msl->memseg_arr, seg_idx); + eal_memalloc_free_seg(ms); + } + return 0; +} + +int +malloc_heap_free(struct malloc_elem *elem) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap; + void *start, *aligned_start, *end, *aligned_end; + size_t len, aligned_len, page_sz; + struct rte_memseg_list *msl; + unsigned int i, n_segs, before_space, after_space; + int ret; + + if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) + return -1; + + /* elem may be merged with previous element, so keep heap address */ + heap = elem->heap; + msl = elem->msl; + page_sz = (size_t)msl->page_sz; + + rte_spinlock_lock(&(heap->lock)); + + /* mark element as free */ + elem->state = ELEM_FREE; + + elem = malloc_elem_free(elem); + + /* anything after this is a bonus */ + ret = 0; + + /* ...of which we can't avail if we are in legacy mode */ + if (internal_config.legacy_mem) + goto free_unlock; + + /* check if we can free any memory back to the system */ + if (elem->size < page_sz) + goto free_unlock; + + /* probably, but let's make sure, as we may not be using up full page */ + start = elem; + len = elem->size; + aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz); + end = RTE_PTR_ADD(elem, len); + aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz); + + aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); + + /* can't free anything */ + if (aligned_len < page_sz) + goto free_unlock; + + /* we can free something. 
however, some of these pages may be marked as + * unfreeable, so also check that as well + */ + n_segs = aligned_len / page_sz; + for (i = 0; i < n_segs; i++) { + const struct rte_memseg *tmp = + rte_mem_virt2memseg(aligned_start, msl); + + if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + /* this is an unfreeable segment, so move start */ + aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len); + } + } + + /* recalculate length and number of segments */ + aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); + n_segs = aligned_len / page_sz; + + /* check if we can still free some pages */ + if (n_segs == 0) + goto free_unlock; + + /* We're not done yet. We also have to check if by freeing space we will + * be leaving free elements that are too small to store new elements. + * Check if we have enough space in the beginning and at the end, or if + * start/end are exactly page aligned. + */ + before_space = RTE_PTR_DIFF(aligned_start, elem); + after_space = RTE_PTR_DIFF(end, aligned_end); + if (before_space != 0 && + before_space < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* There is not enough space before start, but we may be able to + * move the start forward by one page. + */ + if (n_segs == 1) + goto free_unlock; + + /* move start */ + aligned_start = RTE_PTR_ADD(aligned_start, page_sz); + aligned_len -= page_sz; + n_segs--; + } + if (after_space != 0 && after_space < + MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* There is not enough space after end, but we may be able to + * move the end backwards by one page. + */ + if (n_segs == 1) + goto free_unlock; + + /* move end */ + aligned_end = RTE_PTR_SUB(aligned_end, page_sz); + aligned_len -= page_sz; + n_segs--; + } + + /* now we can finally free us some pages */ + + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); + + /* + * we allow secondary processes to clear the heap of this allocated + * memory because it is safe to do so, as even if notifications about + * unmapped pages don't make it to other processes, heap is shared + * across all processes, and will become empty of this memory anyway, + * and nothing can allocate it back unless primary process will be able + * to deliver allocation message to every single running process. + */ + + malloc_elem_free_list_remove(elem); + + malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len); + + heap->total_size -= aligned_len; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* notify user about changes in memory map */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + aligned_start, aligned_len); + + /* don't care if any of this fails */ + malloc_heap_free_pages(aligned_start, aligned_len); + + request_sync(); + } else { + struct malloc_mp_req req; + + memset(&req, 0, sizeof(req)); + + req.t = REQ_TYPE_FREE; + req.free_req.addr = aligned_start; + req.free_req.len = aligned_len; + + /* + * we request primary to deallocate pages, but we don't do it + * in this thread. instead, we notify primary that we would like + * to deallocate pages, and this process will receive another + * request (in parallel) that will do it for us on another + * thread. + * + * we also don't really care if this succeeds - the data is + * already removed from the heap, so it is, for all intents and + * purposes, hidden from the rest of DPDK even if some other + * process (including this one) may have these pages mapped. + * + * notifications about deallocated memory happen during sync. 
+ */ + request_to_primary(&req); + } + + RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n", + msl->socket_id, aligned_len >> 20ULL); + + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); +free_unlock: + rte_spinlock_unlock(&(heap->lock)); + return ret; +} + +int +malloc_heap_resize(struct malloc_elem *elem, size_t size) +{ + int ret; + + if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) + return -1; + + rte_spinlock_lock(&(elem->heap->lock)); + + ret = malloc_elem_resize(elem, size); + + rte_spinlock_unlock(&(elem->heap->lock)); + + return ret; +} + /* * Function to retrieve data for heap on given socket */ @@ -183,21 +825,49 @@ malloc_heap_get_stats(struct malloc_heap *heap, return 0; } +/* + * Function to retrieve data for heap on given socket + */ +void +malloc_heap_dump(struct malloc_heap *heap, FILE *f) +{ + struct malloc_elem *elem; + + rte_spinlock_lock(&heap->lock); + + fprintf(f, "Heap size: 0x%zx\n", heap->total_size); + fprintf(f, "Heap alloc count: %u\n", heap->alloc_count); + + elem = heap->first; + while (elem) { + malloc_elem_dump(elem, f); + elem = elem->next; + } + + rte_spinlock_unlock(&heap->lock); +} + int rte_eal_malloc_heap_init(void) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - unsigned ms_cnt; - struct rte_memseg *ms; - if (mcfg == NULL) + if (register_mp_requests()) { + RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n"); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); return -1; - - for (ms = &mcfg->memseg[0], ms_cnt = 0; - (ms_cnt < RTE_MAX_MEMSEG) && (ms->len > 0); - ms_cnt++, ms++) { - malloc_heap_add_memseg(&mcfg->malloc_heaps[ms->socket_id], ms); } - return 0; + /* unlock mem hotplug here. it's safe for primary as no requests can + * even come before primary itself is fully initialized, and secondaries + * do not need to initialize the heap. 
+ */ + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + + /* secondary process does not need to initialize anything */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + /* add all IOVA-contiguous areas to the heap */ + return rte_memseg_contig_walk(malloc_add_seg, NULL); } diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h index e0defa70..03b80141 100644 --- a/lib/librte_eal/common/malloc_heap.h +++ b/lib/librte_eal/common/malloc_heap.h @@ -5,6 +5,8 @@ #ifndef MALLOC_HEAP_H_ #define MALLOC_HEAP_H_ +#include + #include #include @@ -24,13 +26,22 @@ malloc_get_numa_socket(void) } void * -malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size, - unsigned flags, size_t align, size_t bound); +malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags, + size_t align, size_t bound, bool contig); + +int +malloc_heap_free(struct malloc_elem *elem); + +int +malloc_heap_resize(struct malloc_elem *elem, size_t size); int malloc_heap_get_stats(struct malloc_heap *heap, struct rte_malloc_socket_stats *socket_stats); +void +malloc_heap_dump(struct malloc_heap *heap, FILE *f); + int rte_eal_malloc_heap_init(void); diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c new file mode 100644 index 00000000..931c14bc --- /dev/null +++ b/lib/librte_eal/common/malloc_mp.c @@ -0,0 +1,743 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include + +#include +#include +#include + +#include "eal_memalloc.h" + +#include "malloc_elem.h" +#include "malloc_mp.h" + +#define MP_ACTION_SYNC "mp_malloc_sync" +/**< request sent by primary process to notify of changes in memory map */ +#define MP_ACTION_ROLLBACK "mp_malloc_rollback" +/**< request sent by primary process to notify of changes in memory map. this is + * essentially a regular sync request, but we cannot send sync requests while + * another one is in progress, and we might have to - therefore, we do this as + * a separate callback. + */ +#define MP_ACTION_REQUEST "mp_malloc_request" +/**< request sent by secondary process to ask for allocation/deallocation */ +#define MP_ACTION_RESPONSE "mp_malloc_response" +/**< response sent to secondary process to indicate result of request */ + +/* forward declarations */ +static int +handle_sync_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); +static int +handle_rollback_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +/* when we're allocating, we need to store some state to ensure that we can + * roll back later + */ +struct primary_alloc_req_state { + struct malloc_heap *heap; + struct rte_memseg **ms; + int ms_len; + struct malloc_elem *elem; + void *map_addr; + size_t map_len; +}; + +enum req_state { + REQ_STATE_INACTIVE = 0, + REQ_STATE_ACTIVE, + REQ_STATE_COMPLETE +}; + +struct mp_request { + TAILQ_ENTRY(mp_request) next; + struct malloc_mp_req user_req; /**< contents of request */ + pthread_cond_t cond; /**< variable we use to time out on this request */ + enum req_state state; /**< indicate status of this request */ + struct primary_alloc_req_state alloc_state; +}; + +/* + * We could've used just a single request, but it may be possible for + * secondaries to timeout earlier than the primary, and send a new request while + * primary is still expecting replies to the old one. 
Therefore, each new + * request will get assigned a new ID, which is how we will distinguish between + * expected and unexpected messages. + */ +TAILQ_HEAD(mp_request_list, mp_request); +static struct { + struct mp_request_list list; + pthread_mutex_t lock; +} mp_request_list = { + .list = TAILQ_HEAD_INITIALIZER(mp_request_list.list), + .lock = PTHREAD_MUTEX_INITIALIZER +}; + +/** + * General workflow is the following: + * + * Allocation: + * S: send request to primary + * P: attempt to allocate memory + * if failed, sendmsg failure + * if success, send sync request + * S: if received msg of failure, quit + * if received sync request, synchronize memory map and reply with result + * P: if received sync request result + * if success, sendmsg success + * if failure, roll back allocation and send a rollback request + * S: if received msg of success, quit + * if received rollback request, synchronize memory map and reply with result + * P: if received sync request result + * sendmsg sync request result + * S: if received msg, quit + * + * Aside from timeouts, there are three points where we can quit: + * - if allocation failed straight away + * - if allocation and sync request succeeded + * - if allocation succeeded, sync request failed, allocation rolled back and + * rollback request received (irrespective of whether it succeeded or failed) + * + * Deallocation: + * S: send request to primary + * P: attempt to deallocate memory + * if failed, sendmsg failure + * if success, send sync request + * S: if received msg of failure, quit + * if received sync request, synchronize memory map and reply with result + * P: if received sync request result + * sendmsg sync request result + * S: if received msg, quit + * + * There is no "rollback" from deallocation, as it's safe to have some memory + * mapped in some processes - it's absent from the heap, so it won't get used. + */ + +static struct mp_request * +find_request_by_id(uint64_t id) +{ + struct mp_request *req; + TAILQ_FOREACH(req, &mp_request_list.list, next) { + if (req->user_req.id == id) + break; + } + return req; +} + +/* this ID is, like, totally guaranteed to be absolutely unique. pinky swear. */ +static uint64_t +get_unique_id(void) +{ + uint64_t id; + do { + id = rte_rand(); + } while (find_request_by_id(id) != NULL); + return id; +} + +/* secondary will respond to sync requests thusly */ +static int +handle_sync(const struct rte_mp_msg *msg, const void *peer) +{ + struct rte_mp_msg reply; + const struct malloc_mp_req *req = + (const struct malloc_mp_req *)msg->param; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply.param; + int ret; + + if (req->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected request from primary\n"); + return -1; + } + + memset(&reply, 0, sizeof(reply)); + + reply.num_fds = 0; + strlcpy(reply.name, msg->name, sizeof(reply.name)); + reply.len_param = sizeof(*resp); + + ret = eal_memalloc_sync_with_primary(); + + resp->t = REQ_TYPE_SYNC; + resp->id = req->id; + resp->result = ret == 0 ? 
REQ_RESULT_SUCCESS : REQ_RESULT_FAIL;
+
+ rte_mp_reply(&reply, peer);
+
+ return 0;
+}
+
+static int
+handle_alloc_request(const struct malloc_mp_req *m,
+ struct mp_request *req)
+{
+ const struct malloc_req_alloc *ar = &m->alloc_req;
+ struct malloc_heap *heap;
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ size_t alloc_sz;
+ int n_segs;
+ void *map_addr;
+
+ alloc_sz = RTE_ALIGN_CEIL(ar->align + ar->elt_size +
+ MALLOC_ELEM_TRAILER_LEN, ar->page_sz);
+ n_segs = alloc_sz / ar->page_sz;
+
+ heap = ar->heap;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_segs);
+
+ if (ms == NULL) {
+ RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n");
+ goto fail;
+ }
+
+ memset(ms, 0, sizeof(*ms) * n_segs);
+
+ elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket,
+ ar->flags, ar->align, ar->bound, ar->contig, ms,
+ n_segs);
+
+ if (elem == NULL)
+ goto fail;
+
+ map_addr = ms[0]->addr;
+
+ /* we have succeeded in allocating memory, but we still need to sync
+ * with other processes. however, since DPDK IPC is single-threaded, we
+ * send an asynchronous request and exit this callback.
+ */
+
+ req->alloc_state.ms = ms;
+ req->alloc_state.ms_len = n_segs;
+ req->alloc_state.map_addr = map_addr;
+ req->alloc_state.map_len = alloc_sz;
+ req->alloc_state.elem = elem;
+ req->alloc_state.heap = heap;
+
+ return 0;
+fail:
+ free(ms);
+ return -1;
+}
+
+/* first stage of primary handling requests from secondary */
+static int
+handle_request(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+ int ret;
+
+ /* lock access to request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ /* make sure it's not a dupe */
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ RTE_LOG(ERR, EAL, "Duplicate request id\n");
+ goto fail;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Unable to allocate memory for request\n");
+ goto fail;
+ }
+
+ /* erase all data */
+ memset(entry, 0, sizeof(*entry));
+
+ if (m->t == REQ_TYPE_ALLOC) {
+ ret = handle_alloc_request(m, entry);
+ } else if (m->t == REQ_TYPE_FREE) {
+ ret = malloc_heap_free_pages(m->free_req.addr,
+ m->free_req.len);
+ } else {
+ RTE_LOG(ERR, EAL, "Unexpected request from secondary\n");
+ goto fail;
+ }
+
+ if (ret != 0) {
+ struct rte_mp_msg resp_msg;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)resp_msg.param;
+
+ /* send failure message straight away */
+ resp_msg.num_fds = 0;
+ resp_msg.len_param = sizeof(*resp);
+ strlcpy(resp_msg.name, MP_ACTION_RESPONSE,
+ sizeof(resp_msg.name));
+
+ resp->t = m->t;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = m->id;
+
+ if (rte_mp_sendmsg(&resp_msg)) {
+ RTE_LOG(ERR, EAL, "Couldn't send response\n");
+ goto fail;
+ }
+ /* we did not modify the request */
+ free(entry);
+ } else {
+ struct rte_mp_msg sr_msg;
+ struct malloc_mp_req *sr =
+ (struct malloc_mp_req *)sr_msg.param;
+ struct timespec ts;
+
+ memset(&sr_msg, 0, sizeof(sr_msg));
+
+ /* we can do something, so send sync request asynchronously */
+ sr_msg.num_fds = 0;
+ sr_msg.len_param = sizeof(*sr);
+ strlcpy(sr_msg.name, MP_ACTION_SYNC, sizeof(sr_msg.name));
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ sr->t = REQ_TYPE_SYNC;
+ sr->id = m->id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&sr_msg, &ts,
+ 
handle_sync_response); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't send sync request\n"); + if (m->t == REQ_TYPE_ALLOC) + free(entry->alloc_state.ms); + goto fail; + } + + /* mark request as in progress */ + memcpy(&entry->user_req, m, sizeof(*m)); + entry->state = REQ_STATE_ACTIVE; + + TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next); + } + pthread_mutex_unlock(&mp_request_list.lock); + return 0; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + free(entry); + return -1; +} + +/* callback for asynchronous sync requests for primary. this will either do a + * sendmsg with results, or trigger rollback request. + */ +static int +handle_sync_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply) +{ + enum malloc_req_result result; + struct mp_request *entry; + const struct malloc_mp_req *mpreq = + (const struct malloc_mp_req *)request->param; + int i; + + /* lock the request */ + pthread_mutex_lock(&mp_request_list.lock); + + entry = find_request_by_id(mpreq->id); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + goto fail; + } + + result = REQ_RESULT_SUCCESS; + + if (reply->nb_received != reply->nb_sent) + result = REQ_RESULT_FAIL; + + for (i = 0; i < reply->nb_received; i++) { + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply->msgs[i].param; + + if (resp->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected response to sync request\n"); + result = REQ_RESULT_FAIL; + break; + } + if (resp->id != entry->user_req.id) { + RTE_LOG(ERR, EAL, "Response to wrong sync request\n"); + result = REQ_RESULT_FAIL; + break; + } + if (resp->result == REQ_RESULT_FAIL) { + result = REQ_RESULT_FAIL; + break; + } + } + + if (entry->user_req.t == REQ_TYPE_FREE) { + struct rte_mp_msg msg; + struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param; + + memset(&msg, 0, sizeof(msg)); + + /* this is a free request, just sendmsg result */ + resp->t = REQ_TYPE_FREE; + resp->result = result; + resp->id = entry->user_req.id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry); + } else if (entry->user_req.t == REQ_TYPE_ALLOC && + result == REQ_RESULT_SUCCESS) { + struct malloc_heap *heap = entry->alloc_state.heap; + struct rte_mp_msg msg; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)msg.param; + + memset(&msg, 0, sizeof(msg)); + + heap->total_size += entry->alloc_state.map_len; + + /* result is success, so just notify secondary about this */ + resp->t = REQ_TYPE_ALLOC; + resp->result = result; + resp->id = entry->user_req.id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry->alloc_state.ms); + free(entry); + } else if (entry->user_req.t == REQ_TYPE_ALLOC && + result == REQ_RESULT_FAIL) { + struct rte_mp_msg rb_msg; + struct malloc_mp_req *rb = + (struct malloc_mp_req *)rb_msg.param; + struct timespec ts; + struct primary_alloc_req_state *state = + &entry->alloc_state; + int ret; + + memset(&rb_msg, 0, sizeof(rb_msg)); + + /* we've failed to sync, so do a rollback */ + rollback_expand_heap(state->ms, state->ms_len, state->elem, + 
state->map_addr, state->map_len);
+
+ /* send rollback request */
+ rb_msg.num_fds = 0;
+ rb_msg.len_param = sizeof(*rb);
+ strlcpy(rb_msg.name, MP_ACTION_ROLLBACK, sizeof(rb_msg.name));
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ rb->t = REQ_TYPE_SYNC;
+ rb->id = entry->user_req.id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&rb_msg, &ts,
+ handle_rollback_response);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send rollback request to secondary process\n");
+
+ /* we couldn't send rollback request, but that's OK -
+ * secondary will time out, and memory has been removed
+ * from heap anyway.
+ */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(state->ms);
+ free(entry);
+ goto fail;
+ }
+ } else {
+ RTE_LOG(ERR, EAL, "Unexpected response to sync request of unknown type\n");
+ goto fail;
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply __rte_unused)
+{
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ struct mp_request *entry;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ memset(&msg, 0, sizeof(msg));
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ if (entry->user_req.t != REQ_TYPE_ALLOC) {
+ RTE_LOG(ERR, EAL, "Unexpected active request\n");
+ goto fail;
+ }
+
+ /* we don't care if rollback succeeded, request still failed */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = mpreq->id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name));
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ /* clean up */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+/* final stage of the request from secondary */
+static int
+handle_response(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ /* update request status */
+ entry->user_req.result = m->result;
+
+ entry->state = REQ_STATE_COMPLETE;
+
+ /* trigger thread wakeup */
+ pthread_cond_signal(&entry->cond);
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+
+ return 0;
+}
+
+/* synchronously request memory map sync, this is only called whenever primary
+ * process initiates the allocation.
+ */ +int +request_sync(void) +{ + struct rte_mp_msg msg; + struct rte_mp_reply reply; + struct malloc_mp_req *req = (struct malloc_mp_req *)msg.param; + struct timespec ts; + int i, ret; + + memset(&msg, 0, sizeof(msg)); + memset(&reply, 0, sizeof(reply)); + + /* no need to create tailq entries as this is entirely synchronous */ + + msg.num_fds = 0; + msg.len_param = sizeof(*req); + strlcpy(msg.name, MP_ACTION_SYNC, sizeof(msg.name)); + + /* sync request carries no data */ + req->t = REQ_TYPE_SYNC; + req->id = get_unique_id(); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_sync(&msg, &reply, &ts); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Could not send sync request to secondary process\n"); + ret = -1; + goto out; + } + + if (reply.nb_received != reply.nb_sent) { + RTE_LOG(ERR, EAL, "Not all secondaries have responded\n"); + ret = -1; + goto out; + } + + for (i = 0; i < reply.nb_received; i++) { + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply.msgs[i].param; + if (resp->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected response from secondary\n"); + ret = -1; + goto out; + } + if (resp->id != req->id) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + ret = -1; + goto out; + } + if (resp->result != REQ_RESULT_SUCCESS) { + RTE_LOG(ERR, EAL, "Secondary process failed to synchronize\n"); + ret = -1; + goto out; + } + } + + ret = 0; +out: + free(reply.msgs); + return ret; +} + +/* this is a synchronous wrapper around a bunch of asynchronous requests to + * primary process. this will initiate a request and wait until responses come. + */ +int +request_to_primary(struct malloc_mp_req *user_req) +{ + struct rte_mp_msg msg; + struct malloc_mp_req *msg_req = (struct malloc_mp_req *)msg.param; + struct mp_request *entry; + struct timespec ts; + struct timeval now; + int ret; + + memset(&msg, 0, sizeof(msg)); + memset(&ts, 0, sizeof(ts)); + + pthread_mutex_lock(&mp_request_list.lock); + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for request\n"); + goto fail; + } + + memset(entry, 0, sizeof(*entry)); + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + goto fail; + } + + ts.tv_nsec = (now.tv_usec * 1000) % 1000000000; + ts.tv_sec = now.tv_sec + MP_TIMEOUT_S + + (now.tv_usec * 1000) / 1000000000; + + /* initialize the request */ + pthread_cond_init(&entry->cond, NULL); + + msg.num_fds = 0; + msg.len_param = sizeof(*msg_req); + strlcpy(msg.name, MP_ACTION_REQUEST, sizeof(msg.name)); + + /* (attempt to) get a unique id */ + user_req->id = get_unique_id(); + + /* copy contents of user request into the message */ + memcpy(msg_req, user_req, sizeof(*msg_req)); + + if (rte_mp_sendmsg(&msg)) { + RTE_LOG(ERR, EAL, "Cannot send message to primary\n"); + goto fail; + } + + /* copy contents of user request into active request */ + memcpy(&entry->user_req, user_req, sizeof(*user_req)); + + /* mark request as in progress */ + entry->state = REQ_STATE_ACTIVE; + + TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next); + + /* finally, wait on timeout */ + do { + ret = pthread_cond_timedwait(&entry->cond, + &mp_request_list.lock, &ts); + } while (ret != 0 && ret != ETIMEDOUT); + + if (entry->state != REQ_STATE_COMPLETE) { + RTE_LOG(ERR, EAL, "Request timed out\n"); + ret = -1; + } else { + ret = 0; + user_req->result = entry->user_req.result; + } + TAILQ_REMOVE(&mp_request_list.list, 
entry, next); + free(entry); + + pthread_mutex_unlock(&mp_request_list.lock); + return ret; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + free(entry); + return -1; +} + +int +register_mp_requests(void) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + if (rte_mp_action_register(MP_ACTION_REQUEST, handle_request)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_REQUEST); + return -1; + } + } else { + if (rte_mp_action_register(MP_ACTION_SYNC, handle_sync)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_SYNC); + return -1; + } + if (rte_mp_action_register(MP_ACTION_ROLLBACK, handle_sync)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_SYNC); + return -1; + } + if (rte_mp_action_register(MP_ACTION_RESPONSE, + handle_response)) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + MP_ACTION_RESPONSE); + return -1; + } + } + return 0; +} diff --git a/lib/librte_eal/common/malloc_mp.h b/lib/librte_eal/common/malloc_mp.h new file mode 100644 index 00000000..2b86b76f --- /dev/null +++ b/lib/librte_eal/common/malloc_mp.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef MALLOC_MP_H +#define MALLOC_MP_H + +#include +#include + +#include +#include +#include +#include + +/* forward declarations */ +struct malloc_heap; +struct rte_memseg; + +/* multiprocess synchronization structures for malloc */ +enum malloc_req_type { + REQ_TYPE_ALLOC, /**< ask primary to allocate */ + REQ_TYPE_FREE, /**< ask primary to free */ + REQ_TYPE_SYNC /**< ask secondary to synchronize its memory map */ +}; + +enum malloc_req_result { + REQ_RESULT_SUCCESS, + REQ_RESULT_FAIL +}; + +struct malloc_req_alloc { + struct malloc_heap *heap; + uint64_t page_sz; + size_t elt_size; + int socket; + unsigned int flags; + size_t align; + size_t bound; + bool contig; +}; + +struct malloc_req_free { + RTE_STD_C11 + union { + void *addr; + uint64_t addr_64; + }; + uint64_t len; +}; + +struct malloc_mp_req { + enum malloc_req_type t; + RTE_STD_C11 + union { + struct malloc_req_alloc alloc_req; + struct malloc_req_free free_req; + }; + uint64_t id; /**< not to be populated by caller */ + enum malloc_req_result result; +}; + +int +register_mp_requests(void); + +int +request_to_primary(struct malloc_mp_req *req); + +/* synchronous memory map sync request */ +int +request_sync(void); + +/* functions from malloc_heap exposed here */ +int +malloc_heap_free_pages(void *aligned_start, size_t aligned_len); + +struct malloc_elem * +alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig, struct rte_memseg **ms, int n_segs); + +void +rollback_expand_heap(struct rte_memseg **ms, int n_segs, + struct malloc_elem *elem, void *map_addr, size_t map_len); + +#endif /* MALLOC_MP_H */ diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build index 82b8910f..8a3dcfee 100644 --- a/lib/librte_eal/common/meson.build +++ b/lib/librte_eal/common/meson.build @@ -11,10 +11,12 @@ common_sources = files( 'eal_common_devargs.c', 'eal_common_dev.c', 'eal_common_errno.c', + 'eal_common_fbarray.c', 'eal_common_hexdump.c', 'eal_common_launch.c', 'eal_common_lcore.c', 'eal_common_log.c', + 'eal_common_memalloc.c', 'eal_common_memory.c', 'eal_common_memzone.c', 'eal_common_options.c', @@ -25,6 +27,7 @@ common_sources = files( 'eal_common_timer.c', 'malloc_elem.c', 'malloc_heap.c', + 'malloc_mp.c', 
'rte_keepalive.c', 'rte_malloc.c', 'rte_reciprocal.c', @@ -51,6 +54,7 @@ common_headers = files( 'include/rte_eal_memconfig.h', 'include/rte_eal_interrupts.h', 'include/rte_errno.h', + 'include/rte_fbarray.h', 'include/rte_hexdump.h', 'include/rte_interrupts.h', 'include/rte_keepalive.h', diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c index e0e0d0b3..b51a6d11 100644 --- a/lib/librte_eal/common/rte_malloc.c +++ b/lib/librte_eal/common/rte_malloc.c @@ -29,20 +29,17 @@ void rte_free(void *addr) { if (addr == NULL) return; - if (malloc_elem_free(malloc_elem_from_data(addr)) < 0) - rte_panic("Fatal error: Invalid memory\n"); + if (malloc_heap_free(malloc_elem_from_data(addr)) < 0) + RTE_LOG(ERR, EAL, "Error: Invalid memory\n"); } /* * Allocate memory on specified heap. */ void * -rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg) +rte_malloc_socket(const char *type, size_t size, unsigned int align, + int socket_arg) { - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int socket, i; - void *ret; - /* return NULL if size is 0 or alignment is not power-of-2 */ if (size == 0 || (align && !rte_is_power_of_2(align))) return NULL; @@ -50,33 +47,12 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg) if (!rte_eal_has_hugepages()) socket_arg = SOCKET_ID_ANY; - if (socket_arg == SOCKET_ID_ANY) - socket = malloc_get_numa_socket(); - else - socket = socket_arg; - /* Check socket parameter */ - if (socket >= RTE_MAX_NUMA_NODES) + if (socket_arg >= RTE_MAX_NUMA_NODES) return NULL; - ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type, - size, 0, align == 0 ? 1 : align, 0); - if (ret != NULL || socket_arg != SOCKET_ID_ANY) - return ret; - - /* try other heaps */ - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { - /* we already tried this one */ - if (i == socket) - continue; - - ret = malloc_heap_alloc(&mcfg->malloc_heaps[i], type, - size, 0, align == 0 ? 1 : align, 0); - if (ret != NULL) - return ret; - } - - return NULL; + return malloc_heap_alloc(type, size, socket_arg, 0, + align == 0 ? 1 : align, 0, false); } /* @@ -134,13 +110,15 @@ rte_realloc(void *ptr, size_t size, unsigned align) return rte_malloc(NULL, size, align); struct malloc_elem *elem = malloc_elem_from_data(ptr); - if (elem == NULL) - rte_panic("Fatal error: memory corruption detected\n"); + if (elem == NULL) { + RTE_LOG(ERR, EAL, "Error: memory corruption detected\n"); + return NULL; + } size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align); /* check alignment matches first, and if ok, see if we can resize block */ if (RTE_PTR_ALIGN(ptr,align) == ptr && - malloc_elem_resize(elem, size) == 0) + malloc_heap_resize(elem, size) == 0) return ptr; /* either alignment is off, or we have no room to expand, @@ -181,6 +159,23 @@ rte_malloc_get_socket_stats(int socket, return malloc_heap_get_stats(&mcfg->malloc_heaps[socket], socket_stats); } +/* + * Function to dump contents of all heaps + */ +void __rte_experimental +rte_malloc_dump_heaps(FILE *f) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + + for (idx = 0; idx < rte_socket_count(); idx++) { + unsigned int socket = rte_socket_id_by_idx(idx); + fprintf(f, "Heap on socket %i:\n", socket); + malloc_heap_dump(&mcfg->malloc_heaps[socket], f); + } + +} + /* * Print stats on memory type. 
If type is NULL, info on all types is printed */ @@ -222,17 +217,21 @@ rte_malloc_set_limit(__rte_unused const char *type, rte_iova_t rte_malloc_virt2iova(const void *addr) { - rte_iova_t iova; - const struct malloc_elem *elem = malloc_elem_from_data(addr); + const struct rte_memseg *ms; + struct malloc_elem *elem = malloc_elem_from_data(addr); + if (elem == NULL) return RTE_BAD_IOVA; - if (elem->ms->iova == RTE_BAD_IOVA) - return RTE_BAD_IOVA; if (rte_eal_iova_mode() == RTE_IOVA_VA) - iova = (uintptr_t)addr; - else - iova = elem->ms->iova + - RTE_PTR_DIFF(addr, elem->ms->addr); - return iova; + return (uintptr_t) addr; + + ms = rte_mem_virt2memseg(addr, elem->msl); + if (ms == NULL) + return RTE_BAD_IOVA; + + if (ms->iova == RTE_BAD_IOVA) + return RTE_BAD_IOVA; + + return ms->iova + RTE_PTR_DIFF(addr, ms->addr); } diff --git a/lib/librte_eal/common/rte_service.c b/lib/librte_eal/common/rte_service.c index be9b5e6d..73507aac 100644 --- a/lib/librte_eal/common/rte_service.c +++ b/lib/librte_eal/common/rte_service.c @@ -115,7 +115,7 @@ fail_mem: return -ENOMEM; } -void __rte_experimental +void rte_service_finalize(void) { if (!rte_service_library_initialized) @@ -161,7 +161,7 @@ service_mt_safe(struct rte_service_spec_impl *s) return !!(s->spec.capabilities & RTE_SERVICE_CAP_MT_SAFE); } -int32_t __rte_experimental +int32_t rte_service_set_stats_enable(uint32_t id, int32_t enabled) { struct rte_service_spec_impl *s; @@ -175,7 +175,7 @@ rte_service_set_stats_enable(uint32_t id, int32_t enabled) return 0; } -int32_t __rte_experimental +int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enabled) { struct rte_service_spec_impl *s; @@ -189,13 +189,13 @@ rte_service_set_runstate_mapped_check(uint32_t id, int32_t enabled) return 0; } -uint32_t __rte_experimental +uint32_t rte_service_get_count(void) { return rte_service_count; } -int32_t __rte_experimental +int32_t rte_service_get_by_name(const char *name, uint32_t *service_id) { if (!service_id) @@ -213,7 +213,7 @@ rte_service_get_by_name(const char *name, uint32_t *service_id) return -ENODEV; } -const char * __rte_experimental +const char * rte_service_get_name(uint32_t id) { struct rte_service_spec_impl *s; @@ -221,7 +221,7 @@ rte_service_get_name(uint32_t id) return s->spec.name; } -int32_t __rte_experimental +int32_t rte_service_probe_capability(uint32_t id, uint32_t capability) { struct rte_service_spec_impl *s; @@ -229,7 +229,7 @@ rte_service_probe_capability(uint32_t id, uint32_t capability) return !!(s->spec.capabilities & capability); } -int32_t __rte_experimental +int32_t rte_service_component_register(const struct rte_service_spec *spec, uint32_t *id_ptr) { @@ -262,7 +262,7 @@ rte_service_component_register(const struct rte_service_spec *spec, return 0; } -int32_t __rte_experimental +int32_t rte_service_component_unregister(uint32_t id) { uint32_t i; @@ -283,7 +283,7 @@ rte_service_component_unregister(uint32_t id) return 0; } -int32_t __rte_experimental +int32_t rte_service_component_runstate_set(uint32_t id, uint32_t runstate) { struct rte_service_spec_impl *s; @@ -298,7 +298,7 @@ rte_service_component_runstate_set(uint32_t id, uint32_t runstate) return 0; } -int32_t __rte_experimental +int32_t rte_service_runstate_set(uint32_t id, uint32_t runstate) { struct rte_service_spec_impl *s; @@ -313,7 +313,7 @@ rte_service_runstate_set(uint32_t id, uint32_t runstate) return 0; } -int32_t __rte_experimental +int32_t rte_service_runstate_get(uint32_t id) { struct rte_service_spec_impl *s; @@ -374,7 +374,7 @@ service_run(uint32_t 
i, struct core_state *cs, uint64_t service_mask) return 0; } -int32_t __rte_experimental rte_service_run_iter_on_app_lcore(uint32_t id, +int32_t rte_service_run_iter_on_app_lcore(uint32_t id, uint32_t serialize_mt_unsafe) { /* run service on calling core, using all-ones as the service mask */ @@ -430,7 +430,7 @@ rte_service_runner_func(void *arg) return 0; } -int32_t __rte_experimental +int32_t rte_service_lcore_count(void) { int32_t count = 0; @@ -440,7 +440,7 @@ rte_service_lcore_count(void) return count; } -int32_t __rte_experimental +int32_t rte_service_lcore_list(uint32_t array[], uint32_t n) { uint32_t count = rte_service_lcore_count(); @@ -463,7 +463,7 @@ rte_service_lcore_list(uint32_t array[], uint32_t n) return count; } -int32_t __rte_experimental +int32_t rte_service_lcore_count_services(uint32_t lcore) { if (lcore >= RTE_MAX_LCORE) @@ -476,7 +476,7 @@ rte_service_lcore_count_services(uint32_t lcore) return __builtin_popcountll(cs->service_mask); } -int32_t __rte_experimental +int32_t rte_service_start_with_defaults(void) { /* create a default mapping from cores to services, then start the @@ -562,7 +562,7 @@ service_update(struct rte_service_spec *service, uint32_t lcore, return 0; } -int32_t __rte_experimental +int32_t rte_service_map_lcore_set(uint32_t id, uint32_t lcore, uint32_t enabled) { struct rte_service_spec_impl *s; @@ -571,7 +571,7 @@ rte_service_map_lcore_set(uint32_t id, uint32_t lcore, uint32_t enabled) return service_update(&s->spec, lcore, &on, 0); } -int32_t __rte_experimental +int32_t rte_service_map_lcore_get(uint32_t id, uint32_t lcore) { struct rte_service_spec_impl *s; @@ -597,7 +597,7 @@ set_lcore_state(uint32_t lcore, int32_t state) lcore_states[lcore].is_service_core = (state == ROLE_SERVICE); } -int32_t __rte_experimental +int32_t rte_service_lcore_reset_all(void) { /* loop over cores, reset all to mask 0 */ @@ -617,7 +617,7 @@ rte_service_lcore_reset_all(void) return 0; } -int32_t __rte_experimental +int32_t rte_service_lcore_add(uint32_t lcore) { if (lcore >= RTE_MAX_LCORE) @@ -636,7 +636,7 @@ rte_service_lcore_add(uint32_t lcore) return rte_eal_wait_lcore(lcore); } -int32_t __rte_experimental +int32_t rte_service_lcore_del(uint32_t lcore) { if (lcore >= RTE_MAX_LCORE) @@ -655,7 +655,7 @@ rte_service_lcore_del(uint32_t lcore) return 0; } -int32_t __rte_experimental +int32_t rte_service_lcore_start(uint32_t lcore) { if (lcore >= RTE_MAX_LCORE) @@ -678,7 +678,7 @@ rte_service_lcore_start(uint32_t lcore) return ret; } -int32_t __rte_experimental +int32_t rte_service_lcore_stop(uint32_t lcore) { if (lcore >= RTE_MAX_LCORE) @@ -708,7 +708,7 @@ rte_service_lcore_stop(uint32_t lcore) return 0; } -int32_t __rte_experimental +int32_t rte_service_attr_get(uint32_t id, uint32_t attr_id, uint32_t *attr_value) { struct rte_service_spec_impl *s; @@ -753,7 +753,7 @@ rte_service_dump_one(FILE *f, struct rte_service_spec_impl *s, s->cycles_spent, s->cycles_spent / calls); } -int32_t __rte_experimental +int32_t rte_service_attr_reset_all(uint32_t id) { struct rte_service_spec_impl *s; @@ -781,7 +781,8 @@ service_dump_calls_per_lcore(FILE *f, uint32_t lcore, uint32_t reset) fprintf(f, "\n"); } -int32_t __rte_experimental rte_service_dump(FILE *f, uint32_t id) +int32_t +rte_service_dump(FILE *f, uint32_t id) { uint32_t i; int print_one = (id != UINT32_MAX); -- cgit 1.2.3-korg