diff options
author | Christian Ehrhardt <christian.ehrhardt@canonical.com> | 2018-06-01 09:09:08 +0200 |
---|---|---|
committer | Christian Ehrhardt <christian.ehrhardt@canonical.com> | 2018-06-01 09:12:07 +0200 |
commit | 1bd9b61222f3a81ffe770fc00b70ded6e760c42b (patch) | |
tree | 0bf7d996cf0664796687c1be6d22958fcf6a8096 /lib/librte_eal/linuxapp/eal | |
parent | bb4e158029645f37809fcf81a3acddd6fa11f88a (diff) |
New upstream version 18.05
Change-Id: Icd4170ddc4f63aeae5d0559490e5195b5349f9c2
Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Diffstat (limited to 'lib/librte_eal/linuxapp/eal')
-rw-r--r-- | lib/librte_eal/linuxapp/eal/Makefile | 9 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal.c | 182 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_cpuflags.c | 84 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_dev.c | 224 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 210 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_interrupts.c | 28 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_memalloc.c | 1309 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_memory.c | 1142 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_thread.c | 2 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_timer.c | 12 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_vfio.c | 1630 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_vfio.h | 59 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 | ||||
-rw-r--r-- | lib/librte_eal/linuxapp/eal/meson.build | 3 |
14 files changed, 4164 insertions, 1140 deletions
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index 7e5bbe88..3719ec9d 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH) EXPORT_MAP := ../../rte_eal_version.map VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) -LIBABIVER := 6 +LIBABIVER := 7 VPATH += $(RTE_SDK)/lib/librte_eal/common @@ -30,17 +30,20 @@ endif # specific to linuxapp exec-env SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_cpuflags.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c # from common dir SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c @@ -48,6 +51,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c @@ -61,9 +65,11 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c 
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c @@ -81,6 +87,7 @@ CFLAGS_eal_interrupts.o := -D_GNU_SOURCE CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE CFLAGS_eal_timer.o := -D_GNU_SOURCE CFLAGS_eal_lcore.o := -D_GNU_SOURCE +CFLAGS_eal_memalloc.o := -D_GNU_SOURCE CFLAGS_eal_thread.o := -D_GNU_SOURCE CFLAGS_eal_log.o := -D_GNU_SOURCE CFLAGS_eal_common_log.o := -D_GNU_SOURCE diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c index 38306bf5..8655b869 100644 --- a/lib/librte_eal/linuxapp/eal/eal.c +++ b/lib/librte_eal/linuxapp/eal/eal.c @@ -74,8 +74,8 @@ static int mem_cfg_fd = -1; static struct flock wr_lock = { .l_type = F_WRLCK, .l_whence = SEEK_SET, - .l_start = offsetof(struct rte_mem_config, memseg), - .l_len = sizeof(early_mem_config.memseg), + .l_start = offsetof(struct rte_mem_config, memsegs), + .l_len = sizeof(early_mem_config.memsegs), }; /* Address of global and public configuration */ @@ -92,6 +92,68 @@ struct internal_config internal_config; /* used by rte_rdtsc() */ int rte_cycles_vmware_tsc_map; +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +static const char *default_runtime_dir = "/var/run"; + +int +eal_create_runtime_dir(void) +{ + const char *directory = default_runtime_dir; + const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); + const char *fallback = "/tmp"; + char tmp[PATH_MAX]; + int ret; + + if (getuid() != 0) { + /* try XDG path first, fall back to /tmp */ + if 
(xdg_runtime_dir != NULL) + directory = xdg_runtime_dir; + else + directory = fallback; + } + /* create DPDK subdirectory under runtime dir */ + ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); + if (ret < 0 || ret == sizeof(tmp)) { + RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); + return -1; + } + + /* create prefix-specific subdirectory under DPDK runtime dir */ + ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", + tmp, internal_config.hugefile_prefix); + if (ret < 0 || ret == sizeof(runtime_dir)) { + RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); + return -1; + } + + /* create the path if it doesn't exist. no "mkdir -p" here, so do it + * step by step. + */ + ret = mkdir(tmp, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + tmp, strerror(errno)); + return -1; + } + + ret = mkdir(runtime_dir, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + runtime_dir, strerror(errno)); + return -1; + } + + return 0; +} + +const char * +eal_get_runtime_dir(void) +{ + return runtime_dir; +} + /* Return user provided mbuf pool ops name */ const char * __rte_experimental rte_eal_mbuf_user_pool_ops(void) @@ -348,6 +410,8 @@ eal_usage(const char *prgname) " --"OPT_BASE_VIRTADDR" Base virtual address\n" " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" + " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n" + " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n" "\n"); /* Allow the application to print its usage message too if hook is set */ if ( rte_application_usage_hook ) { @@ -591,7 +655,8 @@ eal_parse_args(int argc, char **argv) break; case OPT_MBUF_POOL_OPS_NAME_NUM: - internal_config.user_mbuf_pool_ops_name = optarg; + internal_config.user_mbuf_pool_ops_name = + strdup(optarg); break; default: @@ 
-638,23 +703,23 @@ out: return ret; } +static int +check_socket(const struct rte_memseg_list *msl, void *arg) +{ + int *socket_id = arg; + + return *socket_id == msl->socket_id; +} + static void eal_check_mem_on_local_socket(void) { - const struct rte_memseg *ms; - int i, socket_id; + int socket_id; socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); - ms = rte_eal_get_physmem_layout(); - - for (i = 0; i < RTE_MAX_MEMSEG; i++) - if (ms[i].socket_id == socket_id && - ms[i].len > 0) - return; - - RTE_LOG(WARNING, EAL, "WARNING: Master core has no " - "memory on local socket!\n"); + if (rte_memseg_list_walk(check_socket, &socket_id) == 0) + RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); } static int @@ -669,6 +734,8 @@ rte_eal_mcfg_complete(void) /* ALL shared mem_config related INIT DONE */ if (rte_config.process_type == RTE_PROC_PRIMARY) rte_config.mem_config->magic = RTE_MAGIC; + + internal_config.init_complete = 1; } /* @@ -689,24 +756,8 @@ rte_eal_iopl_init(void) #ifdef VFIO_PRESENT static int rte_eal_vfio_setup(void) { - int vfio_enabled = 0; - if (rte_vfio_enable("vfio")) return -1; - vfio_enabled = rte_vfio_is_enabled("vfio"); - - if (vfio_enabled) { - - /* if we are primary process, create a thread to communicate with - * secondary processes. the thread will use a socket to wait for - * requests from secondary process to send open file descriptors, - * because VFIO does not allow multiple open descriptors on a group or - * VFIO container. 
- */ - if (internal_config.process_type == RTE_PROC_PRIMARY && - vfio_mp_sync_setup() < 0) - return -1; - } return 0; } @@ -766,6 +817,13 @@ rte_eal_init(int argc, char **argv) return -1; } + /* create runtime data directory */ + if (eal_create_runtime_dir() < 0) { + rte_eal_init_alert("Cannot create runtime directory\n"); + rte_errno = EACCES; + return -1; + } + if (eal_plugins_init() < 0) { rte_eal_init_alert("Cannot init plugins\n"); rte_errno = EINVAL; @@ -779,6 +837,19 @@ rte_eal_init(int argc, char **argv) return -1; } + rte_config_init(); + + /* Put mp channel init before bus scan so that we can init the vdev + * bus through mp channel in the secondary process before the bus scan. + */ + if (rte_mp_channel_init() < 0) { + rte_eal_init_alert("failed to init mp channel\n"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_errno = EFAULT; + return -1; + } + } + if (rte_bus_scan()) { rte_eal_init_alert("Cannot scan the buses for devices\n"); rte_errno = ENODEV; @@ -798,13 +869,17 @@ rte_eal_init(int argc, char **argv) "KNI module inserted\n"); } - if (internal_config.no_hugetlbfs == 0 && - internal_config.process_type != RTE_PROC_SECONDARY && - eal_hugepage_info_init() < 0) { - rte_eal_init_alert("Cannot get hugepage information."); - rte_errno = EACCES; - rte_atomic32_clear(&run_once); - return -1; + if (internal_config.no_hugetlbfs == 0) { + /* rte_config isn't initialized yet */ + ret = internal_config.process_type == RTE_PROC_PRIMARY ? 
+ eal_hugepage_info_init() : + eal_hugepage_info_read(); + if (ret < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + return -1; + } } if (internal_config.memory == 0 && internal_config.force_sockets == 0) { @@ -825,8 +900,6 @@ rte_eal_init(int argc, char **argv) rte_srand(rte_rdtsc()); - rte_config_init(); - if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { rte_eal_init_alert("Cannot init logging."); rte_errno = ENOMEM; @@ -834,14 +907,6 @@ rte_eal_init(int argc, char **argv) return -1; } - if (rte_mp_channel_init() < 0) { - rte_eal_init_alert("failed to init mp channel\n"); - if (rte_eal_process_type() == RTE_PROC_PRIMARY) { - rte_errno = EFAULT; - return -1; - } - } - #ifdef VFIO_PRESENT if (rte_eal_vfio_setup() < 0) { rte_eal_init_alert("Cannot init VFIO\n"); @@ -850,6 +915,15 @@ rte_eal_init(int argc, char **argv) return -1; } #endif + /* in secondary processes, memory init may allocate additional fbarrays + * not present in primary processes, so to avoid any potential issues, + * initialize memzones first. 
+ */ + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone\n"); + rte_errno = ENODEV; + return -1; + } if (rte_eal_memory_init() < 0) { rte_eal_init_alert("Cannot init memory\n"); @@ -860,8 +934,8 @@ rte_eal_init(int argc, char **argv) /* the directories are locked during eal_hugepage_info_init */ eal_hugedirs_unlock(); - if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone\n"); + if (rte_eal_malloc_heap_init() < 0) { + rte_eal_init_alert("Cannot init malloc heap\n"); rte_errno = ENODEV; return -1; } @@ -888,7 +962,7 @@ rte_eal_init(int argc, char **argv) eal_thread_init_master(rte_config.master_lcore); - ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN); + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n", rte_config.master_lcore, (int)thread_id, cpuset, @@ -919,7 +993,7 @@ rte_eal_init(int argc, char **argv) rte_panic("Cannot create thread\n"); /* Set thread_name for aid in debugging. */ - snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, + snprintf(thread_name, sizeof(thread_name), "lcore-slave-%d", i); ret = rte_thread_setname(lcore_config[i].thread_id, thread_name); @@ -950,6 +1024,12 @@ rte_eal_init(int argc, char **argv) return -1; } +#ifdef VFIO_PRESENT + /* Register mp action after probe() so that we got enough info */ + if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0) + return -1; +#endif + /* initialize default service/lcore mappings and start running. Ignore * -ENOTSUP, as it indicates no service coremask passed to EAL. */ diff --git a/lib/librte_eal/linuxapp/eal/eal_cpuflags.c b/lib/librte_eal/linuxapp/eal/eal_cpuflags.c new file mode 100644 index 00000000..d38296e1 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_cpuflags.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Red Hat, Inc. 
+ */ + +#include <elf.h> +#include <fcntl.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 16) +#include <sys/auxv.h> +#define HAS_AUXV 1 +#endif +#endif + +#include <rte_cpuflags.h> + +#ifndef HAS_AUXV +static unsigned long +getauxval(unsigned long type __rte_unused) +{ + errno = ENOTSUP; + return 0; +} +#endif + +#ifdef RTE_ARCH_64 +typedef Elf64_auxv_t Internal_Elfx_auxv_t; +#else +typedef Elf32_auxv_t Internal_Elfx_auxv_t; +#endif + +/** + * Provides a method for retrieving values from the auxiliary vector and + * possibly running a string comparison. + * + * @return Always returns a result. When the result is 0, check errno + * to see if an error occurred during processing. + */ +static unsigned long +_rte_cpu_getauxval(unsigned long type, const char *str) +{ + unsigned long val; + + errno = 0; + val = getauxval(type); + + if (!val && (errno == ENOTSUP || errno == ENOENT)) { + int auxv_fd = open("/proc/self/auxv", O_RDONLY); + Internal_Elfx_auxv_t auxv; + + if (auxv_fd == -1) + return 0; + + errno = ENOENT; + while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { + if (auxv.a_type == type) { + errno = 0; + val = auxv.a_un.a_val; + if (str) + val = strcmp((const char *)val, str); + break; + } + } + close(auxv_fd); + } + + return val; +} + +unsigned long +rte_cpu_getauxval(unsigned long type) +{ + return _rte_cpu_getauxval(type, NULL); +} + +int +rte_cpu_strcmp_auxval(unsigned long type, const char *str) +{ + return _rte_cpu_getauxval(type, str); +} diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c new file mode 100644 index 00000000..1cf6aebf --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_dev.c @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <linux/netlink.h> 
+ +#include <rte_string_fns.h> +#include <rte_log.h> +#include <rte_compat.h> +#include <rte_dev.h> +#include <rte_malloc.h> +#include <rte_interrupts.h> +#include <rte_alarm.h> + +#include "eal_private.h" + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static bool monitor_started; + +#define EAL_UEV_MSG_LEN 4096 +#define EAL_UEV_MSG_ELEM_LEN 128 + +static void dev_uev_handler(__rte_unused void *param); + +/* identify the system layer which reports this event. */ +enum eal_dev_event_subsystem { + EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */ + EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */ + EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */ + EAL_DEV_EVENT_SUBSYSTEM_MAX +}; + +static int +dev_uev_socket_fd_create(void) +{ + struct sockaddr_nl addr; + int ret; + + intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC | + SOCK_NONBLOCK, + NETLINK_KOBJECT_UEVENT); + if (intr_handle.fd < 0) { + RTE_LOG(ERR, EAL, "create uevent fd failed.\n"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + addr.nl_pid = 0; + addr.nl_groups = 0xffffffff; + + ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr)); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n"); + goto err; + } + + return 0; +err: + close(intr_handle.fd); + intr_handle.fd = -1; + return ret; +} + +static int +dev_uev_parse(const char *buf, struct rte_dev_event *event, int length) +{ + char action[EAL_UEV_MSG_ELEM_LEN]; + char subsystem[EAL_UEV_MSG_ELEM_LEN]; + char pci_slot_name[EAL_UEV_MSG_ELEM_LEN]; + int i = 0; + + memset(action, 0, EAL_UEV_MSG_ELEM_LEN); + memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN); + memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN); + + while (i < length) { + for (; i < length; i++) { + if (*buf) + break; + buf++; + } + /** + * check device uevent from kernel side, no need to check + * uevent from udev. 
+ */ + if (!strncmp(buf, "libudev", 7)) { + buf += 7; + i += 7; + return -1; + } + if (!strncmp(buf, "ACTION=", 7)) { + buf += 7; + i += 7; + strlcpy(action, buf, sizeof(action)); + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { + buf += 10; + i += 10; + strlcpy(subsystem, buf, sizeof(subsystem)); + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { + buf += 14; + i += 14; + strlcpy(pci_slot_name, buf, sizeof(subsystem)); + event->devname = strdup(pci_slot_name); + } + for (; i < length; i++) { + if (*buf == '\0') + break; + buf++; + } + } + + /* parse the subsystem layer */ + if (!strncmp(subsystem, "uio", 3)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO; + else if (!strncmp(subsystem, "pci", 3)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI; + else if (!strncmp(subsystem, "vfio", 4)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO; + else + return -1; + + /* parse the action type */ + if (!strncmp(action, "add", 3)) + event->type = RTE_DEV_EVENT_ADD; + else if (!strncmp(action, "remove", 6)) + event->type = RTE_DEV_EVENT_REMOVE; + else + return -1; + return 0; +} + +static void +dev_delayed_unregister(void *param) +{ + rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param); + close(intr_handle.fd); + intr_handle.fd = -1; +} + +static void +dev_uev_handler(__rte_unused void *param) +{ + struct rte_dev_event uevent; + int ret; + char buf[EAL_UEV_MSG_LEN]; + + memset(&uevent, 0, sizeof(struct rte_dev_event)); + memset(buf, 0, EAL_UEV_MSG_LEN); + + ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT); + if (ret < 0 && errno == EAGAIN) + return; + else if (ret <= 0) { + /* connection is closed or broken, can not up again. 
*/ + RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n"); + rte_eal_alarm_set(1, dev_delayed_unregister, NULL); + return; + } + + ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN); + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "It is not an valid event " + "that need to be handle.\n"); + return; + } + + RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", + uevent.devname, uevent.type, uevent.subsystem); + + if (uevent.devname) + dev_callback_process(uevent.devname, uevent.type); +} + +int __rte_experimental +rte_dev_event_monitor_start(void) +{ + int ret; + + if (monitor_started) + return 0; + + ret = dev_uev_socket_fd_create(); + if (ret) { + RTE_LOG(ERR, EAL, "error create device event fd.\n"); + return -1; + } + + intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT; + ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL); + + if (ret) { + RTE_LOG(ERR, EAL, "fail to register uevent callback.\n"); + return -1; + } + + monitor_started = true; + + return 0; +} + +int __rte_experimental +rte_dev_event_monitor_stop(void) +{ + int ret; + + if (!monitor_started) + return 0; + + ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler, + (void *)-1); + if (ret < 0) { + RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n"); + return ret; + } + + close(intr_handle.fd); + intr_handle.fd = -1; + monitor_started = false; + return 0; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c index 8bbf771a..7eca711b 100644 --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -14,7 +14,9 @@ #include <stdarg.h> #include <unistd.h> #include <errno.h> +#include <sys/mman.h> #include <sys/queue.h> +#include <sys/stat.h> #include <rte_memory.h> #include <rte_eal.h> @@ -30,6 +32,40 @@ #include "eal_filesystem.h" static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; +static const char sys_pages_numa_dir_path[] = 
"/sys/devices/system/node"; + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +map_shared_memory(const char *filename, const size_t mem_size, int flags) +{ + void *retval; + int fd = open(filename, flags, 0666); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +static void * +open_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR); +} + +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); +} /* this function is only called from eal_hugepage_info_init which itself * is only called from a primary process */ @@ -70,6 +106,45 @@ get_num_hugepages(const char *subdir) return num_pages; } +static uint32_t +get_num_hugepages_on_node(const char *subdir, unsigned int socket) +{ + char path[PATH_MAX], socketpath[PATH_MAX]; + DIR *socketdir; + unsigned long num_pages = 0; + const char *nr_hp_file = "free_hugepages"; + + snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages", + sys_pages_numa_dir_path, socket); + + socketdir = opendir(socketpath); + if (socketdir) { + /* Keep calm and carry on */ + closedir(socketdir); + } else { + /* Can't find socket dir, so ignore it */ + return 0; + } + + snprintf(path, sizeof(path), "%s/%s/%s", + socketpath, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* + * we want to return a uint32_t and more than this looks suspicious + * anyway ... 
+ */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + static uint64_t get_default_hp_size(void) { @@ -94,8 +169,8 @@ get_default_hp_size(void) return size; } -static const char * -get_hugepage_dir(uint64_t hugepage_sz) +static int +get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) { enum proc_mount_fieldnames { DEVICE = 0, @@ -113,7 +188,7 @@ get_hugepage_dir(uint64_t hugepage_sz) const char split_tok = ' '; char *splitstr[_FIELDNAME_MAX]; char buf[BUFSIZ]; - char *retval = NULL; + int retval = -1; FILE *fd = fopen(proc_mounts, "r"); if (fd == NULL) @@ -140,7 +215,8 @@ get_hugepage_dir(uint64_t hugepage_sz) /* if no explicit page size, the default page size is compared */ if (pagesz_str == NULL){ if (hugepage_sz == default_size){ - retval = strdup(splitstr[MOUNTPT]); + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; break; } } @@ -148,7 +224,8 @@ get_hugepage_dir(uint64_t hugepage_sz) else { uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); if (pagesz == hugepage_sz) { - retval = strdup(splitstr[MOUNTPT]); + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; break; } } @@ -207,11 +284,9 @@ clear_hugedir(const char * hugedir) /* non-blocking lock */ lck_result = flock(fd, LOCK_EX | LOCK_NB); - /* if lock succeeds, unlock and remove the file */ - if (lck_result != -1) { - flock(fd, LOCK_UN); + /* if lock succeeds, remove the file */ + if (lck_result != -1) unlinkat(dir_fd, dirent->d_name, 0); - } close (fd); dirent = readdir(dir); } @@ -238,17 +313,11 @@ compare_hpi(const void *a, const void *b) return hpi_b->hugepage_sz - hpi_a->hugepage_sz; } -/* - * when we initialize the hugepage info, everything goes - * to socket 0 by default. it will later get sorted by memory - * initialization procedure. 
- */ -int -eal_hugepage_info_init(void) -{ - const char dirent_start_text[] = "hugepages-"; +static int +hugepage_info_init(void) +{ const char dirent_start_text[] = "hugepages-"; const size_t dirent_start_len = sizeof(dirent_start_text) - 1; - unsigned i, num_sizes = 0; + unsigned int i, total_pages, num_sizes = 0; DIR *dir; struct dirent *dirent; @@ -273,10 +342,10 @@ eal_hugepage_info_init(void) hpi = &internal_config.hugepage_info[num_sizes]; hpi->hugepage_sz = rte_str_to_size(&dirent->d_name[dirent_start_len]); - hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz); /* first, check if we have a mountpoint */ - if (hpi->hugedir == NULL) { + if (get_hugepage_dir(hpi->hugepage_sz, + hpi->hugedir, sizeof(hpi->hugedir)) < 0) { uint32_t num_pages; num_pages = get_num_hugepages(dirent->d_name); @@ -302,9 +371,28 @@ eal_hugepage_info_init(void) if (clear_hugedir(hpi->hugedir) == -1) break; - /* for now, put all pages into socket 0, - * later they will be sorted */ - hpi->num_pages[0] = get_num_hugepages(dirent->d_name); + /* + * first, try to put all hugepages into relevant sockets, but + * if first attempts fails, fall back to collecting all pages + * in one socket and sorting them later + */ + total_pages = 0; + /* we also don't want to do this for legacy init */ + if (!internal_config.legacy_mem) + for (i = 0; i < rte_socket_count(); i++) { + int socket = rte_socket_id_by_idx(i); + unsigned int num_pages = + get_num_hugepages_on_node( + dirent->d_name, socket); + hpi->num_pages[socket] = num_pages; + total_pages += num_pages; + } + /* + * we failed to sort memory from the get go, so fall + * back to old way + */ + if (total_pages == 0) + hpi->num_pages[0] = get_num_hugepages(dirent->d_name); #ifndef RTE_ARCH_64 /* for 32-bit systems, limit number of hugepages to @@ -328,11 +416,79 @@ eal_hugepage_info_init(void) sizeof(internal_config.hugepage_info[0]), compare_hpi); /* now we have all info, check we have at least one valid size */ - for (i = 0; i < num_sizes; i++) - 
if (internal_config.hugepage_info[i].hugedir != NULL && - internal_config.hugepage_info[i].num_pages[0] > 0) + for (i = 0; i < num_sizes; i++) { + /* pages may no longer all be on socket 0, so check all */ + unsigned int j, num_pages = 0; + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + num_pages += hpi->num_pages[j]; + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 && + num_pages > 0) return 0; + } /* no valid hugepage mounts available, return error */ return -1; } + +/* + * when we initialize the hugepage info, everything goes + * to socket 0 by default. it will later get sorted by memory + * initialization procedure. + */ +int +eal_hugepage_info_init(void) +{ + struct hugepage_info *hpi, *tmp_hpi; + unsigned int i; + + if (hugepage_info_init() < 0) + return -1; + + hpi = &internal_config.hugepage_info[0]; + + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + return -1; + } + + memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); + + /* we've copied file descriptors along with everything else, but they + * will be invalid in secondary process, so overwrite them + */ + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + struct hugepage_info *tmp = &tmp_hpi[i]; + tmp->lock_descriptor = -1; + } + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} + +int eal_hugepage_info_read(void) +{ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + + tmp_hpi = open_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); + return -1; + } + + memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); + 
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c index f86f22f7..056d41c1 100644 --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -559,6 +559,9 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle) return -1; break; #endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; /* unknown handle type */ default: RTE_LOG(ERR, EAL, @@ -606,6 +609,9 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle) return -1; break; #endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; /* unknown handle type */ default: RTE_LOG(ERR, EAL, @@ -674,7 +680,10 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds) bytes_read = 0; call = true; break; - + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; default: bytes_read = 1; break; @@ -844,8 +853,7 @@ eal_intr_thread_main(__rte_unused void *arg) int rte_eal_intr_init(void) { - int ret = 0, ret_1 = 0; - char thread_name[RTE_MAX_THREAD_NAME_LEN]; + int ret = 0; /* init the global interrupt source head */ TAILQ_INIT(&intr_sources); @@ -860,23 +868,15 @@ rte_eal_intr_init(void) } /* create the host thread to wait/handle the interrupt */ - ret = pthread_create(&intr_thread, NULL, + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, eal_intr_thread_main, NULL); if (ret != 0) { - rte_errno = ret; + rte_errno = -ret; RTE_LOG(ERR, EAL, "Failed to create thread for interrupt handling\n"); - } else { - /* Set thread_name for aid in debugging. 
*/ - snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, - "eal-intr-thread"); - ret_1 = rte_thread_setname(intr_thread, thread_name); - if (ret_1 != 0) - RTE_LOG(DEBUG, EAL, - "Failed to set thread name for interrupt handling\n"); } - return -ret; + return ret; } static void diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c new file mode 100644 index 00000000..8c11f98c --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c @@ -0,0 +1,1309 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#define _FILE_OFFSET_BITS 64 +#include <errno.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/queue.h> +#include <sys/file.h> +#include <unistd.h> +#include <limits.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <signal.h> +#include <setjmp.h> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include <numa.h> +#include <numaif.h> +#endif +#include <linux/falloc.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_eal_memconfig.h> +#include <rte_eal.h> +#include <rte_memory.h> +#include <rte_spinlock.h> + +#include "eal_filesystem.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" + +/* + * not all kernel version support fallocate on hugetlbfs, so fall back to + * ftruncate and disallow deallocation if fallocate is not supported. + */ +static int fallocate_supported = -1; /* unknown */ + +/* for single-file segments, we need some kind of mechanism to keep track of + * which hugepages can be freed back to the system, and which cannot. we cannot + * use flock() because they don't allow locking parts of a file, and we cannot + * use fcntl() due to issues with their semantics, so we will have to rely on a + * bunch of lockfiles for each page. 
+ *
+ * we cannot know how many pages a system will have in advance, but we do know
+ * that they come in lists, and we know lengths of these lists. so, simply store
+ * a malloc'd array of fd's indexed by list and segment index.
+ *
+ * they will be initialized at startup, and filled as we allocate/deallocate
+ * segments. also, use this to track memseg list proper fd.
+ */
+static struct {
+	int *fds; /**< dynamically allocated array of segment lock fd's */
+	int memseg_list_fd; /**< memseg list fd */
+	int len; /**< total length of the array */
+	int count; /**< entries used in an array */
+} lock_fds[RTE_MAX_MEMSEG_LISTS];
+
+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
+/** jump target saved by huge_wrap_sigsetjmp() and used by the SIGBUS handler */
+static sigjmp_buf huge_jmpenv;
+
+/* SIGBUS handler: jump back to the context stored in huge_jmpenv, making the
+ * matching sigsetjmp() return 1.
+ */
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+	return sigsetjmp(huge_jmpenv, 1);
+}
+
+/** SIGBUS disposition that was in place before huge_register_sigbus() */
+static struct sigaction huge_action_old;
+/** non-zero when huge_action_old holds a disposition that must be restored */
+static int huge_need_recover;
+
+/* Install huge_sigbus_handler() for SIGBUS and remember the previous action in
+ * huge_action_old. huge_need_recover is set only when sigaction() succeeded.
+ */
+static void __rte_unused
+huge_register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = huge_sigbus_handler;
+
+	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+/* Put back the SIGBUS action saved by huge_register_sigbus(), if one was
+ * saved, and clear the flag so that it is restored only once.
+ */
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+	if (huge_need_recover) {
+		sigaction(SIGBUS, &huge_action_old, NULL);
+		huge_need_recover = 0;
+	}
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+/* Return true when numa_available() reports 0, false (with a DEBUG log
+ * message) otherwise.
+ */
+static bool
+check_numa(void)
+{
+	bool ret = true;
+	/* Check if kernel supports NUMA. */
+	if (numa_available() != 0) {
+		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+		ret = false;
+	}
+	return ret;
+}
+
+/* Save the current memory policy into *oldpolicy / oldmask and then set the
+ * preferred node to socket_id.
+ *
+ * If get_mempolicy() fails, *oldpolicy is set to MPOL_DEFAULT so that
+ * restore_numa(), which reads *oldpolicy, always sees a defined value.
+ */
+static void
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
+{
+	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+	if (get_mempolicy(oldpolicy, oldmask->maskp,
+			oldmask->size + 1, 0, 0) < 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to get current mempolicy: %s. "
+			"Assuming MPOL_DEFAULT.\n", strerror(errno));
+		/* store through the pointer: assigning to the parameter itself
+		 * would only change the local copy and leave the caller's
+		 * variable unset.
+		 */
+		*oldpolicy = MPOL_DEFAULT;
+	}
+	RTE_LOG(DEBUG, EAL,
+		"Setting policy MPOL_PREFERRED for socket %d\n",
+		socket_id);
+	numa_set_preferred(socket_id);
+}
+
+/* Re-apply the policy saved by prepare_numa() and release oldmask.
+ *
+ * MPOL_DEFAULT is restored through numa_set_localalloc(); any other policy is
+ * restored with set_mempolicy(), falling back to numa_set_localalloc() if that
+ * call fails. oldmask is freed in every case.
+ */
+static void
+restore_numa(int *oldpolicy, struct bitmask *oldmask)
+{
+	RTE_LOG(DEBUG, EAL,
+		"Restoring previous memory policy: %d\n", *oldpolicy);
+	if (*oldpolicy == MPOL_DEFAULT) {
+		numa_set_localalloc();
+	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+			oldmask->size + 1) < 0) {
+		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+			strerror(errno));
+		numa_set_localalloc();
+	}
+	numa_free_cpumask(oldmask);
+}
+#endif
+
+/*
+ * uses fstat to report the size of a file on disk; a failing fstat() is
+ * reported as size 0.
+ */
+static off_t
+get_file_size(int fd)
+{
+	struct stat st;
+	if (fstat(fd, &st) < 0)
+		return 0;
+	return st.st_size;
+}
+
+/* we cannot use rte_memseg_list_walk() here because we will be holding a
+ * write lock whenever we enter every function in this file, however copying
+ * the same iteration code everywhere is not ideal as well. so, use a lockless
+ * copy of memseg list walk here.
+ */ +static int +memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->base_va == NULL) + continue; + + ret = func(msl, arg); + if (ret < 0) + return -1; + if (ret > 0) + return 1; + } + return 0; +} + +/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ +static int lock(int fd, int type) +{ + int ret; + + /* flock may be interrupted */ + do { + ret = flock(fd, type | LOCK_NB); + } while (ret && errno == EINTR); + + if (ret && errno == EWOULDBLOCK) { + /* couldn't lock */ + return 0; + } else if (ret) { + RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n", + __func__, strerror(errno)); + return -1; + } + /* lock was successful */ + return 1; +} + +static int get_segment_lock_fd(int list_idx, int seg_idx) +{ + char path[PATH_MAX] = {0}; + int fd; + + if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + return -1; + if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + return -1; + + fd = lock_fds[list_idx].fds[seg_idx]; + /* does this lock already exist? 
*/ + if (fd >= 0) + return fd; + + eal_get_hugefile_lock_path(path, sizeof(path), + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + + fd = open(path, O_CREAT | O_RDWR, 0660); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n", + __func__, path, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) != 1) { + RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n", + __func__, path, strerror(errno)); + close(fd); + return -1; + } + /* store it for future reference */ + lock_fds[list_idx].fds[seg_idx] = fd; + lock_fds[list_idx].count++; + return fd; +} + +static int unlock_segment(int list_idx, int seg_idx) +{ + int fd, ret; + + if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + return -1; + if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + return -1; + + fd = lock_fds[list_idx].fds[seg_idx]; + + /* upgrade lock to exclusive to see if we can remove the lockfile */ + ret = lock(fd, LOCK_EX); + if (ret == 1) { + /* we've succeeded in taking exclusive lock, this lockfile may + * be removed. + */ + char path[PATH_MAX] = {0}; + eal_get_hugefile_lock_path(path, sizeof(path), + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + if (unlink(path)) { + RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n", + __func__, path, strerror(errno)); + } + } + /* we don't want to leak the fd, so even if we fail to lock, close fd + * and remove it from list anyway. 
+ */ + close(fd); + lock_fds[list_idx].fds[seg_idx] = -1; + lock_fds[list_idx].count--; + + if (ret < 0) + return -1; + return 0; +} + +static int +get_seg_fd(char *path, int buflen, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + int fd; + + if (internal_config.single_file_segments) { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); + + fd = lock_fds[list_idx].memseg_list_fd; + + if (fd < 0) { + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock and keep it indefinitely */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + lock_fds[list_idx].memseg_list_fd = fd; + } + } else { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + } + return fd; +} + +static int +resize_hugefile(int fd, char *path, int list_idx, int seg_idx, + uint64_t fa_offset, uint64_t page_sz, bool grow) +{ + bool again = false; + do { + if (fallocate_supported == 0) { + /* we cannot deallocate memory if fallocate() is not + * supported, and hugepage file is already locked at + * creation, so no further synchronization needed. 
+ */ + + if (!grow) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n", + __func__); + return -1; + } + uint64_t new_size = fa_offset + page_sz; + uint64_t cur_size = get_file_size(fd); + + /* fallocate isn't supported, fall back to ftruncate */ + if (new_size > cur_size && + ftruncate(fd, new_size) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + return -1; + } + } else { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret, lock_fd; + + /* if fallocate() is supported, we need to take out a + * read lock on allocate (to prevent other processes + * from deallocating this page), and take out a write + * lock on deallocate (to ensure nobody else is using + * this page). + * + * read locks on page itself are already taken out at + * file creation, in get_seg_fd(). + * + * we cannot rely on simple use of flock() call, because + * we need to be able to lock a section of the file, + * and we cannot use fcntl() locks, because of numerous + * problems with their semantics, so we will use + * deterministically named lock files for each section + * of the file. + * + * if we're shrinking the file, we want to upgrade our + * lock from shared to exclusive. + * + * lock_fd is an fd for a lockfile, not for the segment + * list. + */ + lock_fd = get_segment_lock_fd(list_idx, seg_idx); + + if (!grow) { + /* we are using this lockfile to determine + * whether this particular page is locked, as we + * are in single file segments mode and thus + * cannot use regular flock() to get this info. + * + * we want to try and take out an exclusive lock + * on the lock file to determine if we're the + * last ones using this page, and if not, we + * won't be shrinking it, and will instead exit + * prematurely. 
+ */ + ret = lock(lock_fd, LOCK_EX); + + /* drop the lock on the lockfile, so that even + * if we couldn't shrink the file ourselves, we + * are signalling to other processes that we're + * no longer using this page. + */ + if (unlock_segment(list_idx, seg_idx)) + RTE_LOG(ERR, EAL, "Could not unlock segment\n"); + + /* additionally, if this was the last lock on + * this segment list, we can safely close the + * page file fd, so that one of the processes + * could then delete the file after shrinking. + */ + if (ret < 1 && lock_fds[list_idx].count == 0) { + close(fd); + lock_fds[list_idx].memseg_list_fd = -1; + } + + if (ret < 0) { + RTE_LOG(ERR, EAL, "Could not lock segment\n"); + return -1; + } + if (ret == 0) + /* failed to lock, not an error. */ + return 0; + } + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + if (fallocate_supported == -1 && + errno == ENOTSUP) { + RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n", + __func__); + again = true; + fallocate_supported = 0; + } else { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + } else { + fallocate_supported = 1; + + /* we've grew/shrunk the file, and we hold an + * exclusive lock now. check if there are no + * more segments active in this segment list, + * and remove the file if there aren't. 
+ */ + if (lock_fds[list_idx].count == 0) { + if (unlink(path)) + RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", + __func__, path, + strerror(errno)); + close(fd); + lock_fds[list_idx].memseg_list_fd = -1; + } + } + } + } while (again); + return 0; +} + +static int +alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, + struct hugepage_info *hi, unsigned int list_idx, + unsigned int seg_idx) +{ +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int cur_socket_id = 0; +#endif + uint64_t map_offset; + char path[PATH_MAX]; + int ret = 0; + int fd; + size_t alloc_sz; + + /* takes out a read lock on segment or segment list */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n"); + return -1; + } + + alloc_sz = hi->hugepage_sz; + if (internal_config.single_file_segments) { + map_offset = seg_idx * alloc_sz; + ret = resize_hugefile(fd, path, list_idx, seg_idx, map_offset, + alloc_sz, true); + if (ret < 0) + goto resized; + } else { + map_offset = 0; + if (ftruncate(fd, alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + } + + /* + * map the segment, and populate page tables, the kernel fills this + * segment with zeros if it's a new page. 
+ */ + void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset); + + if (va == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, + strerror(errno)); + goto resized; + } + if (va != addr) { + RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__); + munmap(va, alloc_sz); + goto resized; + } + + rte_iova_t iova = rte_mem_virt2iova(addr); + if (iova == RTE_BAD_PHYS_ADDR) { + RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n", + __func__); + goto mapped; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0); + + if (cur_socket_id != socket_id) { + RTE_LOG(DEBUG, EAL, + "%s(): allocation happened on wrong socket (wanted %d, got %d)\n", + __func__, socket_id, cur_socket_id); + goto mapped; + } +#endif + + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n", + (unsigned int)(alloc_sz >> 20)); + goto mapped; + } + /* for non-single file segments, we can close fd here */ + if (!internal_config.single_file_segments) + close(fd); + + /* we need to trigger a write to the page to enforce page fault and + * ensure that page is accessible to us, but we can't overwrite value + * that is already there, so read the old value, and write itback. + * kernel populates the page with zeroes initially. 
+ */ + *(volatile int *)addr = *(volatile int *)addr; + + ms->addr = addr; + ms->hugepage_sz = alloc_sz; + ms->len = alloc_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->iova = iova; + ms->socket_id = socket_id; + + return 0; + +mapped: + munmap(addr, alloc_sz); +resized: + if (internal_config.single_file_segments) { + resize_hugefile(fd, path, list_idx, seg_idx, map_offset, + alloc_sz, false); + /* ignore failure, can't make it any worse */ + } else { + /* only remove file if we can take out a write lock */ + if (lock(fd, LOCK_EX) == 1) + unlink(path); + close(fd); + } + return -1; +} + +static int +free_seg(struct rte_memseg *ms, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + uint64_t map_offset; + char path[PATH_MAX]; + int fd, ret; + + /* erase page data */ + memset(ms->addr, 0, ms->len); + + if (mmap(ms->addr, ms->len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == + MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "couldn't unmap page\n"); + return -1; + } + + /* if we are not in single file segments mode, we're going to unmap the + * segment and thus drop the lock on original fd, but hugepage dir is + * now locked so we can take out another one without races. + */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) + return -1; + + if (internal_config.single_file_segments) { + map_offset = seg_idx * ms->len; + if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset, + ms->len, false)) + return -1; + ret = 0; + } else { + /* if we're able to take out a write lock, we're the last one + * holding onto this page. + */ + ret = lock(fd, LOCK_EX); + if (ret >= 0) { + /* no one else is using this page */ + if (ret == 1) + unlink(path); + } + /* closing fd will drop the lock */ + close(fd); + } + + memset(ms, 0, sizeof(*ms)); + + return ret < 0 ? 
-1 : 0; +} + +struct alloc_walk_param { + struct hugepage_info *hi; + struct rte_memseg **ms; + size_t page_sz; + unsigned int segs_allocated; + unsigned int n_segs; + int socket; + bool exact; +}; +static int +alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct alloc_walk_param *wa = arg; + struct rte_memseg_list *cur_msl; + size_t page_sz; + int cur_idx, start_idx, j, dir_fd = -1; + unsigned int msl_idx, need, i; + + if (msl->page_sz != wa->page_sz) + return 0; + if (msl->socket_id != wa->socket) + return 0; + + page_sz = (size_t)msl->page_sz; + + msl_idx = msl - mcfg->memsegs; + cur_msl = &mcfg->memsegs[msl_idx]; + + need = wa->n_segs; + + /* try finding space in memseg list */ + cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need); + if (cur_idx < 0) + return 0; + start_idx = cur_idx; + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + * + * during init, we already hold a write lock, so don't try to take out + * another one. 
+ */ + if (wa->hi->lock_descriptor == -1) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + for (i = 0; i < need; i++, cur_idx++) { + struct rte_memseg *cur; + void *map_addr; + + cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx); + map_addr = RTE_PTR_ADD(cur_msl->base_va, + cur_idx * page_sz); + + if (alloc_seg(cur, map_addr, wa->socket, wa->hi, + msl_idx, cur_idx)) { + RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n", + need, i); + + /* if exact number wasn't requested, stop */ + if (!wa->exact) + goto out; + + /* clean up */ + for (j = start_idx; j < cur_idx; j++) { + struct rte_memseg *tmp; + struct rte_fbarray *arr = + &cur_msl->memseg_arr; + + tmp = rte_fbarray_get(arr, j); + rte_fbarray_set_free(arr, j); + + /* free_seg may attempt to create a file, which + * may fail. 
+ */ + if (free_seg(tmp, wa->hi, msl_idx, j)) + RTE_LOG(DEBUG, EAL, "Cannot free page\n"); + } + /* clear the list */ + if (wa->ms) + memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs); + + if (dir_fd >= 0) + close(dir_fd); + return -1; + } + if (wa->ms) + wa->ms[i] = cur; + + rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx); + } +out: + wa->segs_allocated = i; + if (i > 0) + cur_msl->version++; + if (dir_fd >= 0) + close(dir_fd); + return 1; +} + +struct free_walk_param { + struct hugepage_info *hi; + struct rte_memseg *ms; +}; +static int +free_seg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *found_msl; + struct free_walk_param *wa = arg; + uintptr_t start_addr, end_addr; + int msl_idx, seg_idx, ret, dir_fd = -1; + + start_addr = (uintptr_t) msl->base_va; + end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz; + + if ((uintptr_t)wa->ms->addr < start_addr || + (uintptr_t)wa->ms->addr >= end_addr) + return 0; + + msl_idx = msl - mcfg->memsegs; + seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz; + + /* msl is const */ + found_msl = &mcfg->memsegs[msl_idx]; + + /* do not allow any page allocations during the time we're freeing, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + * + * during init, we already hold a write lock, so don't try to take out + * another one. 
+ */ + if (wa->hi->lock_descriptor == -1) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + found_msl->version++; + + rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); + + ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx); + + if (dir_fd >= 0) + close(dir_fd); + + if (ret < 0) + return -1; + + return 1; +} + +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact) +{ + int i, ret = -1; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + bool have_numa = false; + int oldpolicy; + struct bitmask *oldmask; +#endif + struct alloc_walk_param wa; + struct hugepage_info *hi = NULL; + + memset(&wa, 0, sizeof(wa)); + + /* dynamic allocation not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) { + if (page_sz == + internal_config.hugepage_info[i].hugepage_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n", + __func__); + return -1; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (check_numa()) { + oldmask = numa_allocate_nodemask(); + prepare_numa(&oldpolicy, oldmask, socket); + have_numa = true; + } +#endif + + wa.exact = exact; + wa.hi = hi; + wa.ms = ms; + wa.n_segs = n_segs; + wa.page_sz = page_sz; + wa.socket = socket; + wa.segs_allocated = 0; + + ret = memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); + if (ret == 0) { + RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n", + __func__); + ret = -1; + } else if (ret > 0) { + ret = (int)wa.segs_allocated; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES 
+ if (have_numa) + restore_numa(&oldpolicy, oldmask); +#endif + return ret; +} + +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket) +{ + struct rte_memseg *ms; + if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0) + return NULL; + /* return pointer to newly allocated memseg */ + return ms; +} + +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) +{ + int seg, ret = 0; + + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (seg = 0; seg < n_segs; seg++) { + struct rte_memseg *cur = ms[seg]; + struct hugepage_info *hi = NULL; + struct free_walk_param wa; + int i, walk_res; + + /* if this page is marked as unfreeable, fail */ + if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n"); + ret = -1; + continue; + } + + memset(&wa, 0, sizeof(wa)); + + for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info); + i++) { + hi = &internal_config.hugepage_info[i]; + if (cur->hugepage_sz == hi->hugepage_sz) + break; + } + if (i == (int)RTE_DIM(internal_config.hugepage_info)) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + ret = -1; + continue; + } + + wa.ms = cur; + wa.hi = hi; + + walk_res = memseg_list_walk_thread_unsafe(free_seg_walk, &wa); + if (walk_res == 1) + continue; + if (walk_res == 0) + RTE_LOG(ERR, EAL, "Couldn't find memseg list\n"); + ret = -1; + } + return ret; +} + +int +eal_memalloc_free_seg(struct rte_memseg *ms) +{ + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + return eal_memalloc_free_seg_bulk(&ms, 1); +} + +static int +sync_chunk(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used, int start, int end) +{ + struct rte_fbarray *l_arr, *p_arr; + int i, ret, chunk_len, diff_len; + + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; 
+ + /* we need to aggregate allocations/deallocations into bigger chunks, + * as we don't want to spam the user with per-page callbacks. + * + * to avoid any potential issues, we also want to trigger + * deallocation callbacks *before* we actually deallocate + * memory, so that the user application could wrap up its use + * before it goes away. + */ + + chunk_len = end - start; + + /* find how many contiguous pages we can map/unmap for this chunk */ + diff_len = used ? + rte_fbarray_find_contig_free(l_arr, start) : + rte_fbarray_find_contig_used(l_arr, start); + + /* has to be at least one page */ + if (diff_len < 1) + return -1; + + diff_len = RTE_MIN(chunk_len, diff_len); + + /* if we are freeing memory, notify the application */ + if (!used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + start_va, len); + } + + for (i = 0; i < diff_len; i++) { + struct rte_memseg *p_ms, *l_ms; + int seg_idx = start + i; + + l_ms = rte_fbarray_get(l_arr, seg_idx); + p_ms = rte_fbarray_get(p_arr, seg_idx); + + if (l_ms == NULL || p_ms == NULL) + return -1; + + if (used) { + ret = alloc_seg(l_ms, p_ms->addr, + p_ms->socket_id, hi, + msl_idx, seg_idx); + if (ret < 0) + return -1; + rte_fbarray_set_used(l_arr, seg_idx); + } else { + ret = free_seg(l_ms, hi, msl_idx, seg_idx); + rte_fbarray_set_free(l_arr, seg_idx); + if (ret < 0) + return -1; + } + } + + /* if we just allocated memory, notify the application */ + if (used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + start_va, len); + } + + /* calculate how much we can advance until next chunk */ + diff_len = used ? 
+ rte_fbarray_find_contig_used(l_arr, start) : + rte_fbarray_find_contig_free(l_arr, start); + ret = RTE_MIN(chunk_len, diff_len); + + return ret; +} + +static int +sync_status(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used) +{ + struct rte_fbarray *l_arr, *p_arr; + int p_idx, l_chunk_len, p_chunk_len, ret; + int start, end; + + /* this is a little bit tricky, but the basic idea is - walk both lists + * and spot any places where there are discrepancies. walking both lists + * and noting discrepancies in a single go is a hard problem, so we do + * it in two passes - first we spot any places where allocated segments + * mismatch (i.e. ensure that everything that's allocated in the primary + * is also allocated in the secondary), and then we do it by looking at + * free segments instead. + * + * we also need to aggregate changes into chunks, as we have to call + * callbacks per allocation, not per page. + */ + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + if (used) + p_idx = rte_fbarray_find_next_used(p_arr, 0); + else + p_idx = rte_fbarray_find_next_free(p_arr, 0); + + while (p_idx >= 0) { + int next_chunk_search_idx; + + if (used) { + p_chunk_len = rte_fbarray_find_contig_used(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_used(l_arr, + p_idx); + } else { + p_chunk_len = rte_fbarray_find_contig_free(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_free(l_arr, + p_idx); + } + /* best case scenario - no differences (or bigger, which will be + * fixed during next iteration), look for next chunk + */ + if (l_chunk_len >= p_chunk_len) { + next_chunk_search_idx = p_idx + p_chunk_len; + goto next_chunk; + } + + /* if both chunks start at the same point, skip parts we know + * are identical, and sync the rest. 
each call to sync_chunk + * will only sync contiguous segments, so we need to call this + * until we are sure there are no more differences in this + * chunk. + */ + start = p_idx + l_chunk_len; + end = p_idx + p_chunk_len; + do { + ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, + used, start, end); + start += ret; + } while (start < end && ret >= 0); + /* if ret is negative, something went wrong */ + if (ret < 0) + return -1; + + next_chunk_search_idx = p_idx + p_chunk_len; +next_chunk: + /* skip to end of this chunk */ + if (used) { + p_idx = rte_fbarray_find_next_used(p_arr, + next_chunk_search_idx); + } else { + p_idx = rte_fbarray_find_next_free(p_arr, + next_chunk_search_idx); + } + } + return 0; +} + +static int +sync_existing(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx) +{ + int ret, dir_fd; + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. 
+ */ + dir_fd = open(hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + + /* ensure all allocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); + if (ret < 0) + goto fail; + + /* ensure all unallocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); + if (ret < 0) + goto fail; + + /* update version number */ + local_msl->version = primary_msl->version; + + close(dir_fd); + + return 0; +fail: + close(dir_fd); + return -1; +} + +static int +sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + struct hugepage_info *hi = NULL; + unsigned int i; + int msl_idx; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + uint64_t cur_sz = + internal_config.hugepage_info[i].hugepage_sz; + uint64_t msl_sz = primary_msl->page_sz; + if (msl_sz == cur_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + return -1; + } + + /* if versions don't match, synchronize everything */ + if (local_msl->version != primary_msl->version && + sync_existing(primary_msl, local_msl, hi, msl_idx)) + return -1; + return 0; +} + + +int +eal_memalloc_sync_with_primary(void) +{ + /* nothing to be done in primary */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return 0; + + if (memseg_list_walk_thread_unsafe(sync_walk, NULL)) + return -1; + 
return 0; +} + +static int +secondary_msl_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + char name[PATH_MAX]; + int msl_idx, ret; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + /* create distinct fbarrays for each secondary */ + snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i", + primary_msl->memseg_arr.name, getpid()); + + ret = rte_fbarray_init(&local_msl->memseg_arr, name, + primary_msl->memseg_arr.len, + primary_msl->memseg_arr.elt_sz); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n"); + return -1; + } + local_msl->base_va = primary_msl->base_va; + + return 0; +} + +static int +secondary_lock_list_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i, len; + int msl_idx; + int *data; + + msl_idx = msl - mcfg->memsegs; + len = msl->memseg_arr.len; + + /* ensure we have space to store lock fd per each possible segment */ + data = malloc(sizeof(int) * len); + if (data == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n"); + return -1; + } + /* set all fd's as invalid */ + for (i = 0; i < len; i++) + data[i] = -1; + + lock_fds[msl_idx].fds = data; + lock_fds[msl_idx].len = len; + lock_fds[msl_idx].count = 0; + lock_fds[msl_idx].memseg_list_fd = -1; + + return 0; +} + +int +eal_memalloc_init(void) +{ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) + return -1; + + /* initialize all of the lock fd lists */ + if (internal_config.single_file_segments) + if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL)) + return -1; + return 0; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c index 38853b75..c917de1c 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -28,6 +28,7 @@ #include <numaif.h> #endif +#include <rte_errno.h> #include <rte_log.h> #include <rte_memory.h> #include <rte_launch.h> @@ -39,6 +40,7 @@ #include <rte_string_fns.h> #include "eal_private.h" +#include "eal_memalloc.h" #include "eal_internal_cfg.h" #include "eal_filesystem.h" #include "eal_hugepages.h" @@ -57,8 +59,6 @@ * zone as well as a physical contiguous zone. */ -static uint64_t baseaddr_offset; - static bool phys_addrs_available = true; #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" @@ -221,82 +221,6 @@ aslr_enabled(void) } } -/* - * Try to mmap *size bytes in /dev/zero. If it is successful, return the - * pointer to the mmap'd area and keep *size unmodified. Else, retry - * with a smaller zone: decrease *size by hugepage_sz until it reaches - * 0. In this case, return NULL. Note: this function returns an address - * which is a multiple of hugepage size. - */ -static void * -get_virtual_area(size_t *size, size_t hugepage_sz) -{ - void *addr; - void *addr_hint; - int fd; - long aligned_addr; - - if (internal_config.base_virtaddr != 0) { - int page_size = sysconf(_SC_PAGE_SIZE); - addr_hint = (void *) (uintptr_t) - (internal_config.base_virtaddr + baseaddr_offset); - addr_hint = RTE_PTR_ALIGN_FLOOR(addr_hint, page_size); - } else { - addr_hint = NULL; - } - - RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); - - - fd = open("/dev/zero", O_RDONLY); - if (fd < 0){ - RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n"); - return NULL; - } - do { - addr = mmap(addr_hint, (*size) + hugepage_sz, PROT_READ, -#ifdef RTE_ARCH_PPC_64 - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -#else - MAP_PRIVATE, -#endif - fd, 0); - if (addr == MAP_FAILED) { - *size -= hugepage_sz; - } else if (addr_hint != NULL && addr != addr_hint) { - RTE_LOG(WARNING, EAL, "WARNING! 
Base virtual address " - "hint (%p != %p) not respected!\n", - addr_hint, addr); - RTE_LOG(WARNING, EAL, " This may cause issues with " - "mapping memory into secondary processes\n"); - } - } while (addr == MAP_FAILED && *size > 0); - - if (addr == MAP_FAILED) { - close(fd); - RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", - strerror(errno)); - return NULL; - } - - munmap(addr, (*size) + hugepage_sz); - close(fd); - - /* align addr to a huge page size boundary */ - aligned_addr = (long)addr; - aligned_addr += (hugepage_sz - 1); - aligned_addr &= (~(hugepage_sz - 1)); - addr = (void *)(aligned_addr); - - RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", - addr, *size); - - /* increment offset */ - baseaddr_offset += *size; - - return addr; -} - static sigjmp_buf huge_jmpenv; static void huge_sigbus_handler(int signo __rte_unused) @@ -330,13 +254,11 @@ void numa_error(char *where) */ static unsigned map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, - uint64_t *essential_memory __rte_unused, int orig) + uint64_t *essential_memory __rte_unused) { int fd; unsigned i; void *virtaddr; - void *vma_addr = NULL; - size_t vma_len = 0; #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES int node_id = -1; int essential_prev = 0; @@ -351,7 +273,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, have_numa = false; } - if (orig && have_numa) { + if (have_numa) { RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); if (get_mempolicy(&oldpolicy, oldmask->maskp, oldmask->size + 1, 0, 0) < 0) { @@ -367,6 +289,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, #endif for (i = 0; i < hpi->num_pages[0]; i++) { + struct hugepage_file *hf = &hugepg_tbl[i]; uint64_t hugepage_sz = hpi->hugepage_sz; #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES @@ -401,57 +324,14 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, } #endif - if (orig) { - hugepg_tbl[i].file_id = 
i; - hugepg_tbl[i].size = hugepage_sz; - eal_get_hugefile_path(hugepg_tbl[i].filepath, - sizeof(hugepg_tbl[i].filepath), hpi->hugedir, - hugepg_tbl[i].file_id); - hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0'; - } -#ifndef RTE_ARCH_64 - /* for 32-bit systems, don't remap 1G and 16G pages, just reuse - * original map address as final map address. - */ - else if ((hugepage_sz == RTE_PGSIZE_1G) - || (hugepage_sz == RTE_PGSIZE_16G)) { - hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va; - hugepg_tbl[i].orig_va = NULL; - continue; - } -#endif - else if (vma_len == 0) { - unsigned j, num_pages; - - /* reserve a virtual area for next contiguous - * physical block: count the number of - * contiguous physical pages. */ - for (j = i+1; j < hpi->num_pages[0] ; j++) { -#ifdef RTE_ARCH_PPC_64 - /* The physical addresses are sorted in - * descending order on PPC64 */ - if (hugepg_tbl[j].physaddr != - hugepg_tbl[j-1].physaddr - hugepage_sz) - break; -#else - if (hugepg_tbl[j].physaddr != - hugepg_tbl[j-1].physaddr + hugepage_sz) - break; -#endif - } - num_pages = j - i; - vma_len = num_pages * hugepage_sz; - - /* get the biggest virtual memory area up to - * vma_len. If it fails, vma_addr is NULL, so - * let the kernel provide the address. 
*/ - vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz); - if (vma_addr == NULL) - vma_len = hugepage_sz; - } + hf->file_id = i; + hf->size = hugepage_sz; + eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath), + hpi->hugedir, hf->file_id); + hf->filepath[sizeof(hf->filepath) - 1] = '\0'; /* try to create hugepage file */ - fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0600); + fd = open(hf->filepath, O_CREAT | O_RDWR, 0600); if (fd < 0) { RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, strerror(errno)); @@ -459,8 +339,11 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, } /* map the segment, and populate page tables, - * the kernel fills this segment with zeros */ - virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE, + * the kernel fills this segment with zeros. we don't care where + * this gets mapped - we already have contiguous memory areas + * ready for us to map into. + */ + virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); if (virtaddr == MAP_FAILED) { RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, @@ -469,41 +352,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, goto out; } - if (orig) { - hugepg_tbl[i].orig_va = virtaddr; - } - else { - hugepg_tbl[i].final_va = virtaddr; - } + hf->orig_va = virtaddr; - if (orig) { - /* In linux, hugetlb limitations, like cgroup, are - * enforced at fault time instead of mmap(), even - * with the option of MAP_POPULATE. Kernel will send - * a SIGBUS signal. To avoid to be killed, save stack - * environment here, if SIGBUS happens, we can jump - * back here. 
- */ - if (huge_wrap_sigsetjmp()) { - RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " - "hugepages of size %u MB\n", - (unsigned)(hugepage_sz / 0x100000)); - munmap(virtaddr, hugepage_sz); - close(fd); - unlink(hugepg_tbl[i].filepath); + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " + "hugepages of size %u MB\n", + (unsigned int)(hugepage_sz / 0x100000)); + munmap(virtaddr, hugepage_sz); + close(fd); + unlink(hugepg_tbl[i].filepath); #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (maxnode) - essential_memory[node_id] = - essential_prev; + if (maxnode) + essential_memory[node_id] = + essential_prev; #endif - goto out; - } - *(int *)virtaddr = 0; + goto out; } + *(int *)virtaddr = 0; - - /* set shared flock on the file. */ - if (flock(fd, LOCK_SH | LOCK_NB) == -1) { + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", __func__, strerror(errno)); close(fd); @@ -511,9 +386,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, } close(fd); - - vma_addr = (char *)vma_addr + hugepage_sz; - vma_len -= hugepage_sz; } out: @@ -535,20 +407,6 @@ out: return i; } -/* Unmap all hugepages from original mapping */ -static int -unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) -{ - unsigned i; - for (i = 0; i < hpi->num_pages[0]; i++) { - if (hugepg_tbl[i].orig_va) { - munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz); - hugepg_tbl[i].orig_va = NULL; - } - } - return 0; -} - /* * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge * page. 
@@ -688,7 +546,7 @@ copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, int src_pos, dst_pos = 0; for (src_pos = 0; src_pos < src_size; src_pos++) { - if (src[src_pos].final_va != NULL) { + if (src[src_pos].orig_va != NULL) { /* error on overflow attempt */ if (dst_pos == dest_size) return -1; @@ -759,9 +617,10 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, unmap_len = hp->size; /* get start addr and len of the remaining segment */ - munmap(hp->final_va, (size_t) unmap_len); + munmap(hp->orig_va, + (size_t)unmap_len); - hp->final_va = NULL; + hp->orig_va = NULL; if (unlink(hp->filepath) == -1) { RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", __func__, hp->filepath, strerror(errno)); @@ -780,6 +639,408 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, return 0; } +static int +remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int cur_page, seg_len; + unsigned int msl_idx; + int ms_idx; + uint64_t page_sz; + size_t memseg_len; + int socket_id; + + page_sz = hugepages[seg_start].size; + socket_id = hugepages[seg_start].socket_id; + seg_len = seg_end - seg_start; + + RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20ULL, socket_id); + + /* find free space in memseg lists */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + bool empty; + msl = &mcfg->memsegs[msl_idx]; + arr = &msl->memseg_arr; + + if (msl->page_sz != page_sz) + continue; + if (msl->socket_id != socket_id) + continue; + + /* leave space for a hole if array is not empty */ + empty = arr->count == 0; + ms_idx = rte_fbarray_find_next_n_free(arr, 0, + seg_len + (empty ? 0 : 1)); + + /* memseg list is full? 
*/ + if (ms_idx < 0) + continue; + + /* leave some space between memsegs, they are not IOVA + * contiguous, so they shouldn't be VA contiguous either. + */ + if (!empty) + ms_idx++; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), + RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); + return -1; + } + +#ifdef RTE_ARCH_PPC64 + /* for PPC64 we go through the list backwards */ + for (cur_page = seg_end - 1; cur_page >= seg_start; + cur_page--, ms_idx++) { +#else + for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) { +#endif + struct hugepage_file *hfile = &hugepages[cur_page]; + struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx); + void *addr; + int fd; + + fd = open(hfile->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open '%s': %s\n", + hfile->filepath, strerror(errno)); + return -1; + } + /* set shared lock on the file. 
*/ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + memseg_len = (size_t)page_sz; + addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len); + + /* we know this address is already mmapped by memseg list, so + * using MAP_FIXED here is safe + */ + addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + + /* we have a new address, so unmap previous one */ +#ifndef RTE_ARCH_64 + /* in 32-bit legacy mode, we have already unmapped the page */ + if (!internal_config.legacy_mem) + munmap(hfile->orig_va, page_sz); +#else + munmap(hfile->orig_va, page_sz); +#endif + + hfile->orig_va = NULL; + hfile->final_va = addr; + + /* rewrite physical addresses in IOVA as VA mode */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + hfile->physaddr = (uintptr_t)addr; + + /* set up memseg data */ + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->len = memseg_len; + ms->iova = hfile->physaddr; + ms->socket_id = hfile->socket_id; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + + rte_fbarray_set_used(arr, ms_idx); + + close(fd); + } + RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20, socket_id); + return 0; +} + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + 
msl->socket_id = socket_id; + msl->base_va = NULL; + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + + return 0; +} + +/* + * Our VA space is not preallocated yet, so preallocate it here. We need to know + * how many segments there are in order to map all pages into one address space, + * and leave appropriate holes between segments so that rte_malloc does not + * concatenate them into one big segment. + * + * we also need to unmap original pages to free up address space. + */ +static int __rte_unused +prealloc_segments(struct hugepage_file *hugepages, int n_pages) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int cur_page, seg_start_page, end_seg, new_memseg; + unsigned int hpi_idx, socket, i; + int n_contig_segs, n_segs; + int msl_idx; + + /* before we preallocate segments, we need to free up our VA space. + * we're not removing files, and we already have information about + * PA-contiguousness, so it is safe to unmap everything. 
+ */ + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *hpi = &hugepages[cur_page]; + munmap(hpi->orig_va, hpi->size); + hpi->orig_va = NULL; + } + + /* we cannot know how many page sizes and sockets we have discovered, so + * loop over all of them + */ + for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t page_sz = + internal_config.hugepage_info[hpi_idx].hugepage_sz; + + for (i = 0; i < rte_socket_count(); i++) { + struct rte_memseg_list *msl; + + socket = rte_socket_id_by_idx(i); + n_contig_segs = 0; + n_segs = 0; + seg_start_page = -1; + + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + int prev_seg_start_page = -1; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : + &hugepages[cur_page - 1]; + + new_memseg = 0; + end_seg = 0; + + if (cur->size == 0) + end_seg = 1; + else if (cur->socket_id != (int) socket) + end_seg = 1; + else if (cur->size != page_sz) + end_seg = 1; + else if (cur_page == 0) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start + * from higher address to lower address. Here, + * physical addresses are in descending order. 
+ */ + else if ((prev->physaddr - cur->physaddr) != + cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != + cur->size) + new_memseg = 1; +#endif + if (new_memseg) { + /* if we're already inside a segment, + * new segment means end of current one + */ + if (seg_start_page != -1) { + end_seg = 1; + prev_seg_start_page = + seg_start_page; + } + seg_start_page = cur_page; + } + + if (end_seg) { + if (prev_seg_start_page != -1) { + /* we've found a new segment */ + n_contig_segs++; + n_segs += cur_page - + prev_seg_start_page; + } else if (seg_start_page != -1) { + /* we didn't find new segment, + * but did end current one + */ + n_contig_segs++; + n_segs += cur_page - + seg_start_page; + seg_start_page = -1; + continue; + } else { + /* we're skipping this page */ + continue; + } + } + /* segment continues */ + } + /* check if we missed last segment */ + if (seg_start_page != -1) { + n_contig_segs++; + n_segs += cur_page - seg_start_page; + } + + /* if no segments were found, do not preallocate */ + if (n_segs == 0) + continue; + + /* we now have total number of pages that we will + * allocate for this segment list. add separator pages + * to the total count, and preallocate VA space. 
+ */ + n_segs += n_contig_segs - 1; + + /* now, preallocate VA space for these segments */ + + /* first, find suitable memseg list for this */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; + msl_idx++) { + msl = &mcfg->memsegs[msl_idx]; + + if (msl->base_va != NULL) + continue; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + /* now, allocate fbarray itself */ + if (alloc_memseg_list(msl, page_sz, n_segs, socket, + msl_idx) < 0) + return -1; + + /* finally, allocate VA space */ + if (alloc_va_space(msl) < 0) + return -1; + } + } + return 0; +} + +/* + * We cannot reallocate memseg lists on the fly because PPC64 stores pages + * backwards, therefore we have to process the entire memseg first before + * remapping it into memseg list VA space. + */ +static int +remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages) +{ + int cur_page, seg_start_page, new_memseg, ret; + + seg_start_page = 0; + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + + new_memseg = 0; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1]; + + /* if size is zero, no more pages left */ + if (cur->size == 0) + break; + + if (cur_page == 0) + new_memseg = 1; + else if (cur->socket_id != prev->socket_id) + new_memseg = 1; + else if (cur->size != prev->size) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start from higher + * address to lower address. Here, physical addresses are in + * descending order. 
+ */ + else if ((prev->physaddr - cur->physaddr) != cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != cur->size) + new_memseg = 1; +#endif + + if (new_memseg) { + /* if this isn't the first time, remap segment */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + /* remember where we started */ + seg_start_page = cur_page; + } + /* continuation of previous memseg */ + } + /* we were stopped, but we didn't remap the last segment, do it now */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + return 0; +} + static inline uint64_t get_socket_mem_size(int socket) { @@ -788,7 +1049,7 @@ get_socket_mem_size(int socket) for (i = 0; i < internal_config.num_hugepage_sizes; i++){ struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - if (hpi->hugedir != NULL) + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) size += hpi->hugepage_sz * hpi->num_pages[socket]; } @@ -818,8 +1079,10 @@ calc_num_pages_per_socket(uint64_t * memory, /* if specific memory amounts per socket weren't requested */ if (internal_config.force_sockets == 0) { + size_t total_size; +#ifdef RTE_ARCH_64 int cpu_per_socket[RTE_MAX_NUMA_NODES]; - size_t default_size, total_size; + size_t default_size; unsigned lcore_id; /* Compute number of cores per socket */ @@ -837,7 +1100,7 @@ calc_num_pages_per_socket(uint64_t * memory, /* Set memory amount per socket */ default_size = (internal_config.memory * cpu_per_socket[socket]) - / rte_lcore_count(); + / rte_lcore_count(); /* Limit to maximum available memory on socket */ default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); @@ -854,18 +1117,40 @@ calc_num_pages_per_socket(uint64_t * memory, for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { /* take whatever is available */ default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], - 
total_size); + total_size); /* Update sizes */ memory[socket] += default_size; total_size -= default_size; } +#else + /* in 32-bit mode, allocate all of the memory only on master + * lcore socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; + socket++) { + struct rte_config *cfg = rte_eal_get_configuration(); + unsigned int master_lcore_socket; + + master_lcore_socket = + rte_lcore_to_socket_id(cfg->master_lcore); + + if (master_lcore_socket != socket) + continue; + + /* Update sizes */ + memory[socket] = total_size; + break; + } +#endif } for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { /* skips if the memory on specific socket wasn't requested */ for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ - hp_used[i].hugedir = hp_info[i].hugedir; + strlcpy(hp_used[i].hugedir, hp_info[i].hugedir, + sizeof(hp_used[i].hugedir)); hp_used[i].num_pages[socket] = RTE_MIN( memory[socket] / hp_info[i].hugepage_sz, hp_info[i].num_pages[socket]); @@ -907,7 +1192,8 @@ calc_num_pages_per_socket(uint64_t * memory, } } /* if we didn't satisfy all memory requirements per socket */ - if (memory[socket] > 0) { + if (memory[socket] > 0 && + internal_config.socket_mem[socket] != 0) { /* to prevent icc errors */ requested = (unsigned) (internal_config.socket_mem[socket] / 0x100000); @@ -939,7 +1225,7 @@ eal_get_hugepage_mem_size(void) for (i = 0; i < internal_config.num_hugepage_sizes; i++) { struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - if (hpi->hugedir != NULL) { + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { size += hpi->hugepage_sz * hpi->num_pages[j]; } @@ -987,17 +1273,19 @@ huge_recover_sigbus(void) * 6. unmap the first mapping * 7. 
fill memsegs in configuration with contiguous zones */ -int -rte_eal_hugepage_init(void) +static int +eal_legacy_hugepage_init(void) { struct rte_mem_config *mcfg; struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + struct rte_fbarray *arr; + struct rte_memseg *ms; uint64_t memory[RTE_MAX_NUMA_NODES]; unsigned hp_offset; - int i, j, new_memseg; + int i, j; int nr_hugefiles, nr_hugepages = 0; void *addr; @@ -1010,21 +1298,54 @@ rte_eal_hugepage_init(void) /* hugetlbfs can be disabled */ if (internal_config.no_hugetlbfs) { + struct rte_memseg_list *msl; + uint64_t page_sz; + int n_segs, cur_seg; + + /* nohuge mode is legacy mode */ + internal_config.legacy_mem = 1; + + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + page_sz = RTE_PGSIZE_4K; + n_segs = internal_config.memory / page_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, strerror(errno)); return -1; } - if (rte_eal_iova_mode() == RTE_IOVA_VA) - mcfg->memseg[0].iova = (uintptr_t)addr; - else - mcfg->memseg[0].iova = RTE_BAD_IOVA; - mcfg->memseg[0].addr = addr; - mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K; - mcfg->memseg[0].len = internal_config.memory; - mcfg->memseg[0].socket_id = 0; + msl->base_va = addr; + msl->page_sz = page_sz; + msl->socket_id = 0; + + /* populate memsegs. 
each memseg is one page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->socket_id = 0; + ms->len = page_sz; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, (size_t)page_sz); + } return 0; } @@ -1057,7 +1378,6 @@ rte_eal_hugepage_init(void) for (i = 0; i < RTE_MAX_NUMA_NODES; i++) memory[i] = internal_config.socket_mem[i]; - /* map all hugepages and sort them */ for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ unsigned pages_old, pages_new; @@ -1075,8 +1395,7 @@ rte_eal_hugepage_init(void) /* map all hugepages available */ pages_old = hpi->num_pages[0]; - pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, - memory, 1); + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory); if (pages_new < pages_old) { RTE_LOG(DEBUG, EAL, "%d not %d hugepages of size %u MB allocated\n", @@ -1091,7 +1410,8 @@ rte_eal_hugepage_init(void) continue; } - if (phys_addrs_available) { + if (phys_addrs_available && + rte_eal_iova_mode() != RTE_IOVA_VA) { /* find physical addresses for each hugepage */ if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { RTE_LOG(DEBUG, EAL, "Failed to find phys addr " @@ -1118,18 +1438,6 @@ rte_eal_hugepage_init(void) qsort(&tmp_hp[hp_offset], hpi->num_pages[0], sizeof(struct hugepage_file), cmp_physaddr); - /* remap all hugepages */ - if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) != - hpi->num_pages[0]) { - RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n", - (unsigned)(hpi->hugepage_sz / 0x100000)); - goto fail; - } - - /* unmap original mappings */ - if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0) - goto fail; - /* we have processed a num of hugepages of this size, so inc offset */ hp_offset += hpi->num_pages[0]; } @@ -1191,7 +1499,7 @@ 
rte_eal_hugepage_init(void) } /* create shared memory */ - hugepage = create_shared_memory(eal_hugepage_info_path(), + hugepage = create_shared_memory(eal_hugepage_data_path(), nr_hugefiles * sizeof(struct hugepage_file)); if (hugepage == NULL) { @@ -1212,7 +1520,7 @@ rte_eal_hugepage_init(void) /* * copy stuff from malloc'd hugepage* to the actual shared memory. - * this procedure only copies those hugepages that have final_va + * this procedure only copies those hugepages that have orig_va * not NULL. has overflow protection. */ if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, @@ -1221,6 +1529,23 @@ rte_eal_hugepage_init(void) goto fail; } +#ifndef RTE_ARCH_64 + /* for legacy 32-bit mode, we did not preallocate VA space, so do it */ + if (internal_config.legacy_mem && + prealloc_segments(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n"); + goto fail; + } +#endif + + /* remap all pages we do need into memseg list VA space, so that those + * pages become first-class citizens in DPDK memory subsystem + */ + if (remap_needed_hugepages(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n"); + goto fail; + } + /* free the hugepage backing files */ if (internal_config.hugepage_unlink && unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { @@ -1232,75 +1557,30 @@ rte_eal_hugepage_init(void) free(tmp_hp); tmp_hp = NULL; - /* first memseg index shall be 0 after incrementing it below */ - j = -1; - for (i = 0; i < nr_hugefiles; i++) { - new_memseg = 0; - - /* if this is a new section, create a new memseg */ - if (i == 0) - new_memseg = 1; - else if (hugepage[i].socket_id != hugepage[i-1].socket_id) - new_memseg = 1; - else if (hugepage[i].size != hugepage[i-1].size) - new_memseg = 1; - -#ifdef RTE_ARCH_PPC_64 - /* On PPC64 architecture, the mmap always start from higher - * virtual address to lower address. 
Here, both the physical - * address and virtual address are in descending order */ - else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) != - hugepage[i].size) - new_memseg = 1; - else if (((unsigned long)hugepage[i-1].final_va - - (unsigned long)hugepage[i].final_va) != hugepage[i].size) - new_memseg = 1; -#else - else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) != - hugepage[i].size) - new_memseg = 1; - else if (((unsigned long)hugepage[i].final_va - - (unsigned long)hugepage[i-1].final_va) != hugepage[i].size) - new_memseg = 1; -#endif + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); - if (new_memseg) { - j += 1; - if (j == RTE_MAX_MEMSEG) - break; + /* we're not going to allocate more pages, so release VA space for + * unused memseg lists + */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + size_t mem_sz; - mcfg->memseg[j].iova = hugepage[i].physaddr; - mcfg->memseg[j].addr = hugepage[i].final_va; - mcfg->memseg[j].len = hugepage[i].size; - mcfg->memseg[j].socket_id = hugepage[i].socket_id; - mcfg->memseg[j].hugepage_sz = hugepage[i].size; - } - /* continuation of previous memseg */ - else { -#ifdef RTE_ARCH_PPC_64 - /* Use the phy and virt address of the last page as segment - * address for IBM Power architecture */ - mcfg->memseg[j].iova = hugepage[i].physaddr; - mcfg->memseg[j].addr = hugepage[i].final_va; -#endif - mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz; - } - hugepage[i].memseg_id = j; - } + /* skip inactive lists */ + if (msl->base_va == NULL) + continue; + /* skip lists where there is at least one page allocated */ + if (msl->memseg_arr.count > 0) + continue; + /* this is an unused list, deallocate it */ + mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len; + munmap(msl->base_va, mem_sz); + msl->base_va = NULL; - if (i < nr_hugefiles) { - RTE_LOG(ERR, EAL, "Can only reserve %d pages " - "from %d requested\n" - "Current %s=%d is not enough\n" - "Please either 
increase it or request less amount " - "of memory.\n", - i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG), - RTE_MAX_MEMSEG); - goto fail; + /* destroy backing fbarray */ + rte_fbarray_destroy(&msl->memseg_arr); } - munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); - return 0; fail: @@ -1312,6 +1592,104 @@ fail: return -1; } +static int __rte_unused +hugepage_count_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct hugepage_info *hpi = arg; + + if (msl->page_sz != hpi->hugepage_sz) + return 0; + + hpi->num_pages[msl->socket_id] += msl->memseg_arr.len; + return 0; +} + +static int +eal_hugepage_init(void) +{ + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + uint64_t memory[RTE_MAX_NUMA_NODES]; + int hp_sz_idx, socket_id; + + test_phys_addrs_available(); + + memset(used_hp, 0, sizeof(used_hp)); + + for (hp_sz_idx = 0; + hp_sz_idx < (int) internal_config.num_hugepage_sizes; + hp_sz_idx++) { +#ifndef RTE_ARCH_64 + struct hugepage_info dummy; + unsigned int i; +#endif + /* also initialize used_hp hugepage sizes in used_hp */ + struct hugepage_info *hpi; + hpi = &internal_config.hugepage_info[hp_sz_idx]; + used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz; + +#ifndef RTE_ARCH_64 + /* for 32-bit, limit number of pages on socket to whatever we've + * preallocated, as we cannot allocate more. + */ + memset(&dummy, 0, sizeof(dummy)); + dummy.hugepage_sz = hpi->hugepage_sz; + if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0) + return -1; + + for (i = 0; i < RTE_DIM(dummy.num_pages); i++) { + hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i], + dummy.num_pages[i]); + } +#endif + } + + /* make a copy of socket_mem, needed for balanced allocation. 
*/ + for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++) + memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx]; + + /* calculate final number of pages */ + if (calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes) < 0) + return -1; + + for (hp_sz_idx = 0; + hp_sz_idx < (int)internal_config.num_hugepage_sizes; + hp_sz_idx++) { + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; + socket_id++) { + struct rte_memseg **pages; + struct hugepage_info *hpi = &used_hp[hp_sz_idx]; + unsigned int num_pages = hpi->num_pages[socket_id]; + int num_pages_alloc, i; + + if (num_pages == 0) + continue; + + pages = malloc(sizeof(*pages) * num_pages); + + RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n", + num_pages, hpi->hugepage_sz >> 20, socket_id); + + num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages, + num_pages, hpi->hugepage_sz, + socket_id, true); + if (num_pages_alloc < 0) { + free(pages); + return -1; + } + + /* mark preallocated pages as unfreeable */ + for (i = 0; i < num_pages_alloc; i++) { + struct rte_memseg *ms = pages[i]; + ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; + } + free(pages); + } + } + return 0; +} + /* * uses fstat to report the size of a file on disk */ @@ -1330,16 +1708,15 @@ getFileSize(int fd) * configuration and finds the hugepages which form that segment, mapping them * in order to form a contiguous block in the virtual memory space */ -int -rte_eal_hugepage_attach(void) +static int +eal_legacy_hugepage_attach(void) { - const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct hugepage_file *hp = NULL; - unsigned num_hp = 0; - unsigned i, s = 0; /* s used to track the segment number */ - unsigned max_seg = RTE_MAX_MEMSEG; + unsigned int num_hp = 0; + unsigned int i = 0; + unsigned int cur_seg; off_t size = 0; - int fd, fd_zero = -1, fd_hugepage = -1; + int fd, fd_hugepage = -1; if (aslr_enabled() > 0) { 
RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " @@ -1350,137 +1727,114 @@ rte_eal_hugepage_attach(void) test_phys_addrs_available(); - fd_zero = open("/dev/zero", O_RDONLY); - if (fd_zero < 0) { - RTE_LOG(ERR, EAL, "Could not open /dev/zero\n"); - goto error; - } - fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY); + fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY); if (fd_hugepage < 0) { - RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path()); + RTE_LOG(ERR, EAL, "Could not open %s\n", + eal_hugepage_data_path()); goto error; } - /* map all segments into memory to make sure we get the addrs */ - for (s = 0; s < RTE_MAX_MEMSEG; ++s) { - void *base_addr; - - /* - * the first memory segment with len==0 is the one that - * follows the last valid segment. - */ - if (mcfg->memseg[s].len == 0) - break; - - /* - * fdzero is mmapped to get a contiguous block of virtual - * addresses of the appropriate memseg size. - * use mmap to get identical addresses as the primary process. 
- */ - base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len, - PROT_READ, -#ifdef RTE_ARCH_PPC_64 - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -#else - MAP_PRIVATE, -#endif - fd_zero, 0); - if (base_addr == MAP_FAILED || - base_addr != mcfg->memseg[s].addr) { - max_seg = s; - if (base_addr != MAP_FAILED) { - /* errno is stale, don't use */ - RTE_LOG(ERR, EAL, "Could not mmap %llu bytes " - "in /dev/zero at [%p], got [%p] - " - "please use '--base-virtaddr' option\n", - (unsigned long long)mcfg->memseg[s].len, - mcfg->memseg[s].addr, base_addr); - munmap(base_addr, mcfg->memseg[s].len); - } else { - RTE_LOG(ERR, EAL, "Could not mmap %llu bytes " - "in /dev/zero at [%p]: '%s'\n", - (unsigned long long)mcfg->memseg[s].len, - mcfg->memseg[s].addr, strerror(errno)); - } - if (aslr_enabled() > 0) { - RTE_LOG(ERR, EAL, "It is recommended to " - "disable ASLR in the kernel " - "and retry running both primary " - "and secondary processes\n"); - } - goto error; - } - } - size = getFileSize(fd_hugepage); hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); if (hp == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path()); + RTE_LOG(ERR, EAL, "Could not mmap %s\n", + eal_hugepage_data_path()); goto error; } num_hp = size / sizeof(struct hugepage_file); RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); - s = 0; - while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){ - void *addr, *base_addr; - uintptr_t offset = 0; - size_t mapping_size; - /* - * free previously mapped memory so we can map the - * hugepages into the space - */ - base_addr = mcfg->memseg[s].addr; - munmap(base_addr, mcfg->memseg[s].len); - - /* find the hugepages for this segment and map them - * we don't need to worry about order, as the server sorted the - * entries before it did the second mmap of them */ - for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){ - if (hp[i].memseg_id == (int)s){ - fd = open(hp[i].filepath, O_RDWR); - if (fd < 0) { - 
RTE_LOG(ERR, EAL, "Could not open %s\n", - hp[i].filepath); - goto error; - } - mapping_size = hp[i].size; - addr = mmap(RTE_PTR_ADD(base_addr, offset), - mapping_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - close(fd); /* close file both on success and on failure */ - if (addr == MAP_FAILED || - addr != RTE_PTR_ADD(base_addr, offset)) { - RTE_LOG(ERR, EAL, "Could not mmap %s\n", - hp[i].filepath); - goto error; - } - offset+=mapping_size; - } + /* map all segments into memory to make sure we get the addrs. the + * segments themselves are already in memseg list (which is shared and + * has its VA space already preallocated), so we just need to map + * everything into correct addresses. + */ + for (i = 0; i < num_hp; i++) { + struct hugepage_file *hf = &hp[i]; + size_t map_sz = hf->size; + void *map_addr = hf->final_va; + + /* if size is zero, no more pages left */ + if (map_sz == 0) + break; + + fd = open(hf->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s: %s\n", + hf->filepath, strerror(errno)); + goto error; + } + + map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (map_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not map %s: %s\n", + hf->filepath, strerror(errno)); + close(fd); + goto error; + } + + /* set shared lock on the file. 
*/ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", + __func__, strerror(errno)); + close(fd); + goto error; } - RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s, - (unsigned long long)mcfg->memseg[s].len); - s++; + + close(fd); } /* unmap the hugepage config file, since we are done using it */ munmap(hp, size); - close(fd_zero); close(fd_hugepage); return 0; error: - for (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++) - munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len); + /* map all segments into memory to make sure we get the addrs */ + cur_seg = 0; + for (cur_seg = 0; cur_seg < i; cur_seg++) { + struct hugepage_file *hf = &hp[i]; + size_t map_sz = hf->size; + void *map_addr = hf->final_va; + + munmap(map_addr, map_sz); + } if (hp != NULL && hp != MAP_FAILED) munmap(hp, size); - if (fd_zero >= 0) - close(fd_zero); if (fd_hugepage >= 0) close(fd_hugepage); return -1; } +static int +eal_hugepage_attach(void) +{ + if (eal_memalloc_sync_with_primary()) { + RTE_LOG(ERR, EAL, "Could not map memory from primary process\n"); + if (aslr_enabled() > 0) + RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n"); + return -1; + } + return 0; +} + +int +rte_eal_hugepage_init(void) +{ + return internal_config.legacy_mem ? + eal_legacy_hugepage_init() : + eal_hugepage_init(); +} + +int +rte_eal_hugepage_attach(void) +{ + return internal_config.legacy_mem ? 
+ eal_legacy_hugepage_attach() : + eal_hugepage_attach(); +} + int rte_eal_using_phys_addrs(void) { diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c index 08e150b7..f652ff98 100644 --- a/lib/librte_eal/linuxapp/eal/eal_thread.c +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -119,7 +119,7 @@ eal_thread_loop(__attribute__((unused)) void *arg) if (eal_thread_set_affinity() < 0) rte_panic("cannot set affinity\n"); - ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN); + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n", lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "..."); diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c index 161322f2..2766bd78 100644 --- a/lib/librte_eal/linuxapp/eal/eal_timer.c +++ b/lib/librte_eal/linuxapp/eal/eal_timer.c @@ -137,7 +137,6 @@ int rte_eal_hpet_init(int make_default) { int fd, ret; - char thread_name[RTE_MAX_THREAD_NAME_LEN]; if (internal_config.no_hpet) { RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); @@ -178,7 +177,7 @@ rte_eal_hpet_init(int make_default) /* create a thread that will increment a global variable for * msb (hpet is 32 bits by default under linux) */ - ret = pthread_create(&msb_inc_thread_id, NULL, + ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, (void *(*)(void *))hpet_msb_inc, NULL); if (ret != 0) { RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); @@ -186,15 +185,6 @@ rte_eal_hpet_init(int make_default) return -1; } - /* - * Set thread_name for aid in debugging. 
- */ - snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "hpet-msb-inc"); - ret = rte_thread_setname(msb_inc_thread_id, thread_name); - if (ret != 0) - RTE_LOG(DEBUG, EAL, - "Cannot set HPET timer thread name!\n"); - if (make_default) eal_timer_source = EAL_TIMER_HPET; return 0; diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c index e44ae4d0..a2bbdfbf 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation */ +#include <inttypes.h> #include <string.h> #include <fcntl.h> #include <unistd.h> #include <sys/ioctl.h> +#include <rte_errno.h> #include <rte_log.h> #include <rte_memory.h> #include <rte_eal_memconfig.h> @@ -18,59 +20,294 @@ #ifdef VFIO_PRESENT +#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb" + +/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can + * recreate the mappings for DPDK segments, but we cannot do so for memory that + * was registered by the user themselves, so we need to store the user mappings + * somewhere, to recreate them later. 
+ */ +#define VFIO_MAX_USER_MEM_MAPS 256 +struct user_mem_map { + uint64_t addr; + uint64_t iova; + uint64_t len; +}; + +struct user_mem_maps { + rte_spinlock_recursive_t lock; + int n_maps; + struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS]; +}; + +struct vfio_config { + int vfio_enabled; + int vfio_container_fd; + int vfio_active_groups; + const struct vfio_iommu_type *vfio_iommu_type; + struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; + struct user_mem_maps mem_maps; +}; + /* per-process VFIO config */ -static struct vfio_config vfio_cfg; +static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS]; +static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0]; static int vfio_type1_dma_map(int); +static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); static int vfio_spapr_dma_map(int); +static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); static int vfio_noiommu_dma_map(int); +static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, + uint64_t iova, uint64_t len, int do_map); /* IOMMU types we support */ static const struct vfio_iommu_type iommu_types[] = { /* x86 IOMMU, otherwise known as type 1 */ - { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map}, + { + .type_id = RTE_VFIO_TYPE1, + .name = "Type 1", + .dma_map_func = &vfio_type1_dma_map, + .dma_user_map_func = &vfio_type1_dma_mem_map + }, /* ppc64 IOMMU, otherwise known as spapr */ - { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map}, + { + .type_id = RTE_VFIO_SPAPR, + .name = "sPAPR", + .dma_map_func = &vfio_spapr_dma_map, + .dma_user_map_func = &vfio_spapr_dma_mem_map + }, /* IOMMU-less mode */ - { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map}, + { + .type_id = RTE_VFIO_NOIOMMU, + .name = "No-IOMMU", + .dma_map_func = &vfio_noiommu_dma_map, + .dma_user_map_func = &vfio_noiommu_dma_mem_map + }, }; -int -vfio_get_group_fd(int iommu_group_no) +/* for sPAPR IOMMU, we will need 
to walk memseg list, but we cannot use + * rte_memseg_walk() because by the time we enter callback we will be holding a + * write lock, so regular rte-memseg_walk will deadlock. copying the same + * iteration code everywhere is not ideal as well. so, use a lockless copy of + * memseg walk here. + */ +static int +memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg) { - int i; - int vfio_group_fd; - char filename[PATH_MAX]; - struct vfio_group *cur_grp; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; - /* check if we already have the group descriptor open */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no) - return vfio_cfg.vfio_groups[i].fd; + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; - /* Lets see first if there is room for a new group */ - if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); - return -1; - } + if (msl->memseg_arr.count == 0) + continue; - /* Now lets get an index for the new group */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg.vfio_groups[i].group_no == -1) { - cur_grp = &vfio_cfg.vfio_groups[i]; - break; + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + ms = rte_fbarray_get(arr, ms_idx); + ret = func(msl, ms, arg); + if (ret < 0) + return -1; + if (ret > 0) + return 1; + ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1); } + } + return 0; +} - /* This should not happen */ - if (i == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); +static int +is_null_map(const struct user_mem_map *map) +{ + return map->addr == 0 && map->iova == 0 && map->len == 0; +} + +/* we may need to merge user mem maps together in case of user mapping/unmapping + * chunks of memory, so we'll need a 
comparator function to sort segments. + */ +static int +user_mem_map_cmp(const void *a, const void *b) +{ + const struct user_mem_map *umm_a = a; + const struct user_mem_map *umm_b = b; + + /* move null entries to end */ + if (is_null_map(umm_a)) + return 1; + if (is_null_map(umm_b)) + return -1; + + /* sort by iova first */ + if (umm_a->iova < umm_b->iova) return -1; + if (umm_a->iova > umm_b->iova) + return 1; + + if (umm_a->addr < umm_b->addr) + return -1; + if (umm_a->addr > umm_b->addr) + return 1; + + if (umm_a->len < umm_b->len) + return -1; + if (umm_a->len > umm_b->len) + return 1; + + return 0; +} + +/* adjust user map entry. this may result in shortening of existing map, or in + * splitting existing map in two pieces. + */ +static void +adjust_map(struct user_mem_map *src, struct user_mem_map *end, + uint64_t remove_va_start, uint64_t remove_len) +{ + /* if va start is same as start address, we're simply moving start */ + if (remove_va_start == src->addr) { + src->addr += remove_len; + src->iova += remove_len; + src->len -= remove_len; + } else if (remove_va_start + remove_len == src->addr + src->len) { + /* we're shrinking mapping from the end */ + src->len -= remove_len; + } else { + /* we're blowing a hole in the middle */ + struct user_mem_map tmp; + uint64_t total_len = src->len; + + /* adjust source segment length */ + src->len = remove_va_start - src->addr; + + /* create temporary segment in the middle */ + tmp.addr = src->addr + src->len; + tmp.iova = src->iova + src->len; + tmp.len = remove_len; + + /* populate end segment - this one we will be keeping */ + end->addr = tmp.addr + tmp.len; + end->iova = tmp.iova + tmp.len; + end->len = total_len - src->len - tmp.len; } +} + +/* try merging two maps into one, return 1 if succeeded */ +static int +merge_map(struct user_mem_map *left, struct user_mem_map *right) +{ + if (left->addr + left->len != right->addr) + return 0; + if (left->iova + left->len != right->iova) + return 0; + + left->len += 
right->len; + + memset(right, 0, sizeof(*right)); + + return 1; +} + +static struct user_mem_map * +find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr, + uint64_t iova, uint64_t len) +{ + uint64_t va_end = addr + len; + uint64_t iova_end = iova + len; + int i; + + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = &user_mem_maps->maps[i]; + uint64_t map_va_end = map->addr + map->len; + uint64_t map_iova_end = map->iova + map->len; + + /* check start VA */ + if (addr < map->addr || addr >= map_va_end) + continue; + /* check if VA end is within boundaries */ + if (va_end <= map->addr || va_end > map_va_end) + continue; + + /* check start IOVA */ + if (iova < map->iova || iova >= map_iova_end) + continue; + /* check if IOVA end is within boundaries */ + if (iova_end <= map->iova || iova_end > map_iova_end) + continue; + + /* we've found our map */ + return map; + } + return NULL; +} + +/* this will sort all user maps, and merge/compact any adjacent maps */ +static void +compact_user_maps(struct user_mem_maps *user_mem_maps) +{ + int i, n_merged, cur_idx; + + qsort(user_mem_maps->maps, user_mem_maps->n_maps, + sizeof(user_mem_maps->maps[0]), user_mem_map_cmp); + + /* we'll go over the list backwards when merging */ + n_merged = 0; + for (i = user_mem_maps->n_maps - 2; i >= 0; i--) { + struct user_mem_map *l, *r; + + l = &user_mem_maps->maps[i]; + r = &user_mem_maps->maps[i + 1]; + + if (is_null_map(l) || is_null_map(r)) + continue; + + if (merge_map(l, r)) + n_merged++; + } + + /* the entries are still sorted, but now they have holes in them, so + * walk through the list and remove the holes + */ + if (n_merged > 0) { + cur_idx = 0; + for (i = 0; i < user_mem_maps->n_maps; i++) { + if (!is_null_map(&user_mem_maps->maps[i])) { + struct user_mem_map *src, *dst; + + src = &user_mem_maps->maps[i]; + dst = &user_mem_maps->maps[cur_idx++]; + + if (src != dst) { + memcpy(dst, src, sizeof(*src)); + memset(src, 0, sizeof(*src)); + } + } 
+ } + user_mem_maps->n_maps = cur_idx; + } +} + +static int +vfio_open_group_fd(int iommu_group_num) +{ + int vfio_group_fd; + char filename[PATH_MAX]; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + /* if primary, try to open the group */ if (internal_config.process_type == RTE_PROC_PRIMARY) { /* try regular group format */ snprintf(filename, sizeof(filename), - VFIO_GROUP_FMT, iommu_group_no); + VFIO_GROUP_FMT, iommu_group_num); vfio_group_fd = open(filename, O_RDWR); if (vfio_group_fd < 0) { /* if file not found, it's not an error */ @@ -82,7 +319,8 @@ vfio_get_group_fd(int iommu_group_no) /* special case: try no-IOMMU path as well */ snprintf(filename, sizeof(filename), - VFIO_NOIOMMU_GROUP_FMT, iommu_group_no); + VFIO_NOIOMMU_GROUP_FMT, + iommu_group_num); vfio_group_fd = open(filename, O_RDWR); if (vfio_group_fd < 0) { if (errno != ENOENT) { @@ -95,162 +333,293 @@ vfio_get_group_fd(int iommu_group_no) /* noiommu group found */ } - cur_grp->group_no = iommu_group_no; - cur_grp->fd = vfio_group_fd; - vfio_cfg.vfio_active_groups++; return vfio_group_fd; } /* if we're in a secondary process, request group fd from the primary - * process via our socket + * process via mp channel. 
*/ - else { - int socket_fd, ret; + p->req = SOCKET_REQ_GROUP; + p->group_num = iommu_group_num; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_group_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_group_fd = mp_rep->fds[0]; + } else if (p->result == SOCKET_NO_FD) { + RTE_LOG(ERR, EAL, " bad VFIO group fd\n"); + vfio_group_fd = 0; + } + free(mp_reply.msgs); + } - socket_fd = vfio_mp_sync_connect_to_primary(); + if (vfio_group_fd < 0) + RTE_LOG(ERR, EAL, " cannot request group fd\n"); + return vfio_group_fd; +} - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); - return -1; - } - if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) { - RTE_LOG(ERR, EAL, " cannot request container fd!\n"); - close(socket_fd); - return -1; - } - if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) { - RTE_LOG(ERR, EAL, " cannot send group number!\n"); - close(socket_fd); - return -1; - } - ret = vfio_mp_sync_receive_request(socket_fd); - switch (ret) { - case SOCKET_NO_FD: - close(socket_fd); - return 0; - case SOCKET_OK: - vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd); - /* if we got the fd, store it and return it */ - if (vfio_group_fd > 0) { - close(socket_fd); - cur_grp->group_no = iommu_group_no; - cur_grp->fd = vfio_group_fd; - vfio_cfg.vfio_active_groups++; - return vfio_group_fd; - } - /* fall-through on error */ - default: - RTE_LOG(ERR, EAL, " cannot get container fd!\n"); - close(socket_fd); - return -1; +static struct vfio_config * +get_vfio_cfg_by_group_num(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + if 
(vfio_cfg->vfio_groups[j].group_num == + iommu_group_num) + return vfio_cfg; } } - return -1; + + return NULL; } +static struct vfio_config * +get_vfio_cfg_by_group_fd(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return vfio_cfg; + } -static int -get_vfio_group_idx(int vfio_group_fd) + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_container_fd(int container_fd) { int i; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == container_fd) + return &vfio_cfgs[i]; + } + + return NULL; +} + +int +rte_vfio_get_group_fd(int iommu_group_num) +{ + int i; + int vfio_group_fd; + struct vfio_group *cur_grp; + struct vfio_config *vfio_cfg; + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? 
vfio_cfg : default_vfio_cfg; + + /* check if we already have the group descriptor open */ for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd) - return i; + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) + return vfio_cfg->vfio_groups[i].fd; + + /* Lets see first if there is room for a new group */ + if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + return -1; + } + + /* Now lets get an index for the new group */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == -1) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); + return -1; + } + + vfio_group_fd = vfio_open_group_fd(iommu_group_num); + if (vfio_group_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); + return -1; + } + + cur_grp->group_num = iommu_group_num; + cur_grp->fd = vfio_group_fd; + vfio_cfg->vfio_active_groups++; + + return vfio_group_fd; +} + +static int +get_vfio_group_idx(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return j; + } + return -1; } static void vfio_group_device_get(int vfio_group_fd) { + struct vfio_config *vfio_cfg; int i; + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + i = get_vfio_group_idx(vfio_group_fd); if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); else - vfio_cfg.vfio_groups[i].devices++; + vfio_cfg->vfio_groups[i].devices++; } static void vfio_group_device_put(int vfio_group_fd) { + struct vfio_config *vfio_cfg; int i; + vfio_cfg = 
get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + i = get_vfio_group_idx(vfio_group_fd); if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); else - vfio_cfg.vfio_groups[i].devices--; + vfio_cfg->vfio_groups[i].devices--; } static int vfio_group_device_count(int vfio_group_fd) { + struct vfio_config *vfio_cfg; int i; + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + i = get_vfio_group_idx(vfio_group_fd); if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) { RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); return -1; } - return vfio_cfg.vfio_groups[i].devices; + return vfio_cfg->vfio_groups[i].devices; +} + +static void +vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, + void *arg __rte_unused) +{ + struct rte_memseg_list *msl; + struct rte_memseg *ms; + size_t cur_len = 0; + + msl = rte_mem_virt2memseg_list(addr); + + /* for IOVA as VA mode, no need to care for IOVA addresses */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) { + uint64_t vfio_va = (uint64_t)(uintptr_t)addr; + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 1); + else + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 0); + return; + } + + /* memsegs are contiguous in memory */ + ms = rte_mem_virt2memseg(addr, msl); + while (cur_len < len) { + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, + ms->iova, ms->len, 1); + else + vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, + ms->iova, ms->len, 0); + + cur_len += ms->len; + ++ms; + } } int rte_vfio_clear_group(int vfio_group_fd) { int i; - int socket_fd, ret; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param 
*)mp_req.param; + struct vfio_config *vfio_cfg; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } if (internal_config.process_type == RTE_PROC_PRIMARY) { i = get_vfio_group_idx(vfio_group_fd); if (i < 0) return -1; - vfio_cfg.vfio_groups[i].group_no = -1; - vfio_cfg.vfio_groups[i].fd = -1; - vfio_cfg.vfio_groups[i].devices = 0; - vfio_cfg.vfio_active_groups--; + vfio_cfg->vfio_groups[i].group_num = -1; + vfio_cfg->vfio_groups[i].fd = -1; + vfio_cfg->vfio_groups[i].devices = 0; + vfio_cfg->vfio_active_groups--; return 0; } - /* This is just for SECONDARY processes */ - socket_fd = vfio_mp_sync_connect_to_primary(); - - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); - return -1; - } - - if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) { - RTE_LOG(ERR, EAL, " cannot request container fd!\n"); - close(socket_fd); - return -1; - } + p->req = SOCKET_CLR_GROUP; + p->group_num = vfio_group_fd; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK) { + free(mp_reply.msgs); + return 0; + } else if (p->result == SOCKET_NO_FD) + RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n"); + else + RTE_LOG(ERR, EAL, " no such VFIO group fd!\n"); - if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) { - RTE_LOG(ERR, EAL, " cannot send group fd!\n"); - close(socket_fd); - return -1; + free(mp_reply.msgs); } - ret = vfio_mp_sync_receive_request(socket_fd); - switch (ret) { - case SOCKET_NO_FD: - RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n"); - close(socket_fd); - break; - case SOCKET_OK: - close(socket_fd); - return 0; - case SOCKET_ERR: - RTE_LOG(ERR, EAL, " Socket error\n"); - close(socket_fd); - break; - default: - 
RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret); - close(socket_fd); - } return -1; } @@ -258,15 +627,20 @@ int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, int *vfio_dev_fd, struct vfio_device_info *device_info) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock; struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int vfio_container_fd; int vfio_group_fd; - int iommu_group_no; - int ret; + int iommu_group_num; + int i, ret; /* get group number */ - ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no); + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); if (ret == 0) { RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", dev_addr); @@ -278,7 +652,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, return -1; /* get the actual group fd */ - vfio_group_fd = vfio_get_group_fd(iommu_group_no); + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); if (vfio_group_fd < 0) return -1; @@ -309,12 +683,18 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, return -1; } + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + vfio_container_fd = vfio_cfg->vfio_container_fd; + user_mem_maps = &vfio_cfg->mem_maps; + /* check if group does not have a container yet */ if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { /* add group to a container */ ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, - &vfio_cfg.vfio_container_fd); + &vfio_container_fd); if (ret) { RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " "error %i (%s)\n", dev_addr, errno, strerror(errno)); @@ -332,10 +712,12 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, * functionality. 
*/ if (internal_config.process_type == RTE_PROC_PRIMARY && - vfio_cfg.vfio_active_groups == 1) { + vfio_cfg->vfio_active_groups == 1 && + vfio_group_device_count(vfio_group_fd) == 0) { + const struct vfio_iommu_type *t; + /* select an IOMMU type which we will be using */ - const struct vfio_iommu_type *t = - vfio_set_iommu_type(vfio_cfg.vfio_container_fd); + t = vfio_set_iommu_type(vfio_container_fd); if (!t) { RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", @@ -344,15 +726,75 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, rte_vfio_clear_group(vfio_group_fd); return -1; } - ret = t->dma_map_func(vfio_cfg.vfio_container_fd); + /* lock memory hotplug before mapping and release it + * after registering callback, to prevent races + */ + rte_rwlock_read_lock(mem_lock); + if (vfio_cfg == default_vfio_cfg) + ret = t->dma_map_func(vfio_container_fd); + else + ret = 0; if (ret) { RTE_LOG(ERR, EAL, " %s DMA remapping failed, error %i (%s)\n", dev_addr, errno, strerror(errno)); close(vfio_group_fd); rte_vfio_clear_group(vfio_group_fd); + rte_rwlock_read_unlock(mem_lock); return -1; } + + vfio_cfg->vfio_iommu_type = t; + + /* re-map all user-mapped segments */ + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* this IOMMU type may not support DMA mapping, but + * if we have mappings in the list - that means we have + * previously mapped something successfully, so we can + * be sure that DMA mapping is supported. 
+ */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map; + map = &user_mem_maps->maps[i]; + + ret = t->dma_user_map_func( + vfio_container_fd, + map->addr, map->iova, map->len, + 1); + if (ret) { + RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: " + "va: 0x%" PRIx64 " " + "iova: 0x%" PRIx64 " " + "len: 0x%" PRIu64 "\n", + map->addr, map->iova, + map->len); + rte_spinlock_recursive_unlock( + &user_mem_maps->lock); + rte_rwlock_read_unlock(mem_lock); + return -1; + } + } + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + + /* register callback for mem events */ + if (vfio_cfg == default_vfio_cfg) + ret = rte_mem_event_callback_register( + VFIO_MEM_EVENT_CLB_NAME, + vfio_mem_event_callback, NULL); + else + ret = 0; + /* unlock memory hotplug */ + rte_rwlock_read_unlock(mem_lock); + + if (ret && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n"); + return -1; + } + if (ret) + RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n"); + else + RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n"); } } @@ -390,30 +832,45 @@ int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int vfio_dev_fd) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock; struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + struct vfio_config *vfio_cfg; int vfio_group_fd; - int iommu_group_no; + int iommu_group_num; int ret; + /* we don't want any DMA mapping messages to come while we're detaching + * VFIO device, because this might be the last device and we might need + * to unregister the callback. 
+ */ + rte_rwlock_read_lock(mem_lock); + /* get group number */ - ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no); + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); if (ret <= 0) { RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n", dev_addr); /* This is an error at this point. */ - return -1; + ret = -1; + goto out; } /* get the actual group fd */ - vfio_group_fd = vfio_get_group_fd(iommu_group_no); + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); if (vfio_group_fd <= 0) { - RTE_LOG(INFO, EAL, "vfio_get_group_fd failed for %s\n", + RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n", dev_addr); - return -1; + ret = -1; + goto out; } + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + /* At this point we got an active group. Closing it will make the * container detachment. If this is the last active group, VFIO kernel * code will unset the container and the IOMMU mappings. @@ -423,7 +880,8 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, if (close(vfio_dev_fd) < 0) { RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n", dev_addr); - return -1; + ret = -1; + goto out; } /* An VFIO group can have several devices attached. Just when there is @@ -435,30 +893,53 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, if (close(vfio_group_fd) < 0) { RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n", dev_addr); - return -1; + ret = -1; + goto out; } if (rte_vfio_clear_group(vfio_group_fd) < 0) { RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", dev_addr); - return -1; + ret = -1; + goto out; } } - return 0; + /* if there are no active device groups, unregister the callback to + * avoid spurious attempts to map/unmap memory from VFIO. 
+ */ + if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0) + rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, + NULL); + + /* success */ + ret = 0; + +out: + rte_rwlock_read_unlock(mem_lock); + return ret; } int rte_vfio_enable(const char *modname) { /* initialize group list */ - int i; + int i, j; int vfio_available; - for (i = 0; i < VFIO_MAX_GROUPS; i++) { - vfio_cfg.vfio_groups[i].fd = -1; - vfio_cfg.vfio_groups[i].group_no = -1; - vfio_cfg.vfio_groups[i].devices = 0; + rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfgs[i].vfio_container_fd = -1; + vfio_cfgs[i].vfio_active_groups = 0; + vfio_cfgs[i].vfio_iommu_type = NULL; + vfio_cfgs[i].mem_maps.lock = lock; + + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + vfio_cfgs[i].vfio_groups[j].fd = -1; + vfio_cfgs[i].vfio_groups[j].group_num = -1; + vfio_cfgs[i].vfio_groups[j].devices = 0; + } } /* inform the user that we are probing for VFIO */ @@ -480,12 +961,12 @@ rte_vfio_enable(const char *modname) return 0; } - vfio_cfg.vfio_container_fd = vfio_get_container_fd(); + default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd(); /* check if we have VFIO driver enabled */ - if (vfio_cfg.vfio_container_fd != -1) { + if (default_vfio_cfg->vfio_container_fd != -1) { RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); - vfio_cfg.vfio_enabled = 1; + default_vfio_cfg->vfio_enabled = 1; } else { RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); } @@ -497,7 +978,7 @@ int rte_vfio_is_enabled(const char *modname) { const int mod_available = rte_eal_check_module(modname) > 0; - return vfio_cfg.vfio_enabled && mod_available; + return default_vfio_cfg->vfio_enabled && mod_available; } const struct vfio_iommu_type * @@ -558,9 +1039,14 @@ vfio_has_supported_extensions(int vfio_container_fd) } int -vfio_get_container_fd(void) +rte_vfio_get_container_fd(void) { int ret, vfio_container_fd; + struct rte_mp_msg 
mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + /* if we're in a primary process, try to open the container */ if (internal_config.process_type == RTE_PROC_PRIMARY) { @@ -591,39 +1077,35 @@ vfio_get_container_fd(void) } return vfio_container_fd; - } else { - /* - * if we're in a secondary process, request container fd from the - * primary process via our socket - */ - int socket_fd; - - socket_fd = vfio_mp_sync_connect_to_primary(); - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); - return -1; - } - if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) { - RTE_LOG(ERR, EAL, " cannot request container fd!\n"); - close(socket_fd); - return -1; - } - vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd); - if (vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, " cannot get container fd!\n"); - close(socket_fd); - return -1; + } + /* + * if we're in a secondary process, request container fd from the + * primary process via mp channel + */ + p->req = SOCKET_REQ_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_container_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + free(mp_reply.msgs); + return mp_rep->fds[0]; } - close(socket_fd); - return vfio_container_fd; + free(mp_reply.msgs); } + RTE_LOG(ERR, EAL, " cannot request container fd\n"); return -1; } int -vfio_get_group_no(const char *sysfs_base, - const char *dev_addr, int *iommu_group_no) +rte_vfio_get_group_num(const char *sysfs_base, + const char *dev_addr, int *iommu_group_num) { char linkname[PATH_MAX]; char filename[PATH_MAX]; @@ -655,7 +1137,7 @@ vfio_get_group_no(const char *sysfs_base, errno = 0; 
group_tok = tok[ret - 1]; end = group_tok; - *iommu_group_no = strtol(group_tok, &end, 10); + *iommu_group_num = strtol(group_tok, &end, 10); if ((end != group_tok && *end != '\0') || errno != 0) { RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); return -1; @@ -665,34 +1147,49 @@ vfio_get_group_no(const char *sysfs_base, } static int -vfio_type1_dma_map(int vfio_container_fd) +type1_map(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) { - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; + int *vfio_container_fd = arg; - /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - struct vfio_iommu_type1_dma_map dma_map; + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} - if (ms[i].addr == NULL) - break; +static int +vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + if (do_map != 0) { memset(&dma_map, 0, sizeof(dma_map)); dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = ms[i].addr_64; - dma_map.size = ms[i].len; - if (rte_eal_iova_mode() == RTE_IOVA_VA) - dma_map.iova = dma_map.vaddr; - else - dma_map.iova = ms[i].iova; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } else { + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + ret = ioctl(vfio_container_fd, 
VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " - "error %i (%s)\n", errno, - strerror(errno)); + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); return -1; } } @@ -701,24 +1198,107 @@ vfio_type1_dma_map(int vfio_container_fd) } static int -vfio_spapr_dma_map(int vfio_container_fd) +vfio_type1_dma_map(int vfio_container_fd) { - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; + return rte_memseg_walk(type1_map, &vfio_container_fd); +} + +static int +vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + + if (do_map != 0) { + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + } else { + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; + + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int 
+vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; + + return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +struct spapr_walk_param { + uint64_t window_size; + uint64_t hugepage_sz; +}; +static int +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct spapr_walk_param *param = arg; + uint64_t max = ms->iova + ms->len; + + if (max > param->window_size) { + param->hugepage_sz = ms->hugepage_sz; + param->window_size = max; + } + + return 0; +} - struct vfio_iommu_spapr_register_memory reg = { - .argsz = sizeof(reg), - .flags = 0 +static int +vfio_spapr_create_new_dma_window(int vfio_container_fd, + struct vfio_iommu_spapr_tce_create *create) { + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), }; struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info), }; - struct vfio_iommu_spapr_tce_create create = { - .argsz = sizeof(create), - }; - struct vfio_iommu_spapr_tce_remove remove = { - .argsz = sizeof(remove), - }; + int ret; /* query spapr iommu info */ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); @@ -737,69 +1317,155 @@ vfio_spapr_dma_map(int vfio_container_fd) return -1; } - /* create DMA window from 0 to max(phys_addr + len) */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - if (ms[i].addr == NULL) - break; - - create.window_size = RTE_MAX(create.window_size, - ms[i].iova + ms[i].len); - } - - /* sPAPR requires window size to be a power of 2 */ - create.window_size = rte_align64pow2(create.window_size); - create.page_shift = __builtin_ctzll(ms->hugepage_sz); - create.levels = 1; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + /* create new DMA window */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create); if (ret) { RTE_LOG(ERR, EAL, " cannot create new DMA 
window, " "error %i (%s)\n", errno, strerror(errno)); return -1; } - if (create.start_addr != 0) { + if (create->start_addr != 0) { RTE_LOG(ERR, EAL, " DMA window start address != 0\n"); return -1; } - /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - struct vfio_iommu_type1_dma_map dma_map; + return 0; +} - if (ms[i].addr == NULL) - break; +static int +vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct spapr_walk_param param; + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int i, ret = 0; - reg.vaddr = (uintptr_t) ms[i].addr; - reg.size = ms[i].len; - ret = ioctl(vfio_container_fd, - VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); - if (ret) { - RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } + vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid container fd!\n"); + return -1; + } - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = ms[i].addr_64; - dma_map.size = ms[i].len; - if (rte_eal_iova_mode() == RTE_IOVA_VA) - dma_map.iova = dma_map.vaddr; - else - dma_map.iova = ms[i].iova; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | - VFIO_DMA_MAP_FLAG_WRITE; + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); - ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + /* check if window size needs to be adjusted */ + memset(¶m, 0, sizeof(param)); - if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; + if (memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, + ¶m) < 0) { + RTE_LOG(ERR, EAL, "Could not get window size\n"); + ret = -1; + goto out; + 
} + + /* also check user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + uint64_t max = user_mem_maps->maps[i].iova + + user_mem_maps->maps[i].len; + create.window_size = RTE_MAX(create.window_size, max); + } + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (do_map) { + void *addr; + /* re-create window and remap the entire memory */ + if (iova > create.window_size) { + if (vfio_spapr_create_new_dma_window(vfio_container_fd, + &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + ret = -1; + goto out; + } + if (memseg_walk_thread_unsafe(vfio_spapr_map_walk, + &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); + ret = -1; + goto out; + } + /* remap all user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = + &user_mem_maps->maps[i]; + if (vfio_spapr_dma_do_map(vfio_container_fd, + map->addr, map->iova, map->len, + 1)) { + RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n"); + ret = -1; + goto out; + } + } + } + + /* now that we've remapped all of the memory that was present + * before, map the segment that we were requested to map. + * + * however, if we were called by the callback, the memory we + * were called with was already in the memseg list, so previous + * mapping should've mapped that segment already. + * + * virt2memseg_list is a relatively cheap check, so use that. if + * memory is within any memseg list, it's a memseg, so it's + * already mapped. 
+ */ + addr = (void *)(uintptr_t)vaddr; + if (rte_mem_virt2memseg_list(addr) == NULL && + vfio_spapr_dma_do_map(vfio_container_fd, + vaddr, iova, len, 1) < 0) { + RTE_LOG(ERR, EAL, "Could not map segment\n"); + ret = -1; + goto out; + } + } else { + /* for unmap, check if iova within DMA window */ + if (iova > create.window_size) { + RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap"); + ret = -1; + goto out; } + vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0); } +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +vfio_spapr_dma_map(int vfio_container_fd) +{ + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct spapr_walk_param param; + + memset(¶m, 0, sizeof(param)); + + /* create DMA window from 0 to max(phys_addr + len) */ + rte_memseg_walk(vfio_spapr_window_size_walk, ¶m); + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + return -1; + } + + /* map all DPDK segments for DMA. 
use 1:1 PA to IOVA mapping */ + if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) + return -1; return 0; } @@ -811,6 +1477,175 @@ vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) return 0; } +static int +vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, + uint64_t __rte_unused vaddr, + uint64_t __rte_unused iova, uint64_t __rte_unused len, + int __rte_unused do_map) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; + + if (!t) { + RTE_LOG(ERR, EAL, " VFIO support not initialized\n"); + rte_errno = ENODEV; + return -1; + } + + if (!t->dma_user_map_func) { + RTE_LOG(ERR, EAL, + " VFIO custom DMA region maping not supported by IOMMU %s\n", + t->name); + rte_errno = ENOTSUP; + return -1; + } + + return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, + len, do_map); +} + +static int +container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *new_map; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "No more space for user mem maps\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + /* map the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { + /* technically, this will fail if there are currently no devices + * plugged in, even if a device were added later, this mapping + * might have succeeded. however, since we cannot verify if this + * is a valid mapping without having a device attached, consider + * this to be unsupported, because we can't just store any old + * mapping and pollute list of active mappings willy-nilly. 
+ */ + RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n"); + ret = -1; + goto out; + } + /* create new user mem map entry */ + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + new_map->addr = vaddr; + new_map->iova = iova; + new_map->len = len; + + compact_user_maps(user_mem_maps); +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *map, *new_map = NULL; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* find our mapping */ + map = find_user_mem_map(user_mem_maps, vaddr, iova, len); + if (!map) { + RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n"); + rte_errno = EINVAL; + ret = -1; + goto out; + } + if (map->addr != vaddr || map->iova != iova || map->len != len) { + /* we're partially unmapping a previously mapped region, so we + * need to split entry into two. + */ + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + } + + /* unmap the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { + /* there may not be any devices plugged in, so unmapping will + * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't + * stop us from removing the mapping, as the assumption is we + * won't be needing this memory any more and thus will want to + * prevent it from being remapped again on hotplug. so, only + * fail if we indeed failed to unmap (e.g. if the mapping was + * within our mapped range but had invalid alignment). 
+ */ + if (rte_errno != ENODEV && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n"); + ret = -1; + goto out; + } else { + RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n"); + } + } + /* remove map from the list of active mappings */ + if (new_map != NULL) { + adjust_map(map, new_map, vaddr, len); + + /* if we've created a new map by splitting, sort everything */ + if (!is_null_map(new_map)) { + compact_user_maps(user_mem_maps); + } else { + /* we've created a new mapping, but it was unused */ + user_mem_maps->n_maps--; + } + } else { + memset(map, 0, sizeof(*map)); + compact_user_maps(user_mem_maps); + user_mem_maps->n_maps--; + } + +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +int __rte_experimental +rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len) +{ + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + return container_dma_map(default_vfio_cfg, vaddr, iova, len); +} + +int __rte_experimental +rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len) +{ + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + return container_dma_unmap(default_vfio_cfg, vaddr, iova, len); +} + int rte_vfio_noiommu_is_enabled(void) { @@ -843,4 +1678,299 @@ rte_vfio_noiommu_is_enabled(void) return c == 'Y'; } -#endif +int __rte_experimental +rte_vfio_container_create(void) +{ + int i; + + /* Find an empty slot to store new vfio config */ + for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == -1) + break; + } + + if (i == VFIO_MAX_CONTAINERS) { + RTE_LOG(ERR, EAL, "exceed max vfio container limit\n"); + return -1; + } + + vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); + if (vfio_cfgs[i].vfio_container_fd < 0) { + RTE_LOG(NOTICE, EAL, "fail to create a new container\n"); + return -1; + } + + return vfio_cfgs[i].vfio_container_fd; +} + +int __rte_experimental +rte_vfio_container_destroy(int container_fd) +{ + struct 
vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num != -1) + rte_vfio_container_group_unbind(container_fd, + vfio_cfg->vfio_groups[i].group_num); + + close(container_fd); + vfio_cfg->vfio_container_fd = -1; + vfio_cfg->vfio_active_groups = 0; + vfio_cfg->vfio_iommu_type = NULL; + + return 0; +} + +int __rte_experimental +rte_vfio_container_group_bind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + struct vfio_group *cur_grp; + int vfio_group_fd; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + /* Check room for new group */ + if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + return -1; + } + + /* Get an index for the new group */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == -1) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); + return -1; + } + + vfio_group_fd = vfio_open_group_fd(iommu_group_num); + if (vfio_group_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); + return -1; + } + cur_grp->group_num = iommu_group_num; + cur_grp->fd = vfio_group_fd; + cur_grp->devices = 0; + vfio_cfg->vfio_active_groups++; + + return vfio_group_fd; +} + +int __rte_experimental +rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + struct vfio_group *cur_grp; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i 
= 0; i < VFIO_MAX_GROUPS; i++) { + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Specified group number not found\n"); + return -1; + } + + if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { + RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for" + " iommu_group_num %d\n", iommu_group_num); + return -1; + } + cur_grp->group_num = -1; + cur_grp->fd = -1; + cur_grp->devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int __rte_experimental +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_map(vfio_cfg, vaddr, iova, len); +} + +int __rte_experimental +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_unmap(vfio_cfg, vaddr, iova, len); +} + +#else + +int __rte_experimental +rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int __rte_experimental +rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_setup_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *vfio_dev_fd, + __rte_unused struct vfio_device_info *device_info) +{ + return -1; +} + +int 
+rte_vfio_release_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, __rte_unused int fd) +{ + return -1; +} + +int +rte_vfio_enable(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_is_enabled(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + return -1; +} + +int +rte_vfio_clear_group(__rte_unused int vfio_group_fd) +{ + return -1; +} + +int __rte_experimental +rte_vfio_get_group_num(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *iommu_group_num) +{ + return -1; +} + +int __rte_experimental +rte_vfio_get_container_fd(void) +{ + return -1; +} + +int __rte_experimental +rte_vfio_get_group_fd(__rte_unused int iommu_group_num) +{ + return -1; +} + +int __rte_experimental +rte_vfio_container_create(void) +{ + return -1; +} + +int __rte_experimental +rte_vfio_container_destroy(__rte_unused int container_fd) +{ + return -1; +} + +int __rte_experimental +rte_vfio_container_group_bind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int __rte_experimental +rte_vfio_container_group_unbind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int __rte_experimental +rte_vfio_container_dma_map(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int __rte_experimental +rte_vfio_container_dma_unmap(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +#endif /* VFIO_PRESENT */ diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h index 80595773..e65b1037 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -19,6 +19,7 @@ #ifdef VFIO_PRESENT +#include <stdint.h> #include <linux/vfio.h> #define 
RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU @@ -26,6 +27,7 @@ #ifndef VFIO_SPAPR_TCE_v2_IOMMU #define RTE_VFIO_SPAPR 7 #define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) #define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) #define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) @@ -79,49 +81,37 @@ struct vfio_iommu_spapr_tce_info { #define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) -#define RTE_VFIO_NOIOMMU 8 -#else -#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU -#endif - #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS - -/* - * Function prototypes for VFIO multiprocess sync functions - */ -int vfio_mp_sync_send_request(int socket, int req); -int vfio_mp_sync_receive_request(int socket); -int vfio_mp_sync_send_fd(int socket, int fd); -int vfio_mp_sync_receive_fd(int socket); -int vfio_mp_sync_connect_to_primary(void); +#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS /* * we don't need to store device fd's anywhere since they can be obtained from * the group fd via an ioctl() call. */ struct vfio_group { - int group_no; + int group_num; int fd; int devices; }; -struct vfio_config { - int vfio_enabled; - int vfio_container_fd; - int vfio_active_groups; - struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; -}; - /* DMA mapping function prototype. * Takes VFIO container fd as a parameter. * Returns 0 on success, -1 on error. * */ typedef int (*vfio_dma_func_t)(int); +/* Custom memory region DMA mapping function prototype. + * Takes VFIO container fd, virtual address, phisical address, length and + * operation type (0 to unmap 1 for map) as a parameters. + * Returns 0 on success, -1 on error. 
+ **/ +typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map); + struct vfio_iommu_type { int type_id; const char *name; + vfio_dma_user_func_t dma_user_map_func; vfio_dma_func_t dma_map_func; }; @@ -133,23 +123,10 @@ vfio_set_iommu_type(int vfio_container_fd); int vfio_has_supported_extensions(int vfio_container_fd); -/* open container fd or get an existing one */ -int -vfio_get_container_fd(void); - -/* parse IOMMU group number for a device - * returns 1 on success, -1 for errors, 0 for non-existent group - */ -int -vfio_get_group_no(const char *sysfs_base, - const char *dev_addr, int *iommu_group_no); - -/* open group fd or get an existing one */ -int -vfio_get_group_fd(int iommu_group_no); - int vfio_mp_sync_setup(void); +#define EAL_VFIO_MP "eal_vfio_mp_sync" + #define SOCKET_REQ_CONTAINER 0x100 #define SOCKET_REQ_GROUP 0x200 #define SOCKET_CLR_GROUP 0x300 @@ -157,6 +134,12 @@ int vfio_mp_sync_setup(void); #define SOCKET_NO_FD 0x1 #define SOCKET_ERR 0xFF +struct vfio_mp_param { + int req; + int result; + int group_num; +}; + #endif /* VFIO_PRESENT */ #endif /* EAL_VFIO_H_ */ diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c index 7cc3c152..9c202bb0 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c @@ -1,32 +1,16 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation */ +#include <unistd.h> #include <string.h> -#include <fcntl.h> -#include <sys/socket.h> -#include <pthread.h> - -/* sys/un.h with __USE_MISC uses strlen, which is unsafe */ -#ifdef __USE_MISC -#define REMOVED_USE_MISC -#undef __USE_MISC -#endif -#include <sys/un.h> -/* make sure we redefine __USE_MISC only if it was previously undefined */ -#ifdef REMOVED_USE_MISC -#define __USE_MISC -#undef REMOVED_USE_MISC -#endif +#include <rte_compat.h> #include 
<rte_log.h> -#include <rte_eal_memconfig.h> -#include <rte_malloc.h> #include <rte_vfio.h> +#include <rte_eal.h> -#include "eal_filesystem.h" #include "eal_vfio.h" -#include "eal_thread.h" /** * @file @@ -37,358 +21,78 @@ #ifdef VFIO_PRESENT -#define SOCKET_PATH_FMT "%s/.%s_mp_socket" -#define CMSGLEN (CMSG_LEN(sizeof(int))) -#define FD_TO_CMSGHDR(fd, chdr) \ - do {\ - (chdr).cmsg_len = CMSGLEN;\ - (chdr).cmsg_level = SOL_SOCKET;\ - (chdr).cmsg_type = SCM_RIGHTS;\ - memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\ - } while (0) -#define CMSGHDR_TO_FD(chdr, fd) \ - memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd)) - -static pthread_t socket_thread; -static int mp_socket_fd; - - -/* get socket path (/var/run if root, $HOME otherwise) */ -static void -get_socket_path(char *buffer, int bufsz) -{ - const char *dir = "/var/run"; - const char *home_dir = getenv("HOME"); - - if (getuid() != 0 && home_dir != NULL) - dir = home_dir; - - /* use current prefix as file path */ - snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir, - internal_config.hugefile_prefix); -} - - - -/* - * data flow for socket comm protocol: - * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP - * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number - * 2. server receives message - * 2a. in case of invalid group, SOCKET_ERR is sent back to client - * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client - * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd - * - * in case of any error, socket is closed. 
- */ - -/* send a request, return -1 on error */ -int -vfio_mp_sync_send_request(int socket, int req) -{ - struct msghdr hdr; - struct iovec iov; - int buf; - int ret; - - memset(&hdr, 0, sizeof(hdr)); - - buf = req; - - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - iov.iov_base = (char *) &buf; - iov.iov_len = sizeof(buf); - - ret = sendmsg(socket, &hdr, 0); - if (ret < 0) - return -1; - return 0; -} - -/* receive a request and return it */ -int -vfio_mp_sync_receive_request(int socket) -{ - int buf; - struct msghdr hdr; - struct iovec iov; - int ret, req; - - memset(&hdr, 0, sizeof(hdr)); - - buf = SOCKET_ERR; - - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - iov.iov_base = (char *) &buf; - iov.iov_len = sizeof(buf); - - ret = recvmsg(socket, &hdr, 0); - if (ret < 0) - return -1; - - req = buf; - - return req; -} - -/* send OK in message, fd in control message */ -int -vfio_mp_sync_send_fd(int socket, int fd) +static int +vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) { - int buf; - struct msghdr hdr; - struct cmsghdr *chdr; - char chdr_buf[CMSGLEN]; - struct iovec iov; + int fd = -1; int ret; + struct rte_mp_msg reply; + struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param; + const struct vfio_mp_param *m = + (const struct vfio_mp_param *)msg->param; - chdr = (struct cmsghdr *) chdr_buf; - memset(chdr, 0, sizeof(chdr_buf)); - memset(&hdr, 0, sizeof(hdr)); - - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - iov.iov_base = (char *) &buf; - iov.iov_len = sizeof(buf); - hdr.msg_control = chdr; - hdr.msg_controllen = CMSGLEN; - - buf = SOCKET_OK; - FD_TO_CMSGHDR(fd, *chdr); - - ret = sendmsg(socket, &hdr, 0); - if (ret < 0) - return -1; - return 0; -} - -/* receive OK in message, fd in control message */ -int -vfio_mp_sync_receive_fd(int socket) -{ - int buf; - struct msghdr hdr; - struct cmsghdr *chdr; - char chdr_buf[CMSGLEN]; - struct iovec iov; - int ret, req, fd; - - buf = SOCKET_ERR; - - chdr = (struct cmsghdr *) chdr_buf; - memset(chdr, 0, 
sizeof(chdr_buf)); - memset(&hdr, 0, sizeof(hdr)); - - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - iov.iov_base = (char *) &buf; - iov.iov_len = sizeof(buf); - hdr.msg_control = chdr; - hdr.msg_controllen = CMSGLEN; - - ret = recvmsg(socket, &hdr, 0); - if (ret < 0) - return -1; - - req = buf; - - if (req != SOCKET_OK) - return -1; - - CMSGHDR_TO_FD(*chdr, fd); - - return fd; -} - -/* connect socket_fd in secondary process to the primary process's socket */ -int -vfio_mp_sync_connect_to_primary(void) -{ - struct sockaddr_un addr; - socklen_t sockaddr_len; - int socket_fd; - - /* set up a socket */ - socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, "Failed to create socket!\n"); + if (msg->len_param != sizeof(*m)) { + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); return -1; } - get_socket_path(addr.sun_path, sizeof(addr.sun_path)); - addr.sun_family = AF_UNIX; - - sockaddr_len = sizeof(struct sockaddr_un); - - if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0) - return socket_fd; - - /* if connect failed */ - close(socket_fd); - return -1; -} - + memset(&reply, 0, sizeof(reply)); - -/* - * socket listening thread for primary process - */ -static __attribute__((noreturn)) void * -vfio_mp_sync_thread(void __rte_unused * arg) -{ - int ret, fd, vfio_data; - - /* wait for requests on the socket */ - for (;;) { - int conn_sock; - struct sockaddr_un addr; - socklen_t sockaddr_len = sizeof(addr); - - /* this is a blocking call */ - conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr, - &sockaddr_len); - - /* just restart on error */ - if (conn_sock == -1) - continue; - - /* set socket to linger after close */ - struct linger l; - l.l_onoff = 1; - l.l_linger = 60; - - if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0) - RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option " - "on listen socket (%s)\n", strerror(errno)); - - ret = vfio_mp_sync_receive_request(conn_sock); - - switch 
(ret) { - case SOCKET_REQ_CONTAINER: - fd = vfio_get_container_fd(); - if (fd < 0) - vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); - else - vfio_mp_sync_send_fd(conn_sock, fd); - if (fd >= 0) - close(fd); - break; - case SOCKET_REQ_GROUP: - /* wait for group number */ - vfio_data = vfio_mp_sync_receive_request(conn_sock); - if (vfio_data < 0) { - close(conn_sock); - continue; - } - - fd = vfio_get_group_fd(vfio_data); - - if (fd < 0) - vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + switch (m->req) { + case SOCKET_REQ_GROUP: + r->req = SOCKET_REQ_GROUP; + r->group_num = m->group_num; + fd = rte_vfio_get_group_fd(m->group_num); + if (fd < 0) + r->result = SOCKET_ERR; + else if (fd == 0) /* if VFIO group exists but isn't bound to VFIO driver */ - else if (fd == 0) - vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD); + r->result = SOCKET_NO_FD; + else { /* if group exists and is bound to VFIO driver */ - else { - vfio_mp_sync_send_request(conn_sock, SOCKET_OK); - vfio_mp_sync_send_fd(conn_sock, fd); - } - break; - case SOCKET_CLR_GROUP: - /* wait for group fd */ - vfio_data = vfio_mp_sync_receive_request(conn_sock); - if (vfio_data < 0) { - close(conn_sock); - continue; - } - - ret = rte_vfio_clear_group(vfio_data); - - if (ret < 0) - vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD); - else - vfio_mp_sync_send_request(conn_sock, SOCKET_OK); - break; - default: - vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); - break; + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; } - close(conn_sock); - } -} - -static int -vfio_mp_sync_socket_setup(void) -{ - int ret, socket_fd; - struct sockaddr_un addr; - socklen_t sockaddr_len; - - /* set up a socket */ - socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, "Failed to create socket!\n"); - return -1; - } - - get_socket_path(addr.sun_path, sizeof(addr.sun_path)); - addr.sun_family = AF_UNIX; - - sockaddr_len = sizeof(struct sockaddr_un); - - 
unlink(addr.sun_path); - - ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len); - if (ret) { - RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno)); - close(socket_fd); - return -1; - } - - ret = listen(socket_fd, 50); - if (ret) { - RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno)); - close(socket_fd); + break; + case SOCKET_CLR_GROUP: + r->req = SOCKET_CLR_GROUP; + r->group_num = m->group_num; + if (rte_vfio_clear_group(m->group_num) < 0) + r->result = SOCKET_NO_FD; + else + r->result = SOCKET_OK; + break; + case SOCKET_REQ_CONTAINER: + r->req = SOCKET_REQ_CONTAINER; + fd = rte_vfio_get_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + default: + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); return -1; } - /* save the socket in local configuration */ - mp_socket_fd = socket_fd; + strcpy(reply.name, EAL_VFIO_MP); + reply.len_param = sizeof(*r); - return 0; + ret = rte_mp_reply(&reply, peer); + if (m->req == SOCKET_REQ_CONTAINER && fd >= 0) + close(fd); + return ret; } -/* - * set up a local socket and tell it to listen for incoming connections - */ int vfio_mp_sync_setup(void) { - int ret; - char thread_name[RTE_MAX_THREAD_NAME_LEN]; - - if (vfio_mp_sync_socket_setup() < 0) { - RTE_LOG(ERR, EAL, "Failed to set up local socket!\n"); - return -1; - } - - ret = pthread_create(&socket_thread, NULL, - vfio_mp_sync_thread, NULL); - if (ret) { - RTE_LOG(ERR, EAL, - "Failed to create thread for communication with secondary processes!\n"); - close(mp_socket_fd); - return -1; - } - - /* Set thread_name for aid in debugging. 
*/ - snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync"); - ret = rte_thread_setname(socket_thread, thread_name); - if (ret) - RTE_LOG(DEBUG, EAL, - "Failed to set thread name for secondary processes!\n"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary); return 0; } diff --git a/lib/librte_eal/linuxapp/eal/meson.build b/lib/librte_eal/linuxapp/eal/meson.build index 03974ff2..cce37712 100644 --- a/lib/librte_eal/linuxapp/eal/meson.build +++ b/lib/librte_eal/linuxapp/eal/meson.build @@ -7,9 +7,11 @@ install_subdir('include/exec-env', install_dir: get_option('includedir')) env_objs = [] env_headers = [] env_sources = files('eal_alarm.c', + 'eal_cpuflags.c', 'eal_debug.c', 'eal_hugepage_info.c', 'eal_interrupts.c', + 'eal_memalloc.c', 'eal_lcore.c', 'eal_log.c', 'eal_thread.c', @@ -18,6 +20,7 @@ env_sources = files('eal_alarm.c', 'eal_vfio_mp_sync.c', 'eal.c', 'eal_memory.c', + 'eal_dev.c', ) if has_libnuma == 1 |