diff options
author | C.J. Collier <cjcollier@linuxfoundation.org> | 2016-06-14 07:50:17 -0700 |
---|---|---|
committer | C.J. Collier <cjcollier@linuxfoundation.org> | 2016-06-14 12:17:54 -0700 |
commit | 97f17497d162afdb82c8704bf097f0fee3724b2e (patch) | |
tree | 1c6269614c0c15ffef8451c58ae8f8b30a1bc804 /lib/librte_eal/linuxapp/eal | |
parent | e04be89c2409570e0055b2cda60bd11395bb93b0 (diff) |
Imported Upstream version 16.04
Change-Id: I77eadcd8538a9122e4773cbe55b24033dc451757
Signed-off-by: C.J. Collier <cjcollier@linuxfoundation.org>
Diffstat (limited to 'lib/librte_eal/linuxapp/eal')
23 files changed, 10312 insertions, 0 deletions
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile new file mode 100644 index 00000000..e1093619 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -0,0 +1,139 @@ +# BSD LICENSE +# +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) +EXPORT_MAP := rte_eal_version.map +VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) + +LIBABIVER := 2 + +VPATH += $(RTE_SDK)/lib/librte_eal/common + +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include +CFLAGS += -I$(RTE_SDK)/lib/librte_ring +CFLAGS += -I$(RTE_SDK)/lib/librte_mempool +CFLAGS += -I$(RTE_SDK)/lib/librte_ivshmem +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -ldl +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrt + +# specific to linuxapp exec-env +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c +ifeq ($(CONFIG_RTE_LIBRTE_XEN_DOM0),y) +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_xen_memory.c +endif +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_uio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_vfio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_vfio_mp_sync.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c +ifeq ($(CONFIG_RTE_LIBRTE_IVSHMEM),y) +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_ivshmem.c +endif + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_pci.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_pci_uio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +CFLAGS_eal.o := -D_GNU_SOURCE +CFLAGS_eal_interrupts.o := -D_GNU_SOURCE +CFLAGS_eal_pci_vfio_mp_sync.o := -D_GNU_SOURCE +CFLAGS_eal_timer.o := -D_GNU_SOURCE +CFLAGS_eal_lcore.o := -D_GNU_SOURCE +CFLAGS_eal_thread.o := -D_GNU_SOURCE +CFLAGS_eal_log.o := -D_GNU_SOURCE +CFLAGS_eal_common_log.o := -D_GNU_SOURCE +CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE +CFLAGS_eal_pci.o := -D_GNU_SOURCE +CFLAGS_eal_pci_uio.o := -D_GNU_SOURCE +CFLAGS_eal_pci_vfio.o := -D_GNU_SOURCE +CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE +CFLAGS_eal_common_options.o := -D_GNU_SOURCE +CFLAGS_eal_common_thread.o := -D_GNU_SOURCE +CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +endif + +INC := rte_interrupts.h rte_kni_common.h rte_dom0_common.h + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \ + $(addprefix include/exec-env/,$(INC)) + +DEPDIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += lib/librte_eal/common +DEPDIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += lib/librte_eal/common/arch/$(ARCH_DIR) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c new file mode 100644 index 00000000..8aafd519 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal.c @@ -0,0 +1,936 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * Copyright(c) 2012-2014 6WIND S.A. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stdarg.h> +#include <unistd.h> +#include <pthread.h> +#include <syslog.h> +#include <getopt.h> +#include <sys/file.h> +#include <fcntl.h> +#include <stddef.h> +#include <errno.h> +#include <limits.h> +#include <errno.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <sys/stat.h> +#if defined(RTE_ARCH_X86) +#include <sys/io.h> +#endif + +#include <rte_common.h> +#include <rte_debug.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_log.h> +#include <rte_random.h> +#include <rte_cycles.h> +#include <rte_string_fns.h> +#include <rte_cpuflags.h> +#include <rte_interrupts.h> +#include <rte_pci.h> +#include <rte_devargs.h> +#include <rte_common.h> +#include <rte_version.h> +#include <rte_atomic.h> +#include <malloc_heap.h> + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10) + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; + +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memseg), + .l_len = sizeof(early_mem_config.memseg), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. memzones, to detect if there are running secondary + * processes. */ +static void +rte_eal_config_create(void) +{ + void *rte_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + /* map the config before hugepage address so that we don't waste a page */ + if (internal_config.base_virtaddr != 0) + rte_mem_cfg_addr = (void *) + RTE_ALIGN_FLOOR(internal_config.base_virtaddr - + sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE)); + else + rte_mem_cfg_addr = NULL; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config)); + if (retval < 0){ + close(mem_cfg_fd); + rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname); + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary " + "process running?\n", pathname); + } + + rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config), + PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0); + + if (rte_mem_cfg_addr == MAP_FAILED){ + rte_panic("Cannot mmap memory for rte_config\n"); + } + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = (struct rte_mem_config *) rte_mem_cfg_addr; + + /* store address of the config in the config itself so that secondary + * processes could later map the config into this exact location */ + rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + +} + +/* attach to an existing shared memory config */ +static void +rte_eal_config_attach(void) +{ + struct rte_mem_config *mem_config; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + /* map it as read-only first */ + mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config), + PROT_READ, MAP_SHARED, mem_cfg_fd, 0); + if (mem_config == MAP_FAILED) + rte_panic("Cannot mmap memory for rte_config\n"); + + rte_config.mem_config = mem_config; +} + +/* reattach the shared config at exact memory location primary process has it */ +static void +rte_eal_config_reattach(void) +{ + struct rte_mem_config *mem_config; + void *rte_mem_cfg_addr; + + if (internal_config.no_shconf) + return; + + /* save the address primary process has mapped shared config to */ + rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr; + + /* unmap original config */ + munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); + + /* remap the config at proper address */ + mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, + sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, + mem_cfg_fd, 0); + close(mem_cfg_fd); + if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) + rte_panic("Cannot mmap memory for rte_config\n"); + + rte_config.mem_config = mem_config; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if we can open the file but not get a write-lock we are a secondary + * process. NOTE: if we get a file handle back, we keep that open + * and don't close it to prevent a race condition between multiple opens */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static void +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + rte_eal_config_create(); + break; + case RTE_PROC_SECONDARY: + rte_eal_config_attach(); + rte_eal_mcfg_wait_complete(rte_config.mem_config); + rte_eal_config_reattach(); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + rte_panic("Invalid process type\n"); + } +} + +/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */ +static void +eal_hugedirs_unlock(void) +{ + int i; + + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) + { + /* skip uninitialized */ + if (internal_config.hugepage_info[i].lock_descriptor < 0) + continue; + /* unlock hugepage file */ + flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN); + close(internal_config.hugepage_info[i].lock_descriptor); + /* reset the field */ + internal_config.hugepage_info[i].lock_descriptor = -1; + } +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + printf("EAL Linux options:\n" + " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" + " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" + " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" + " --"OPT_BASE_VIRTADDR" Base virtual address\n" + " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" + " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" + " --"OPT_XEN_DOM0" Support running on Xen dom0 without hugetlbfs\n" + "\n"); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. */ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static int +eal_parse_socket_mem(char *socket_mem) +{ + char * arg[RTE_MAX_NUMA_NODES]; + char *end; + int arg_num, i, len; + uint64_t total_mem = 0; + + len = strnlen(socket_mem, SOCKET_MEM_STRLEN); + if (len == SOCKET_MEM_STRLEN) { + RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); + return -1; + } + + /* all other error cases will be caught later */ + if (!isdigit(socket_mem[len-1])) + return -1; + + /* split the optarg into separate socket values */ + arg_num = rte_strsplit(socket_mem, len, + arg, RTE_MAX_NUMA_NODES, ','); + + /* if split failed, or 0 arguments */ + if (arg_num <= 0) + return -1; + + internal_config.force_sockets = 1; + + /* parse each defined socket option */ + errno = 0; + for (i = 0; i < arg_num; i++) { + end = NULL; + internal_config.socket_mem[i] = strtoull(arg[i], &end, 10); + + /* check for invalid input */ + if ((errno != 0) || + (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + internal_config.socket_mem[i] *= 1024ULL; + internal_config.socket_mem[i] *= 1024ULL; + total_mem += internal_config.socket_mem[i]; + } + + /* check if we have a positive amount of total memory */ + if (total_mem == 0) + return -1; + + return 0; +} + +static int +eal_parse_base_virtaddr(const char *arg) +{ + char *end; + uint64_t addr; + + errno = 0; + addr = strtoull(arg, &end, 16); + + /* check for errors */ + if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0')) + return -1; + + /* make sure we don't exceed 32-bit boundary on 32-bit target */ +#ifndef RTE_ARCH_64 + if (addr >= UINTPTR_MAX) + return -1; +#endif + + /* align the addr on 16M boundary, 16MB is the minimum huge page + * size on IBM Power architecture. If the addr is aligned to 16MB, + * it can align to 2MB for x86. So this alignment can also be used + * on x86 */ + internal_config.base_virtaddr = + RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M); + + return 0; +} + +static int +eal_parse_vfio_intr(const char *mode) +{ + unsigned i; + static struct { + const char *name; + enum rte_intr_mode value; + } map[] = { + { "legacy", RTE_INTR_MODE_LEGACY }, + { "msi", RTE_INTR_MODE_MSI }, + { "msix", RTE_INTR_MODE_MSIX }, + }; + + for (i = 0; i < RTE_DIM(map); i++) { + if (!strcmp(mode, map[i].name)) { + internal_config.vfio_intr_mode = map[i].value; + return 0; + } + } + return -1; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (hpi->hugedir != NULL) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + eal_reset_internal_config(&internal_config); + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? + eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* getopt is not happy, stop right now */ + if (opt == '?') { + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + + /* long options */ + case OPT_XEN_DOM0_NUM: +#ifdef RTE_LIBRTE_XEN_DOM0 + internal_config.xen_dom0_support = 1; +#else + RTE_LOG(ERR, EAL, "Can't support DPDK app " + "running on Dom0, please configure" + " RTE_LIBRTE_XEN_DOM0=y\n"); + ret = -1; + goto out; +#endif + break; + + case OPT_HUGE_DIR_NUM: + internal_config.hugepage_dir = optarg; + break; + + case OPT_FILE_PREFIX_NUM: + internal_config.hugefile_prefix = optarg; + break; + + case OPT_SOCKET_MEM_NUM: + if (eal_parse_socket_mem(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_MEM "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_BASE_VIRTADDR_NUM: + if (eal_parse_base_virtaddr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_BASE_VIRTADDR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_VFIO_INTR_NUM: + if (eal_parse_vfio_intr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_VFIO_INTR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_CREATE_UIO_DEV_NUM: + internal_config.create_uio_dev = 1; + break; + + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on Linux\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on Linux\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on Linux\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + /* --xen-dom0 doesn't make sense with --socket-mem */ + if (internal_config.xen_dom0_support && internal_config.force_sockets == 1) { + RTE_LOG(ERR, EAL, "Options --"OPT_SOCKET_MEM" cannot be specified " + "together with --"OPT_XEN_DOM0"\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; + + return ret; +} + +static void +eal_check_mem_on_local_socket(void) +{ + const struct rte_memseg *ms; + int i, socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + ms = rte_eal_get_physmem_layout(); + + for (i = 0; i < RTE_MAX_MEMSEG; i++) + if (ms[i].socket_id == socket_id && + ms[i].len > 0) + return; + + RTE_LOG(WARNING, EAL, "WARNING: Master core has no " + "memory on local socket!\n"); +} + +static int +sync_func(__attribute__((unused)) void *arg) +{ + return 0; +} + +inline static void +rte_eal_mcfg_complete(void) +{ + /* ALL shared mem_config related INIT DONE */ + if (rte_config.process_type == RTE_PROC_PRIMARY) + rte_config.mem_config->magic = RTE_MAGIC; +} + +/* + * Request iopl privilege for all RPL, returns 0 on success + * iopl() call is mostly for the i386 architecture. For other architectures, + * return -1 to indicate IO privilege can't be changed in this way. + */ +int +rte_eal_iopl_init(void) +{ +#if defined(RTE_ARCH_X86) + if (iopl(3) != 0) + return -1; + return 0; +#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) + return 0; /* iopl syscall not supported for ARM/ARM64 */ +#else + return -1; +#endif +} + +/* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + const char *logid; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + if (!rte_atomic32_test_and_set(&run_once)) + return -1; + + logid = strrchr(argv[0], '/'); + logid = strdup(logid ? logid + 1: argv[0]); + + thread_id = pthread_self(); + + if (rte_eal_log_early_init() < 0) + rte_panic("Cannot init early logs\n"); + + eal_log_level_parse(argc, argv); + + /* set log level as early as possible */ + rte_set_log_level(internal_config.log_level); + + if (rte_eal_cpu_init() < 0) + rte_panic("Cannot detect lcores\n"); + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) + exit(1); + + if (internal_config.no_hugetlbfs == 0 && + internal_config.process_type != RTE_PROC_SECONDARY && + internal_config.xen_dom0_support == 0 && + eal_hugepage_info_init() < 0) + rte_panic("Cannot get hugepage information\n"); + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + else + internal_config.memory = eal_get_hugepage_mem_size(); + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + rte_srand(rte_rdtsc()); + + rte_config_init(); + + if (rte_eal_pci_init() < 0) + rte_panic("Cannot init PCI\n"); + +#ifdef RTE_LIBRTE_IVSHMEM + if (rte_eal_ivshmem_init() < 0) + rte_panic("Cannot init IVSHMEM\n"); +#endif + + if (rte_eal_memory_init() < 0) + rte_panic("Cannot init memory\n"); + + /* the directories are locked during eal_hugepage_info_init */ + eal_hugedirs_unlock(); + + if (rte_eal_memzone_init() < 0) + rte_panic("Cannot init memzone\n"); + + if (rte_eal_tailqs_init() < 0) + rte_panic("Cannot init tail queues for objects\n"); + +#ifdef RTE_LIBRTE_IVSHMEM + if (rte_eal_ivshmem_obj_init() < 0) + rte_panic("Cannot init IVSHMEM objects\n"); +#endif + + if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) + rte_panic("Cannot init logs\n"); + + if (rte_eal_alarm_init() < 0) + rte_panic("Cannot init interrupt-handling thread\n"); + + if (rte_eal_timer_init() < 0) + rte_panic("Cannot init HPET or TSC timers\n"); + + eal_check_mem_on_local_socket(); + + if (eal_plugins_init() < 0) + rte_panic("Cannot init plugins\n"); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n", + rte_config.master_lcore, (int)thread_id, cpuset, + ret == 0 ? "" : "..."); + + if (rte_eal_dev_init() < 0) + rte_panic("Cannot init pmd devices\n"); + + if (rte_eal_intr_init() < 0) + rte_panic("Cannot init interrupt-handling thread\n"); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, + "lcore-slave-%d", i); + ret = rte_thread_setname(lcore_config[i].thread_id, + thread_name); + if (ret != 0) + RTE_LOG(ERR, EAL, + "Cannot set name for lcore thread\n"); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* Probe & Initialize PCI devices */ + if (rte_eal_pci_probe()) + rte_panic("Cannot probe PCI\n"); + + rte_eal_mcfg_complete(); + + return fctret; +} + +/* get core role */ +enum rte_lcore_role_t +rte_eal_lcore_role(unsigned lcore_id) +{ + return rte_config.lcore_role[lcore_id]; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_hugepages(void) +{ + return ! internal_config.no_hugetlbfs; +} + +int +rte_eal_check_module(const char *module_name) +{ + char sysfs_mod_name[PATH_MAX]; + struct stat st; + int n; + + if (NULL == module_name) + return -1; + + /* Check if there is sysfs mounted */ + if (stat("/sys/module", &st) != 0) { + RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + /* A module might be built-in, therefore try sysfs */ + n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name); + if (n < 0 || n > PATH_MAX) { + RTE_LOG(DEBUG, EAL, "Could not format module path\n"); + return -1; + } + + if (stat(sysfs_mod_name, &st) != 0) { + RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n", + sysfs_mod_name, errno, strerror(errno)); + return 0; + } + + /* Module has been found */ + return 1; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c b/lib/librte_eal/linuxapp/eal/eal_alarm.c new file mode 100644 index 00000000..8b042abc --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_alarm.c @@ -0,0 +1,273 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdio.h> +#include <stdint.h> +#include <signal.h> +#include <errno.h> +#include <string.h> +#include <sys/queue.h> +#include <sys/time.h> +#include <sys/timerfd.h> + +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_interrupts.h> +#include <rte_alarm.h> +#include <rte_common.h> +#include <rte_per_lcore.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_lcore.h> +#include <rte_errno.h> +#include <rte_malloc.h> +#include <rte_spinlock.h> +#include <eal_private.h> + +#ifndef TFD_NONBLOCK +#include <fcntl.h> +#define TFD_NONBLOCK O_NONBLOCK +#endif + +#define NS_PER_US 1000 +#define US_PER_MS 1000 +#define MS_PER_S 1000 +#define US_PER_S (US_PER_MS * MS_PER_S) + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct timeval time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static int handler_registered = 0; +static void eal_alarm_callback(struct rte_intr_handle *hdl, void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + /* create a timerfd file descriptor */ + intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (intr_handle.fd == -1) + goto error; + + return 0; + +error: + rte_errno = errno; + return -1; +} + +static void +eal_alarm_callback(struct rte_intr_handle *hdl __rte_unused, + void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + while ((ap = LIST_FIRST(&alarm_list)) !=NULL && + clock_gettime(CLOCK_TYPE_ID, &now) == 0 && + (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec && + (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + rte_free(ap); + } + + if (!LIST_EMPTY(&alarm_list)) { + struct itimerspec atime = { .it_interval = { 0, 0 } }; + + ap = LIST_FIRST(&alarm_list); + atime.it_value.tv_sec = ap->time.tv_sec; + atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US; + /* perform borrow for subtraction if necessary */ + if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US)) + atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US; + + atime.it_value.tv_sec -= now.tv_sec; + atime.it_value.tv_nsec -= now.tv_nsec; + timerfd_settime(intr_handle.fd, 0, &atime, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); +} + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct timespec now; + int ret = 0; + struct alarm_entry *ap, *new_alarm; + + /* Check parameters, including that us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = rte_zmalloc(NULL, sizeof(*new_alarm), 0); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S; + new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + if (!handler_registered) { + ret |= rte_intr_callback_register(&intr_handle, + eal_alarm_callback, NULL); + handler_registered = (ret == 0) ? 1 : 0; + } + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (ap->time.tv_sec > new_alarm->time.tv_sec || + (ap->time.tv_sec == new_alarm->time.tv_sec && + ap->time.tv_usec > new_alarm->time.tv_usec)){ + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + if (LIST_FIRST(&alarm_list) == new_alarm) { + struct itimerspec alarm_time = { + .it_interval = {0, 0}, + .it_value = { + .tv_sec = us / US_PER_S, + .tv_nsec = (us % US_PER_S) * NS_PER_US, + }, + }; + ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); + + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while ((ap = LIST_FIRST(&alarm_list)) != NULL && + cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + rte_free(ap); + count++; + } else { + /* If calling from other context, mark that alarm is executing + * so loop can spin till it finish. Otherwise we are trying to + * cancel our self - mark it by EINPROGRESS */ + if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + rte_free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + return count; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c new file mode 100644 index 00000000..907fbfa7 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_debug.c @@ -0,0 +1,119 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <execinfo.h> +#include <stdarg.h> +#include <signal.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> + +#include <rte_log.h> +#include <rte_debug.h> +#include <rte_common.h> + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + /* disable history */ + rte_log_set_history(0); + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) +{ + va_list ap; + + /* disable history */ + rte_log_set_history(0); + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c new file mode 100644 index 00000000..18858e2d --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -0,0 +1,365 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <sys/types.h> +#include <sys/file.h> +#include <dirent.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <fnmatch.h> +#include <inttypes.h> +#include <stdarg.h> +#include <unistd.h> +#include <errno.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_debug.h> +#include <rte_log.h> +#include <rte_common.h> +#include "rte_string_fns.h" +#include "eal_internal_cfg.h" +#include "eal_hugepages.h" +#include "eal_filesystem.h" + +static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; + +/* this function is only called from eal_hugepage_info_init which itself + * is only called from a primary process */ +static uint32_t +get_num_hugepages(const char *subdir) +{ + char path[PATH_MAX]; + long unsigned resv_pages, num_pages = 0; + const char *nr_hp_file = "free_hugepages"; + const char *nr_rsvd_file = "resv_hugepages"; + + /* first, check how many reserved pages kernel reports */ + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, nr_rsvd_file); + if (eal_parse_sysfs_value(path, &resv_pages) < 0) + return 0; + + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* adjust num_pages */ + if (num_pages >= resv_pages) + num_pages -= resv_pages; + else if (resv_pages) + num_pages = 0; + + /* we want to return a uint32_t and more than this looks suspicious + * anyway ... */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint64_t +get_default_hp_size(void) +{ + const char proc_meminfo[] = "/proc/meminfo"; + const char str_hugepagesz[] = "Hugepagesize:"; + unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1; + char buffer[256]; + unsigned long long size = 0; + + FILE *fd = fopen(proc_meminfo, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_meminfo); + while(fgets(buffer, sizeof(buffer), fd)){ + if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){ + size = rte_str_to_size(&buffer[hugepagesz_len]); + break; + } + } + fclose(fd); + if (size == 0) + rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo); + return size; +} + +static const char * +get_hugepage_dir(uint64_t hugepage_sz) +{ + enum proc_mount_fieldnames { + DEVICE = 0, + MOUNTPT, + FSTYPE, + OPTIONS, + _FIELDNAME_MAX + }; + static uint64_t default_size = 0; + const char proc_mounts[] = "/proc/mounts"; + const char hugetlbfs_str[] = "hugetlbfs"; + const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1; + const char pagesize_opt[] = "pagesize="; + const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1; + const char split_tok = ' '; + char *splitstr[_FIELDNAME_MAX]; + char buf[BUFSIZ]; + char *retval = NULL; + + FILE *fd = fopen(proc_mounts, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_mounts); + + if (default_size == 0) + default_size = get_default_hp_size(); + + while (fgets(buf, sizeof(buf), fd)){ + if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, + split_tok) != _FIELDNAME_MAX) { + RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); + break; /* return NULL */ + } + + /* we have a specified --huge-dir option, only examine that dir */ + if (internal_config.hugepage_dir != NULL && + strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0) + continue; + + if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ + const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt); + + /* if no explicit page size, the default page size is compared */ + if (pagesz_str == NULL){ + if (hugepage_sz == default_size){ + retval = strdup(splitstr[MOUNTPT]); + break; + } + } + /* there is an explicit page size, so check it */ + else { + uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); + if (pagesz == hugepage_sz) { + retval = strdup(splitstr[MOUNTPT]); + break; + } + } + } /* end if strncmp hugetlbfs */ + } /* end while fgets */ + + fclose(fd); + return retval; +} + +/* + * Clear the hugepage directory of whatever hugepage files + * there are. Checks if the file is locked (i.e. + * if it's in use by another DPDK process). + */ +static int +clear_hugedir(const char * hugedir) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + const char filter[] = "*map_*"; /* matches hugepage files */ + + /* open directory */ + dir = opendir(hugedir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n", + hugedir); + goto error; + } + dir_fd = dirfd(dir); + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n", + hugedir); + goto error; + } + + while(dirent != NULL){ + /* skip files that don't match the hugepage pattern */ + if (fnmatch(filter, dirent->d_name, 0) > 0) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, unlock and remove the file */ + if (lck_result != -1) { + flock(fd, LOCK_UN); + unlinkat(dir_fd, dirent->d_name, 0); + } + close (fd); + dirent = readdir(dir); + } + + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n", + strerror(errno)); + + return -1; +} + +static int +compare_hpi(const void *a, const void *b) +{ + const struct hugepage_info *hpi_a = a; + const struct hugepage_info *hpi_b = b; + + return hpi_b->hugepage_sz - hpi_a->hugepage_sz; +} + +/* + * when we initialize the hugepage info, everything goes + * to socket 0 by default. it will later get sorted by memory + * initialization procedure. + */ +int +eal_hugepage_info_init(void) +{ + const char dirent_start_text[] = "hugepages-"; + const size_t dirent_start_len = sizeof(dirent_start_text) - 1; + unsigned i, num_sizes = 0; + DIR *dir; + struct dirent *dirent; + + dir = opendir(sys_dir_path); + if (dir == NULL) + rte_panic("Cannot open directory %s to read system hugepage " + "info\n", sys_dir_path); + + for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) { + struct hugepage_info *hpi; + + if (strncmp(dirent->d_name, dirent_start_text, + dirent_start_len) != 0) + continue; + + if (num_sizes >= MAX_HUGEPAGE_SIZES) + break; + + hpi = &internal_config.hugepage_info[num_sizes]; + hpi->hugepage_sz = + rte_str_to_size(&dirent->d_name[dirent_start_len]); + hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz); + + /* first, check if we have a mountpoint */ + if (hpi->hugedir == NULL) { + uint32_t num_pages; + + num_pages = get_num_hugepages(dirent->d_name); + if (num_pages > 0) + RTE_LOG(NOTICE, EAL, + "%" PRIu32 " hugepages of size " + "%" PRIu64 " reserved, but no mounted " + "hugetlbfs found for that size\n", + num_pages, hpi->hugepage_sz); + continue; + } + + /* try to obtain a writelock */ + hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY); + + /* if blocking lock failed */ + if (flock(hpi->lock_descriptor, LOCK_EX) == -1) { + RTE_LOG(CRIT, EAL, + "Failed to lock hugepage directory!\n"); + break; + } + /* clear out the hugepages dir from unused pages */ + if (clear_hugedir(hpi->hugedir) == -1) + break; + + /* for now, put all pages into socket 0, + * later they will be sorted */ + hpi->num_pages[0] = get_num_hugepages(dirent->d_name); + +#ifndef RTE_ARCH_64 + /* for 32-bit systems, limit number of hugepages to + * 1GB per page size */ + hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], + RTE_PGSIZE_1G / hpi->hugepage_sz); +#endif + + num_sizes++; + } + closedir(dir); + + /* something went wrong, and we broke from the for loop above */ + if (dirent != NULL) + return -1; + + internal_config.num_hugepage_sizes = num_sizes; + + /* sort the page directory entries by size, largest to smallest */ + qsort(&internal_config.hugepage_info[0], num_sizes, + sizeof(internal_config.hugepage_info[0]), compare_hpi); + + /* now we have all info, check we have at least one valid size */ + for (i = 0; i < num_sizes; i++) + if (internal_config.hugepage_info[i].hugedir != NULL && + internal_config.hugepage_info[i].num_pages[0] > 0) + return 0; + + /* no valid hugepage mounts available, return error */ + return -1; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c new file mode 100644 index 00000000..06b26a9e --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -0,0 +1,1224 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <pthread.h> +#include <sys/queue.h> +#include <stdarg.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <inttypes.h> +#include <sys/epoll.h> +#include <sys/signalfd.h> +#include <sys/ioctl.h> +#include <sys/eventfd.h> +#include <assert.h> + +#include <rte_common.h> +#include <rte_interrupts.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_ring.h> +#include <rte_debug.h> +#include <rte_log.h> +#include <rte_mempool.h> +#include <rte_pci.h> +#include <rte_malloc.h> +#include <rte_errno.h> +#include <rte_spinlock.h> + +#include "eal_private.h" +#include "eal_vfio.h" +#include "eal_thread.h" + +#define EAL_INTR_EPOLL_WAIT_FOREVER (-1) +#define NB_OTHER_INTR 1 + +static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ + +/** + * union for pipe fds. + */ +union intr_pipefds{ + struct { + int pipefd[2]; + }; + struct { + int readfd; + int writefd; + }; +}; + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + int uio_intr_count; /* for uio device */ +#ifdef VFIO_PRESENT + uint64_t vfio_intr_count; /* for vfio device */ +#endif + uint64_t timerfd_num; /* for timerfd */ + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* union buffer for pipe read/write */ +static union intr_pipefds intr_pipe; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +/* VFIO interrupts */ +#ifdef VFIO_PRESENT + +#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int)) +/* irq set buffer length for queue interrupts and LSC interrupt */ +#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ + sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1)) + +/* enable legacy (INTx) interrupts */ +static int +vfio_enable_intx(struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + /* enable INTx */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* unmask INTx after enabling */ + memset(irq_set, 0, len); + len = sizeof(struct vfio_irq_set); + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable legacy (INTx) interrupts */ +static int +vfio_disable_intx(struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + /* mask interrupts before disabling */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* disable INTx*/ + memset(irq_set, 0, len); + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, + "Error disabling INTx interrupts for fd %d\n", intr_handle->fd); + return -1; + } + return 0; +} + +/* enable MSI interrupts */ +static int +vfio_enable_msi(struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable MSI interrupts */ +static int +vfio_disable_msi(struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +/* enable MSI-X interrupts */ +static int +vfio_enable_msix(struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + if (!intr_handle->max_intr) + intr_handle->max_intr = 1; + else if (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID) + intr_handle->max_intr = RTE_MAX_RXTX_INTR_VEC_ID + 1; + + irq_set->count = intr_handle->max_intr; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + /* INTR vector offset 0 reserve for non-efds mapping */ + fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; + memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, + sizeof(*intr_handle->efds) * intr_handle->nb_efd); + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable MSI-X interrupts */ +static int +vfio_disable_msix(struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd); + + return ret; +} +#endif + +static int +uio_intx_intr_disable(struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* disable interrupts */ + command_high |= 0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intx_intr_enable(struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* enable interrupts */ + command_high &= ~0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intr_disable(struct rte_intr_handle *intr_handle) +{ + const int value = 0; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +static int +uio_intr_enable(struct rte_intr_handle *intr_handle) +{ + const int value = 1; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +int +rte_intr_callback_register(struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + int ret, wake_thread; + struct rte_intr_source *src; + struct rte_intr_callback *callback; + + wake_thread = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + + /* allocate a new interrupt callback entity */ + callback = rte_zmalloc("interrupt callback list", + sizeof(*callback), 0); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + return -ENOMEM; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + + rte_spinlock_lock(&intr_lock); + + /* check if there is at least one callback registered for the fd */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) { + /* we had no interrupts for this */ + if TAILQ_EMPTY(&src->callbacks) + wake_thread = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + ret = 0; + break; + } + } + + /* no existing callbacks for this - add new source */ + if (src == NULL) { + if ((src = rte_zmalloc("interrupt source list", + sizeof(*src), 0)) == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + rte_free(callback); + ret = -ENOMEM; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + wake_thread = 1; + ret = 0; + } + } + + rte_spinlock_unlock(&intr_lock); + + /** + * check if need to notify the pipe fd waited by epoll_wait to + * rebuild the wait list. + */ + if (wake_thread) + if (write(intr_pipe.writefd, "1", 1) < 0) + return -EPIPE; + + return ret; +} + +int +rte_intr_callback_unregister(struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + ret = 0; + + /*walk through the callbacks and remove all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + rte_free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + rte_free(src); + } + } + + rte_spinlock_unlock(&intr_lock); + + /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ + if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { + ret = -EPIPE; + } + + return ret; +} + +int +rte_intr_enable(struct rte_intr_handle *intr_handle) +{ + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to enable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_enable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_enable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_enable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_enable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_enable_intx(intr_handle)) + return -1; + break; +#endif + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_disable(struct rte_intr_handle *intr_handle) +{ + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to disable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_disable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_disable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_disable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_disable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_disable_intx(intr_handle)) + return -1; + break; +#endif + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +static int +eal_intr_process_interrupts(struct epoll_event *events, int nfds) +{ + int n, bytes_read; + struct rte_intr_source *src; + struct rte_intr_callback *cb; + union rte_intr_read_buffer buf; + struct rte_intr_callback active_cb; + + for (n = 0; n < nfds; n++) { + + /** + * if the pipe fd is ready to read, return out to + * rebuild the wait list. + */ + if (events[n].data.fd == intr_pipe.readfd){ + int r = read(intr_pipe.readfd, buf.charbuf, + sizeof(buf.charbuf)); + RTE_SET_USED(r); + return -1; + } + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == + events[n].data.fd) + break; + if (src == NULL){ + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. */ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; + case RTE_INTR_HANDLE_ALARM: + bytes_read = sizeof(buf.timerfd_num); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + case RTE_INTR_HANDLE_EXT: + default: + bytes_read = 1; + break; + } + + if (src->intr_handle.type != RTE_INTR_HANDLE_EXT) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(events[n].data.fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + events[n].data.fd, + strerror(errno)); + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", events[n].data.fd); + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (bytes_read > 0) { + + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(&src->intr_handle, + active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + + /* we done with that interrupt source, release it. */ + src->active = 0; + rte_spinlock_unlock(&intr_lock); + } + + return 0; +} + +/** + * It handles all the interrupts. + * + * @param pfd + * epoll file descriptor. + * @param totalfds + * The number of file descriptors added in epoll. + * + * @return + * void + */ +static void +eal_intr_handle_interrupts(int pfd, unsigned totalfds) +{ + struct epoll_event events[totalfds]; + int nfds = 0; + + for(;;) { + nfds = epoll_wait(pfd, events, totalfds, + EAL_INTR_EPOLL_WAIT_FOREVER); + /* epoll_wait fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "epoll_wait returns with fail\n"); + return; + } + /* epoll_wait timeout, will never happens here */ + else if (nfds == 0) + continue; + /* epoll_wait has at least one fd ready to read */ + if (eal_intr_process_interrupts(events, nfds) < 0) + return; + } +} + +/** + * It builds/rebuilds up the epoll file descriptor with all the + * file descriptors being waited on. Then handles the interrupts. + * + * @param arg + * pointer. (unused) + * + * @return + * never return; + */ +static __attribute__((noreturn)) void * +eal_intr_thread_main(__rte_unused void *arg) +{ + struct epoll_event ev; + + /* host thread, never break out */ + for (;;) { + /* build up the epoll fd with all descriptors we are to + * wait on then pass it to the handle_interrupts function + */ + static struct epoll_event pipe_event = { + .events = EPOLLIN | EPOLLPRI, + }; + struct rte_intr_source *src; + unsigned numfds = 0; + + /* create epoll fd */ + int pfd = epoll_create(1); + if (pfd < 0) + rte_panic("Cannot create epoll instance\n"); + + pipe_event.data.fd = intr_pipe.readfd; + /** + * add pipe fd into wait list, this pipe is used to + * rebuild the wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, + &pipe_event) < 0) { + rte_panic("Error adding fd to %d epoll_ctl, %s\n", + intr_pipe.readfd, strerror(errno)); + } + numfds++; + + rte_spinlock_lock(&intr_lock); + + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->callbacks.tqh_first == NULL) + continue; /* skip those with no callbacks */ + ev.events = EPOLLIN | EPOLLPRI; + ev.data.fd = src->intr_handle.fd; + + /** + * add all the uio device file descriptor + * into wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, + src->intr_handle.fd, &ev) < 0){ + rte_panic("Error adding fd %d epoll_ctl, %s\n", + src->intr_handle.fd, strerror(errno)); + } + else + numfds++; + } + rte_spinlock_unlock(&intr_lock); + /* serve the interrupt */ + eal_intr_handle_interrupts(pfd, numfds); + + /** + * when we return, we need to rebuild the + * list of fds to monitor. + */ + close(pfd); + } +} + +int +rte_eal_intr_init(void) +{ + int ret = 0, ret_1 = 0; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + /** + * create a pipe which will be waited by epoll and notified to + * rebuild the wait list of epoll. + */ + if (pipe(intr_pipe.pipefd) < 0) + return -1; + + /* create the host thread to wait/handle the interrupt */ + ret = pthread_create(&intr_thread, NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } else { + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, + "eal-intr-thread"); + ret_1 = rte_thread_setname(intr_thread, thread_name); + if (ret_1 != 0) + RTE_LOG(ERR, EAL, + "Failed to set thread name for interrupt handling\n"); + } + + return -ret; +} + +static void +eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle) +{ + union rte_intr_read_buffer buf; + int bytes_read = 1; + int nbytes; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + default: + bytes_read = 1; + RTE_LOG(INFO, EAL, "unexpected intr type\n"); + break; + } + + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + do { + nbytes = read(fd, &buf, bytes_read); + if (nbytes < 0) { + if (errno == EINTR || errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + RTE_LOG(ERR, EAL, + "Error reading from fd %d: %s\n", + fd, strerror(errno)); + } else if (nbytes == 0) + RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd); + return; + } while (1); +} + +static int +eal_epoll_process_event(struct epoll_event *evs, unsigned int n, + struct rte_epoll_event *events) +{ + unsigned int i, count = 0; + struct rte_epoll_event *rev; + + for (i = 0; i < n; i++) { + rev = evs[i].data.ptr; + if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID, + RTE_EPOLL_EXEC)) + continue; + + events[count].status = RTE_EPOLL_VALID; + events[count].fd = rev->fd; + events[count].epfd = rev->epfd; + events[count].epdata.event = rev->epdata.event; + events[count].epdata.data = rev->epdata.data; + if (rev->epdata.cb_fun) + rev->epdata.cb_fun(rev->fd, + rev->epdata.cb_arg); + + rte_compiler_barrier(); + rev->status = RTE_EPOLL_VALID; + count++; + } + return count; +} + +static inline int +eal_init_tls_epfd(void) +{ + int pfd = epoll_create(255); + + if (pfd < 0) { + RTE_LOG(ERR, EAL, + "Cannot create epoll instance\n"); + return -1; + } + return pfd; +} + +int +rte_intr_tls_epfd(void) +{ + if (RTE_PER_LCORE(_epfd) == -1) + RTE_PER_LCORE(_epfd) = eal_init_tls_epfd(); + + return RTE_PER_LCORE(_epfd); +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + struct epoll_event evs[maxevents]; + int rc; + + if (!events) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + while (1) { + rc = epoll_wait(epfd, evs, maxevents, timeout); + if (likely(rc > 0)) { + /* epoll_wait has at least one fd ready to read */ + rc = eal_epoll_process_event(evs, rc, events); + break; + } else if (rc < 0) { + if (errno == EINTR) + continue; + /* epoll_wait fail */ + RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n", + strerror(errno)); + rc = -1; + break; + } else { + /* rc == 0, epoll_wait timed out */ + break; + } + } + + return rc; +} + +static inline void +eal_epoll_data_safe_free(struct rte_epoll_event *ev) +{ + while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID, + RTE_EPOLL_INVALID)) + while (ev->status != RTE_EPOLL_VALID) + rte_pause(); + memset(&ev->epdata, 0, sizeof(ev->epdata)); + ev->fd = -1; + ev->epfd = -1; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event) +{ + struct epoll_event ev; + + if (!event) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + if (op == EPOLL_CTL_ADD) { + event->status = RTE_EPOLL_VALID; + event->fd = fd; /* ignore fd in event */ + event->epfd = epfd; + ev.data.ptr = (void *)event; + } + + ev.events = event->epdata.event; + if (epoll_ctl(epfd, op, fd, &ev) < 0) { + RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n", + op, fd, strerror(errno)); + if (op == EPOLL_CTL_ADD) + /* rollback status when CTL_ADD fail */ + event->status = RTE_EPOLL_INVALID; + return -1; + } + + if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID) + eal_epoll_data_safe_free(event); + + return 0; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd, + int op, unsigned int vec, void *data) +{ + struct rte_epoll_event *rev; + struct rte_epoll_data *epdata; + int epfd_op; + unsigned int efd_idx; + int rc = 0; + + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? + (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + + if (!intr_handle || intr_handle->nb_efd == 0 || + efd_idx >= intr_handle->nb_efd) { + RTE_LOG(ERR, EAL, "Wrong intr vector number.\n"); + return -EPERM; + } + + switch (op) { + case RTE_INTR_EVENT_ADD: + epfd_op = EPOLL_CTL_ADD; + rev = &intr_handle->elist[efd_idx]; + if (rev->status != RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event already been added.\n"); + return -EEXIST; + } + + /* attach to intr vector fd */ + epdata = &rev->epdata; + epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; + epdata->data = data; + epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr; + epdata->cb_arg = (void *)intr_handle; + rc = rte_epoll_ctl(epfd, epfd_op, + intr_handle->efds[efd_idx], rev); + if (!rc) + RTE_LOG(DEBUG, EAL, + "efd %d associated with vec %d added on epfd %d" + "\n", rev->fd, vec, epfd); + else + rc = -EPERM; + break; + case RTE_INTR_EVENT_DEL: + epfd_op = EPOLL_CTL_DEL; + rev = &intr_handle->elist[efd_idx]; + if (rev->status == RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event does not exist.\n"); + return -EPERM; + } + + rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); + if (rc) + rc = -EPERM; + break; + default: + RTE_LOG(ERR, EAL, "event op type mismatch\n"); + rc = -EPERM; + } + + return rc; +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + uint32_t i; + int fd; + uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + + assert(nb_efd != 0); + + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { + for (i = 0; i < n; i++) { + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + RTE_LOG(ERR, EAL, + "can't setup eventfd, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + intr_handle->efds[i] = fd; + } + intr_handle->nb_efd = n; + intr_handle->max_intr = NB_OTHER_INTR + n; + } else { + intr_handle->efds[0] = intr_handle->fd; + intr_handle->nb_efd = RTE_MIN(nb_efd, 1U); + intr_handle->max_intr = NB_OTHER_INTR; + } + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + struct rte_epoll_event *rev; + + for (i = 0; i < intr_handle->nb_efd; i++) { + rev = &intr_handle->elist[i]; + if (rev->status == RTE_EPOLL_INVALID) + continue; + if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) { + /* force free if the entry valid */ + eal_epoll_data_safe_free(rev); + rev->status = RTE_EPOLL_INVALID; + } + } + + if (intr_handle->max_intr > intr_handle->nb_efd) { + for (i = 0; i < intr_handle->nb_efd; i++) + close(intr_handle->efds[i]); + } + intr_handle->nb_efd = 0; + intr_handle->max_intr = 0; +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + return !(!intr_handle->nb_efd); +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + if (!rte_intr_dp_is_en(intr_handle)) + return 1; + else + return !!(intr_handle->max_intr - intr_handle->nb_efd); +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) + return 1; + + return 0; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_ivshmem.c b/lib/librte_eal/linuxapp/eal/eal_ivshmem.c new file mode 100644 index 00000000..07aec694 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_ivshmem.c @@ -0,0 +1,955 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef RTE_LIBRTE_IVSHMEM /* hide it from coverage */ + +#include <stdint.h> +#include <unistd.h> +#include <inttypes.h> +#include <sys/mman.h> +#include <sys/file.h> +#include <string.h> +#include <sys/queue.h> + +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_string_fns.h> +#include <rte_errno.h> +#include <rte_ring.h> +#include <rte_mempool.h> +#include <rte_malloc.h> +#include <rte_common.h> +#include <rte_ivshmem.h> + +#include "eal_internal_cfg.h" +#include "eal_private.h" + +#define PCI_VENDOR_ID_IVSHMEM 0x1Af4 +#define PCI_DEVICE_ID_IVSHMEM 0x1110 + +#define IVSHMEM_MAGIC 0x0BADC0DE + +#define IVSHMEM_RESOURCE_PATH "/sys/bus/pci/devices/%04x:%02x:%02x.%x/resource2" +#define IVSHMEM_CONFIG_PATH "/var/run/.%s_ivshmem_config" + +#define PHYS 0x1 +#define VIRT 0x2 +#define IOREMAP 0x4 +#define FULL (PHYS|VIRT|IOREMAP) + +#define METADATA_SIZE_ALIGNED \ + (RTE_ALIGN_CEIL(sizeof(struct rte_ivshmem_metadata),pagesz)) + +#define CONTAINS(x,y)\ + (((y).addr_64 >= (x).addr_64) && ((y).addr_64 < (x).addr_64 + (x).len)) + +#define DIM(x) (sizeof(x)/sizeof(x[0])) + +struct ivshmem_pci_device { + char path[PATH_MAX]; + phys_addr_t ioremap_addr; +}; + +/* data type to store in config */ +struct ivshmem_segment { + struct rte_ivshmem_metadata_entry entry; + uint64_t align; + char path[PATH_MAX]; +}; +struct ivshmem_shared_config { + struct ivshmem_segment segment[RTE_MAX_MEMSEG]; + uint32_t segment_idx; + struct ivshmem_pci_device pci_devs[RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS]; + uint32_t pci_devs_idx; +}; +static struct ivshmem_shared_config * ivshmem_config; +static int memseg_idx; +static int pagesz; + +/* Tailq heads to add rings to */ +TAILQ_HEAD(rte_ring_list, rte_tailq_entry); + +/* + * Utility functions + */ + +static int +is_ivshmem_device(struct rte_pci_device * dev) +{ + return dev->id.vendor_id == PCI_VENDOR_ID_IVSHMEM + && dev->id.device_id == PCI_DEVICE_ID_IVSHMEM; +} + +static void * +map_metadata(int fd, uint64_t len) +{ + size_t metadata_len = sizeof(struct rte_ivshmem_metadata); + size_t aligned_len = METADATA_SIZE_ALIGNED; + + return mmap(NULL, metadata_len, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, len - aligned_len); +} + +static void +unmap_metadata(void * ptr) +{ + munmap(ptr, sizeof(struct rte_ivshmem_metadata)); +} + +static int +has_ivshmem_metadata(int fd, uint64_t len) +{ + struct rte_ivshmem_metadata metadata; + void * ptr; + + ptr = map_metadata(fd, len); + + if (ptr == MAP_FAILED) + return -1; + + metadata = *(struct rte_ivshmem_metadata*) (ptr); + + unmap_metadata(ptr); + + return metadata.magic_number == IVSHMEM_MAGIC; +} + +static void +remove_segment(struct ivshmem_segment * ms, int len, int idx) +{ + int i; + + for (i = idx; i < len - 1; i++) + memcpy(&ms[i], &ms[i+1], sizeof(struct ivshmem_segment)); + memset(&ms[len-1], 0, sizeof(struct ivshmem_segment)); +} + +static int +overlap(const struct rte_memzone * mz1, const struct rte_memzone * mz2) +{ + uint64_t start1, end1, start2, end2; + uint64_t p_start1, p_end1, p_start2, p_end2; + uint64_t i_start1, i_end1, i_start2, i_end2; + int result = 0; + + /* gather virtual addresses */ + start1 = mz1->addr_64; + end1 = mz1->addr_64 + mz1->len; + start2 = mz2->addr_64; + end2 = mz2->addr_64 + mz2->len; + + /* gather physical addresses */ + p_start1 = mz1->phys_addr; + p_end1 = mz1->phys_addr + mz1->len; + p_start2 = mz2->phys_addr; + p_end2 = mz2->phys_addr + mz2->len; + + /* gather ioremap addresses */ + i_start1 = mz1->ioremap_addr; + i_end1 = mz1->ioremap_addr + mz1->len; + i_start2 = mz2->ioremap_addr; + i_end2 = mz2->ioremap_addr + mz2->len; + + /* check for overlap in virtual addresses */ + if (start1 > start2 && start1 < end2) + result |= VIRT; + if (start2 >= start1 && start2 < end1) + result |= VIRT; + + /* check for overlap in physical addresses */ + if (p_start1 > p_start2 && p_start1 < p_end2) + result |= PHYS; + if (p_start2 > p_start1 && p_start2 < p_end1) + result |= PHYS; + + /* check for overlap in ioremap addresses */ + if (i_start1 > i_start2 && i_start1 < i_end2) + result |= IOREMAP; + if (i_start2 > i_start1 && i_start2 < i_end1) + result |= IOREMAP; + + return result; +} + +static int +adjacent(const struct rte_memzone * mz1, const struct rte_memzone * mz2) +{ + uint64_t start1, end1, start2, end2; + uint64_t p_start1, p_end1, p_start2, p_end2; + uint64_t i_start1, i_end1, i_start2, i_end2; + int result = 0; + + /* gather virtual addresses */ + start1 = mz1->addr_64; + end1 = mz1->addr_64 + mz1->len; + start2 = mz2->addr_64; + end2 = mz2->addr_64 + mz2->len; + + /* gather physical addresses */ + p_start1 = mz1->phys_addr; + p_end1 = mz1->phys_addr + mz1->len; + p_start2 = mz2->phys_addr; + p_end2 = mz2->phys_addr + mz2->len; + + /* gather ioremap addresses */ + i_start1 = mz1->ioremap_addr; + i_end1 = mz1->ioremap_addr + mz1->len; + i_start2 = mz2->ioremap_addr; + i_end2 = mz2->ioremap_addr + mz2->len; + + /* check if segments are virtually adjacent */ + if (start1 == end2) + result |= VIRT; + if (start2 == end1) + result |= VIRT; + + /* check if segments are physically adjacent */ + if (p_start1 == p_end2) + result |= PHYS; + if (p_start2 == p_end1) + result |= PHYS; + + /* check if segments are ioremap-adjacent */ + if (i_start1 == i_end2) + result |= IOREMAP; + if (i_start2 == i_end1) + result |= IOREMAP; + + return result; +} + +static int +has_adjacent_segments(struct ivshmem_segment * ms, int len) +{ + int i, j; + + for (i = 0; i < len; i++) + for (j = i + 1; j < len; j++) { + /* we're only interested in fully adjacent segments; partially + * adjacent segments can coexist. + */ + if (adjacent(&ms[i].entry.mz, &ms[j].entry.mz) == FULL) + return 1; + } + return 0; +} + +static int +has_overlapping_segments(struct ivshmem_segment * ms, int len) +{ + int i, j; + + for (i = 0; i < len; i++) + for (j = i + 1; j < len; j++) + if (overlap(&ms[i].entry.mz, &ms[j].entry.mz)) + return 1; + return 0; +} + +static int +seg_compare(const void * a, const void * b) +{ + const struct ivshmem_segment * s1 = (const struct ivshmem_segment*) a; + const struct ivshmem_segment * s2 = (const struct ivshmem_segment*) b; + + /* move unallocated zones to the end */ + if (s1->entry.mz.addr == NULL && s2->entry.mz.addr == NULL) + return 0; + if (s1->entry.mz.addr == 0) + return 1; + if (s2->entry.mz.addr == 0) + return -1; + + return s1->entry.mz.phys_addr > s2->entry.mz.phys_addr; +} + +#ifdef RTE_LIBRTE_IVSHMEM_DEBUG +static void +entry_dump(struct rte_ivshmem_metadata_entry *e) +{ + RTE_LOG(DEBUG, EAL, "\tvirt: %p-%p\n", e->mz.addr, + RTE_PTR_ADD(e->mz.addr, e->mz.len)); + RTE_LOG(DEBUG, EAL, "\tphys: 0x%" PRIx64 "-0x%" PRIx64 "\n", + e->mz.phys_addr, + e->mz.phys_addr + e->mz.len); + RTE_LOG(DEBUG, EAL, "\tio: 0x%" PRIx64 "-0x%" PRIx64 "\n", + e->mz.ioremap_addr, + e->mz.ioremap_addr + e->mz.len); + RTE_LOG(DEBUG, EAL, "\tlen: 0x%" PRIx64 "\n", e->mz.len); + RTE_LOG(DEBUG, EAL, "\toff: 0x%" PRIx64 "\n", e->offset); +} +#endif + + + +/* + * Actual useful code + */ + +/* read through metadata mapped from the IVSHMEM device */ +static int +read_metadata(char * path, int path_len, int fd, uint64_t flen) +{ + struct rte_ivshmem_metadata metadata; + struct rte_ivshmem_metadata_entry * entry; + int idx, i; + void * ptr; + + ptr = map_metadata(fd, flen); + + if (ptr == MAP_FAILED) + return -1; + + metadata = *(struct rte_ivshmem_metadata*) (ptr); + + unmap_metadata(ptr); + + RTE_LOG(DEBUG, EAL, "Parsing metadata for \"%s\"\n", metadata.name); + + idx = ivshmem_config->segment_idx; + + for (i = 0; i < RTE_LIBRTE_IVSHMEM_MAX_ENTRIES && + idx <= RTE_MAX_MEMSEG; i++) { + + if (idx == RTE_MAX_MEMSEG) { + RTE_LOG(ERR, EAL, "Not enough memory segments!\n"); + return -1; + } + + entry = &metadata.entry[i]; + + /* stop on uninitialized memzone */ + if (entry->mz.len == 0) + break; + + /* copy metadata entry */ + memcpy(&ivshmem_config->segment[idx].entry, entry, + sizeof(struct rte_ivshmem_metadata_entry)); + + /* copy path */ + snprintf(ivshmem_config->segment[idx].path, path_len, "%s", path); + + idx++; + } + ivshmem_config->segment_idx = idx; + + return 0; +} + +/* check through each segment and look for adjacent or overlapping ones. */ +static int +cleanup_segments(struct ivshmem_segment * ms, int tbl_len) +{ + struct ivshmem_segment * s, * tmp; + int i, j, concat, seg_adjacent, seg_overlapping; + uint64_t start1, start2, end1, end2, p_start1, p_start2, i_start1, i_start2; + + qsort(ms, tbl_len, sizeof(struct ivshmem_segment), + seg_compare); + + while (has_overlapping_segments(ms, tbl_len) || + has_adjacent_segments(ms, tbl_len)) { + + for (i = 0; i < tbl_len; i++) { + s = &ms[i]; + + concat = 0; + + for (j = i + 1; j < tbl_len; j++) { + tmp = &ms[j]; + + /* check if this segment is overlapping with existing segment, + * or is adjacent to existing segment */ + seg_overlapping = overlap(&s->entry.mz, &tmp->entry.mz); + seg_adjacent = adjacent(&s->entry.mz, &tmp->entry.mz); + + /* check if segments fully overlap or are fully adjacent */ + if ((seg_adjacent == FULL) || (seg_overlapping == FULL)) { + +#ifdef RTE_LIBRTE_IVSHMEM_DEBUG + RTE_LOG(DEBUG, EAL, "Concatenating segments\n"); + RTE_LOG(DEBUG, EAL, "Segment %i:\n", i); + entry_dump(&s->entry); + RTE_LOG(DEBUG, EAL, "Segment %i:\n", j); + entry_dump(&tmp->entry); +#endif + + start1 = s->entry.mz.addr_64; + start2 = tmp->entry.mz.addr_64; + p_start1 = s->entry.mz.phys_addr; + p_start2 = tmp->entry.mz.phys_addr; + i_start1 = s->entry.mz.ioremap_addr; + i_start2 = tmp->entry.mz.ioremap_addr; + end1 = s->entry.mz.addr_64 + s->entry.mz.len; + end2 = tmp->entry.mz.addr_64 + tmp->entry.mz.len; + + /* settle for minimum start address and maximum length */ + s->entry.mz.addr_64 = RTE_MIN(start1, start2); + s->entry.mz.phys_addr = RTE_MIN(p_start1, p_start2); + s->entry.mz.ioremap_addr = RTE_MIN(i_start1, i_start2); + s->entry.offset = RTE_MIN(s->entry.offset, tmp->entry.offset); + s->entry.mz.len = RTE_MAX(end1, end2) - s->entry.mz.addr_64; + concat = 1; + +#ifdef RTE_LIBRTE_IVSHMEM_DEBUG + RTE_LOG(DEBUG, EAL, "Resulting segment:\n"); + entry_dump(&s->entry); + +#endif + } + /* if segments not fully overlap, we have an error condition. + * adjacent segments can coexist. + */ + else if (seg_overlapping > 0) { + RTE_LOG(ERR, EAL, "Segments %i and %i overlap!\n", i, j); +#ifdef RTE_LIBRTE_IVSHMEM_DEBUG + RTE_LOG(DEBUG, EAL, "Segment %i:\n", i); + entry_dump(&s->entry); + RTE_LOG(DEBUG, EAL, "Segment %i:\n", j); + entry_dump(&tmp->entry); +#endif + return -1; + } + if (concat) + break; + } + /* if we concatenated, remove segment at j */ + if (concat) { + remove_segment(ms, tbl_len, j); + tbl_len--; + break; + } + } + } + + return tbl_len; +} + +static int +create_shared_config(void) +{ + char path[PATH_MAX]; + int fd; + + /* build ivshmem config file path */ + snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH, + internal_config.hugefile_prefix); + + fd = open(path, O_CREAT | O_RDWR, 0600); + + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s: %s\n", path, strerror(errno)); + return -1; + } + + /* try ex-locking first - if the file is locked, we have a problem */ + if (flock(fd, LOCK_EX | LOCK_NB) == -1) { + RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno)); + close(fd); + return -1; + } + + if (ftruncate(fd, sizeof(struct ivshmem_shared_config)) < 0) { + RTE_LOG(ERR, EAL, "ftruncate failed: %s\n", strerror(errno)); + return -1; + } + + ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + if (ivshmem_config == MAP_FAILED) + return -1; + + memset(ivshmem_config, 0, sizeof(struct ivshmem_shared_config)); + + /* change the exclusive lock we got earlier to a shared lock */ + if (flock(fd, LOCK_SH | LOCK_NB) == -1) { + RTE_LOG(ERR, EAL, "Locking %s failed: %s \n", path, strerror(errno)); + return -1; + } + + close(fd); + + return 0; +} + +/* open shared config file and, if present, map the config. + * having no config file is not an error condition, as we later check if + * ivshmem_config is NULL (if it is, that means nothing was mapped). */ +static int +open_shared_config(void) +{ + char path[PATH_MAX]; + int fd; + + /* build ivshmem config file path */ + snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH, + internal_config.hugefile_prefix); + + fd = open(path, O_RDONLY); + + /* if the file doesn't exist, just return success */ + if (fd < 0 && errno == ENOENT) + return 0; + /* else we have an error condition */ + else if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s: %s\n", + path, strerror(errno)); + return -1; + } + + /* try ex-locking first - if the lock *does* succeed, this means it's a + * stray config file, so it should be deleted. + */ + if (flock(fd, LOCK_EX | LOCK_NB) != -1) { + + /* if we can't remove the file, something is wrong */ + if (unlink(path) < 0) { + RTE_LOG(ERR, EAL, "Could not remove %s: %s\n", path, + strerror(errno)); + return -1; + } + + /* release the lock */ + flock(fd, LOCK_UN); + close(fd); + + /* return success as having a stray config file is equivalent to not + * having config file at all. + */ + return 0; + } + + ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config), + PROT_READ, MAP_SHARED, fd, 0); + + if (ivshmem_config == MAP_FAILED) + return -1; + + /* place a shared lock on config file */ + if (flock(fd, LOCK_SH | LOCK_NB) == -1) { + RTE_LOG(ERR, EAL, "Locking %s failed: %s \n", path, strerror(errno)); + return -1; + } + + close(fd); + + return 0; +} + +/* + * This function does the following: + * + * 1) Builds a table of ivshmem_segments with proper offset alignment + * 2) Cleans up that table so that we don't have any overlapping or adjacent + * memory segments + * 3) Creates memsegs from this table and maps them into memory. + */ +static inline int +map_all_segments(void) +{ + struct ivshmem_segment ms_tbl[RTE_MAX_MEMSEG]; + struct ivshmem_pci_device * pci_dev; + struct rte_mem_config * mcfg; + struct ivshmem_segment * seg; + int fd, fd_zero; + unsigned i, j; + struct rte_memzone mz; + struct rte_memseg ms; + void * base_addr; + uint64_t align, len; + phys_addr_t ioremap_addr; + + ioremap_addr = 0; + + memset(ms_tbl, 0, sizeof(ms_tbl)); + memset(&mz, 0, sizeof(struct rte_memzone)); + memset(&ms, 0, sizeof(struct rte_memseg)); + + /* first, build a table of memsegs to map, to avoid failed mmaps due to + * overlaps + */ + for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMSEG; i++) { + if (i == RTE_MAX_MEMSEG) { + RTE_LOG(ERR, EAL, "Too many segments requested!\n"); + return -1; + } + + seg = &ivshmem_config->segment[i]; + + /* copy segment to table */ + memcpy(&ms_tbl[i], seg, sizeof(struct ivshmem_segment)); + + /* find ioremap addr */ + for (j = 0; j < DIM(ivshmem_config->pci_devs); j++) { + pci_dev = &ivshmem_config->pci_devs[j]; + if (!strncmp(pci_dev->path, seg->path, sizeof(pci_dev->path))) { + ioremap_addr = pci_dev->ioremap_addr; + break; + } + } + if (ioremap_addr == 0) { + RTE_LOG(ERR, EAL, "Cannot find ioremap addr!\n"); + return -1; + } + + /* work out alignments */ + align = seg->entry.mz.addr_64 - + RTE_ALIGN_FLOOR(seg->entry.mz.addr_64, 0x1000); + len = RTE_ALIGN_CEIL(seg->entry.mz.len + align, 0x1000); + + /* save original alignments */ + ms_tbl[i].align = align; + + /* create a memory zone */ + mz.addr_64 = seg->entry.mz.addr_64 - align; + mz.len = len; + mz.hugepage_sz = seg->entry.mz.hugepage_sz; + mz.phys_addr = seg->entry.mz.phys_addr - align; + + /* find true physical address */ + mz.ioremap_addr = ioremap_addr + seg->entry.offset - align; + + ms_tbl[i].entry.offset = seg->entry.offset - align; + + memcpy(&ms_tbl[i].entry.mz, &mz, sizeof(struct rte_memzone)); + } + + /* clean up the segments */ + memseg_idx = cleanup_segments(ms_tbl, ivshmem_config->segment_idx); + + if (memseg_idx < 0) + return -1; + + mcfg = rte_eal_get_configuration()->mem_config; + + fd_zero = open("/dev/zero", O_RDWR); + + if (fd_zero < 0) { + RTE_LOG(ERR, EAL, "Cannot open /dev/zero: %s\n", strerror(errno)); + return -1; + } + + /* create memsegs and put them into DPDK memory */ + for (i = 0; i < (unsigned) memseg_idx; i++) { + + seg = &ms_tbl[i]; + + ms.addr_64 = seg->entry.mz.addr_64; + ms.hugepage_sz = seg->entry.mz.hugepage_sz; + ms.len = seg->entry.mz.len; + ms.nchannel = rte_memory_get_nchannel(); + ms.nrank = rte_memory_get_nrank(); + ms.phys_addr = seg->entry.mz.phys_addr; + ms.ioremap_addr = seg->entry.mz.ioremap_addr; + ms.socket_id = seg->entry.mz.socket_id; + + base_addr = mmap(ms.addr, ms.len, + PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_zero, 0); + + if (base_addr == MAP_FAILED || base_addr != ms.addr) { + RTE_LOG(ERR, EAL, "Cannot map /dev/zero!\n"); + return -1; + } + + fd = open(seg->path, O_RDWR); + + if (fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", seg->path, + strerror(errno)); + return -1; + } + + munmap(ms.addr, ms.len); + + base_addr = mmap(ms.addr, ms.len, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, + seg->entry.offset); + + + if (base_addr == MAP_FAILED || base_addr != ms.addr) { + RTE_LOG(ERR, EAL, "Cannot map segment into memory: " + "expected %p got %p (%s)\n", ms.addr, base_addr, + strerror(errno)); + return -1; + } + + RTE_LOG(DEBUG, EAL, "Memory segment mapped: %p (len %" PRIx64 ") at " + "offset 0x%" PRIx64 "\n", + ms.addr, ms.len, seg->entry.offset); + + /* put the pointers back into their real positions using original + * alignment */ + ms.addr_64 += seg->align; + ms.phys_addr += seg->align; + ms.ioremap_addr += seg->align; + ms.len -= seg->align; + + /* at this point, the rest of DPDK memory is not initialized, so we + * expect memsegs to be empty */ + memcpy(&mcfg->memseg[i], &ms, + sizeof(struct rte_memseg)); + + close(fd); + + RTE_LOG(DEBUG, EAL, "IVSHMEM segment found, size: 0x%lx\n", + ms.len); + } + + return 0; +} + +/* this happens at a later stage, after general EAL memory initialization */ +int +rte_eal_ivshmem_obj_init(void) +{ + struct rte_ring_list* ring_list = NULL; + struct rte_mem_config * mcfg; + struct ivshmem_segment * seg; + struct rte_memzone * mz; + struct rte_ring * r; + struct rte_tailq_entry *te; + unsigned i, ms, idx; + uint64_t offset; + + /* secondary process would not need any object discovery - it'll all + * already be in shared config */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY || ivshmem_config == NULL) + return 0; + + /* check that we have an initialised ring tail queue */ + ring_list = RTE_TAILQ_LOOKUP(RTE_TAILQ_RING_NAME, rte_ring_list); + if (ring_list == NULL) { + RTE_LOG(ERR, EAL, "No rte_ring tailq found!\n"); + return -1; + } + + mcfg = rte_eal_get_configuration()->mem_config; + + /* create memzones */ + for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMZONE; i++) { + + seg = &ivshmem_config->segment[i]; + + /* add memzone */ + if (mcfg->memzone_cnt == RTE_MAX_MEMZONE) { + RTE_LOG(ERR, EAL, "No more memory zones available!\n"); + return -1; + } + + idx = mcfg->memzone_cnt; + + RTE_LOG(DEBUG, EAL, "Found memzone: '%s' at %p (len 0x%" PRIx64 ")\n", + seg->entry.mz.name, seg->entry.mz.addr, seg->entry.mz.len); + + memcpy(&mcfg->memzone[idx], &seg->entry.mz, + sizeof(struct rte_memzone)); + + /* find ioremap address */ + for (ms = 0; ms <= RTE_MAX_MEMSEG; ms++) { + if (ms == RTE_MAX_MEMSEG) { + RTE_LOG(ERR, EAL, "Physical address of segment not found!\n"); + return -1; + } + if (CONTAINS(mcfg->memseg[ms], mcfg->memzone[idx])) { + offset = mcfg->memzone[idx].addr_64 - + mcfg->memseg[ms].addr_64; + mcfg->memzone[idx].ioremap_addr = mcfg->memseg[ms].ioremap_addr + + offset; + break; + } + } + + mcfg->memzone_cnt++; + } + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find rings */ + for (i = 0; i < mcfg->memzone_cnt; i++) { + mz = &mcfg->memzone[i]; + + /* check if memzone has a ring prefix */ + if (strncmp(mz->name, RTE_RING_MZ_PREFIX, + sizeof(RTE_RING_MZ_PREFIX) - 1) != 0) + continue; + + r = (struct rte_ring*) (mz->addr_64); + + te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0); + if (te == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate ring tailq entry!\n"); + return -1; + } + + te->data = (void *) r; + + TAILQ_INSERT_TAIL(ring_list, te, next); + + RTE_LOG(DEBUG, EAL, "Found ring: '%s' at %p\n", r->name, mz->addr); + } + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + +#ifdef RTE_LIBRTE_IVSHMEM_DEBUG + rte_memzone_dump(stdout); + rte_ring_list_dump(stdout); +#endif + + return 0; +} + +/* initialize ivshmem structures */ +int rte_eal_ivshmem_init(void) +{ + struct rte_pci_device * dev; + struct rte_pci_resource * res; + int fd, ret; + char path[PATH_MAX]; + + /* initialize everything to 0 */ + memset(path, 0, sizeof(path)); + ivshmem_config = NULL; + + pagesz = getpagesize(); + + RTE_LOG(DEBUG, EAL, "Searching for IVSHMEM devices...\n"); + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + + if (open_shared_config() < 0) { + RTE_LOG(ERR, EAL, "Could not open IVSHMEM config!\n"); + return -1; + } + } + else { + + TAILQ_FOREACH(dev, &pci_device_list, next) { + + if (is_ivshmem_device(dev)) { + + /* IVSHMEM memory is always on BAR2 */ + res = &dev->mem_resource[2]; + + /* if we don't have a BAR2 */ + if (res->len == 0) + continue; + + /* construct pci device path */ + snprintf(path, sizeof(path), IVSHMEM_RESOURCE_PATH, + dev->addr.domain, dev->addr.bus, dev->addr.devid, + dev->addr.function); + + /* try to find memseg */ + fd = open(path, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", path); + return -1; + } + + /* check if it's a DPDK IVSHMEM device */ + ret = has_ivshmem_metadata(fd, res->len); + + /* is DPDK device */ + if (ret == 1) { + + /* config file creation is deferred until the first + * DPDK device is found. then, it has to be created + * only once. */ + if (ivshmem_config == NULL && + create_shared_config() < 0) { + RTE_LOG(ERR, EAL, "Could not create IVSHMEM config!\n"); + close(fd); + return -1; + } + + if (read_metadata(path, sizeof(path), fd, res->len) < 0) { + RTE_LOG(ERR, EAL, "Could not read metadata from" + " device %02x:%02x.%x!\n", dev->addr.bus, + dev->addr.devid, dev->addr.function); + close(fd); + return -1; + } + + if (ivshmem_config->pci_devs_idx == RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS) { + RTE_LOG(WARNING, EAL, + "IVSHMEM PCI device limit exceeded. Increase " + "CONFIG_RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS in " + "your config file.\n"); + break; + } + + RTE_LOG(INFO, EAL, "Found IVSHMEM device %02x:%02x.%x\n", + dev->addr.bus, dev->addr.devid, dev->addr.function); + + ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].ioremap_addr = res->phys_addr; + snprintf(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path, + sizeof(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path), + "%s", path); + + ivshmem_config->pci_devs_idx++; + } + /* failed to read */ + else if (ret < 0) { + RTE_LOG(ERR, EAL, "Could not read IVSHMEM device: %s\n", + strerror(errno)); + close(fd); + return -1; + } + /* not a DPDK device */ + else + RTE_LOG(DEBUG, EAL, "Skipping non-DPDK IVSHMEM device\n"); + + /* close the BAR fd */ + close(fd); + } + } + } + + /* ivshmem_config is not NULL only if config was created and/or mapped */ + if (ivshmem_config) { + if (map_all_segments() < 0) { + RTE_LOG(ERR, EAL, "Mapping IVSHMEM segments failed!\n"); + return -1; + } + } + else { + RTE_LOG(DEBUG, EAL, "No IVSHMEM configuration found! \n"); + } + + return 0; +} + +#endif diff --git a/lib/librte_eal/linuxapp/eal/eal_lcore.c b/lib/librte_eal/linuxapp/eal/eal_lcore.c new file mode 100644 index 00000000..de5b4260 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_lcore.c @@ -0,0 +1,110 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <unistd.h> +#include <limits.h> +#include <string.h> +#include <dirent.h> + +#include <rte_log.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_filesystem.h" +#include "eal_thread.h" + +#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u" +#define CORE_ID_FILE "topology/core_id" +#define NUMA_NODE_PATH "/sys/devices/system/node" + +/* Check if a cpu is present by the presence of the cpu information for it */ +int +eal_cpu_detected(unsigned lcore_id) +{ + char path[PATH_MAX]; + int len = snprintf(path, sizeof(path), SYS_CPU_DIR + "/"CORE_ID_FILE, lcore_id); + if (len <= 0 || (unsigned)len >= sizeof(path)) + return 0; + if (access(path, F_OK) != 0) + return 0; + + return 1; +} + +/* + * Get CPU socket id (NUMA node) for a logical core. + * + * This searches each nodeX directories in /sys for the symlink for the given + * lcore_id and returns the numa node where the lcore is found. If lcore is not + * found on any numa node, returns zero. + */ +unsigned +eal_cpu_socket_id(unsigned lcore_id) +{ + unsigned socket; + + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, + socket, lcore_id); + if (access(path, F_OK) == 0) + return socket; + } + return 0; +} + +/* Get the cpu core id value from the /sys/.../cpuX core_id value */ +unsigned +eal_cpu_core_id(unsigned lcore_id) +{ + char path[PATH_MAX]; + unsigned long id; + + int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE); + if (len <= 0 || (unsigned)len >= sizeof(path)) + goto err; + if (eal_parse_sysfs_value(path, &id) != 0) + goto err; + return (unsigned)id; + +err: + RTE_LOG(ERR, EAL, "Error reading core id value from %s " + "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id); + return 0; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c new file mode 100644 index 00000000..0b133c3e --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_log.c @@ -0,0 +1,146 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <stdio.h> +#include <stdint.h> +#include <sys/types.h> +#include <syslog.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_spinlock.h> +#include <rte_log.h> + +#include "eal_private.h" + +/* + * default log function, used once mempool (hence log history) is + * available + */ +static ssize_t +console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) +{ + char copybuf[BUFSIZ + 1]; + ssize_t ret; + uint32_t loglevel; + + /* add this log in history */ + rte_log_add_in_history(buf, size); + + /* write on stdout */ + ret = fwrite(buf, 1, size, stdout); + fflush(stdout); + + /* truncate message if too big (should not happen) */ + if (size > BUFSIZ) + size = BUFSIZ; + + /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ + loglevel = rte_log_cur_msg_loglevel() - 1; + memcpy(copybuf, buf, size); + copybuf[size] = '\0'; + + /* write on syslog too */ + syslog(loglevel, "%s", copybuf); + + return ret; +} + +static cookie_io_functions_t console_log_func = { + .write = console_log_write, +}; + +/* + * set the log to default function, called during eal init process, + * once memzones are available. + */ +int +rte_eal_log_init(const char *id, int facility) +{ + FILE *log_stream; + + log_stream = fopencookie(NULL, "w+", console_log_func); + if (log_stream == NULL) + return -1; + + openlog(id, LOG_NDELAY | LOG_PID, facility); + + if (rte_eal_common_log_init(log_stream) < 0) + return -1; + + return 0; +} + +/* early logs */ + +/* + * early log function, used during boot when mempool (hence log + * history) is not available + */ +static ssize_t +early_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) +{ + ssize_t ret; + ret = fwrite(buf, size, 1, stdout); + fflush(stdout); + if (ret == 0) + return -1; + return ret; +} + +static cookie_io_functions_t early_log_func = { + .write = early_log_write, +}; +static FILE *early_log_stream; + +/* + * init the log library, called by rte_eal_init() to enable early + * logs + */ +int +rte_eal_log_early_init(void) +{ + early_log_stream = fopencookie(NULL, "w+", early_log_func); + if (early_log_stream == NULL) { + printf("Cannot configure early_log_stream\n"); + return -1; + } + rte_openlog_stream(early_log_stream); + return 0; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c new file mode 100644 index 00000000..5b9132c6 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -0,0 +1,1559 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* BSD LICENSE + * + * Copyright(c) 2013 6WIND. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define _FILE_OFFSET_BITS 64 +#include <errno.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <stdarg.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/queue.h> +#include <sys/file.h> +#include <unistd.h> +#include <limits.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <sys/time.h> + +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" + +#ifdef RTE_LIBRTE_XEN_DOM0 +int rte_xen_dom0_supported(void) +{ + return internal_config.xen_dom0_support; +} +#endif + +/** + * @file + * Huge page mapping under linux + * + * To reserve a big contiguous amount of memory, we use the hugepage + * feature of linux. For that, we need to have hugetlbfs mounted. This + * code will create many files in this directory (one per page) and + * map them in virtual memory. For each page, we will retrieve its + * physical address and remap it in order to have a virtual contiguous + * zone as well as a physical contiguous zone. + */ + +static uint64_t baseaddr_offset; + +static unsigned proc_pagemap_readable; + +#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" + +static void +test_proc_pagemap_readable(void) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) { + RTE_LOG(ERR, EAL, + "Cannot open /proc/self/pagemap: %s. " + "virt2phys address translation will not work\n", + strerror(errno)); + return; + } + + /* Is readable */ + close(fd); + proc_pagemap_readable = 1; +} + +/* Lock page in physical memory and prevent from swapping. */ +int +rte_mem_lock_page(const void *virt) +{ + unsigned long virtual = (unsigned long)virt; + int page_size = getpagesize(); + unsigned long aligned = (virtual & ~ (page_size - 1)); + return mlock((void*)aligned, page_size); +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + int fd; + uint64_t page, physaddr; + unsigned long virt_pfn; + int page_size; + off_t offset; + + /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */ + if (!proc_pagemap_readable) + return RTE_BAD_PHYS_ADDR; + + /* standard page size */ + page_size = getpagesize(); + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_PHYS_ADDR; + } + + virt_pfn = (unsigned long)virtaddr / page_size; + offset = sizeof(uint64_t) * virt_pfn; + if (lseek(fd, offset, SEEK_SET) == (off_t) -1) { + RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + close(fd); + return RTE_BAD_PHYS_ADDR; + } + if (read(fd, &page, sizeof(uint64_t)) < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + close(fd); + return RTE_BAD_PHYS_ADDR; + } + + /* + * the pfn (page frame number) are bits 0-54 (see + * pagemap.txt in linux Documentation) + */ + physaddr = ((page & 0x7fffffffffffffULL) * page_size) + + ((unsigned long)virtaddr % page_size); + close(fd); + return physaddr; +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value. We find + * it by browsing the /proc/self/pagemap special file. + */ +static int +find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned i; + phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va); + if (addr == RTE_BAD_PHYS_ADDR) + return -1; + hugepg_tbl[i].physaddr = addr; + } + return 0; +} + +/* + * Check whether address-space layout randomization is enabled in + * the kernel. This is important for multi-process as it can prevent + * two processes mapping data to the same virtual address + * Returns: + * 0 - address space randomization disabled + * 1/2 - address space randomization enabled + * negative error code on error + */ +static int +aslr_enabled(void) +{ + char c; + int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY); + if (fd < 0) + return -errno; + retval = read(fd, &c, 1); + close(fd); + if (retval < 0) + return -errno; + if (retval == 0) + return -EIO; + switch (c) { + case '0' : return 0; + case '1' : return 1; + case '2' : return 2; + default: return -EINVAL; + } +} + +/* + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. Else, retry + * with a smaller zone: decrease *size by hugepage_sz until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of hugepage size. + */ +static void * +get_virtual_area(size_t *size, size_t hugepage_sz) +{ + void *addr; + int fd; + long aligned_addr; + + if (internal_config.base_virtaddr != 0) { + addr = (void*) (uintptr_t) (internal_config.base_virtaddr + + baseaddr_offset); + } + else addr = NULL; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); + + fd = open("/dev/zero", O_RDONLY); + if (fd < 0){ + RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n"); + return NULL; + } + do { + addr = mmap(addr, + (*size) + hugepage_sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) + *size -= hugepage_sz; + } while (addr == MAP_FAILED && *size > 0); + + if (addr == MAP_FAILED) { + close(fd); + RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", + strerror(errno)); + return NULL; + } + + munmap(addr, (*size) + hugepage_sz); + close(fd); + + /* align addr to a huge page size boundary */ + aligned_addr = (long)addr; + aligned_addr += (hugepage_sz - 1); + aligned_addr &= (~(hugepage_sz - 1)); + addr = (void *)(aligned_addr); + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + addr, *size); + + /* increment offset */ + baseaddr_offset += *size; + + return addr; +} + +/* + * Mmap all hugepages of hugepage table: it first open a file in + * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the + * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored + * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to + * map continguous physical blocks in contiguous virtual blocks. + */ +static int +map_all_hugepages(struct hugepage_file *hugepg_tbl, + struct hugepage_info *hpi, int orig) +{ + int fd; + unsigned i; + void *virtaddr; + void *vma_addr = NULL; + size_t vma_len = 0; + +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + RTE_SET_USED(vma_len); +#endif + + for (i = 0; i < hpi->num_pages[0]; i++) { + uint64_t hugepage_sz = hpi->hugepage_sz; + + if (orig) { + hugepg_tbl[i].file_id = i; + hugepg_tbl[i].size = hugepage_sz; +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + eal_get_hugefile_temp_path(hugepg_tbl[i].filepath, + sizeof(hugepg_tbl[i].filepath), hpi->hugedir, + hugepg_tbl[i].file_id); +#else + eal_get_hugefile_path(hugepg_tbl[i].filepath, + sizeof(hugepg_tbl[i].filepath), hpi->hugedir, + hugepg_tbl[i].file_id); +#endif + hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0'; + } +#ifndef RTE_ARCH_64 + /* for 32-bit systems, don't remap 1G and 16G pages, just reuse + * original map address as final map address. + */ + else if ((hugepage_sz == RTE_PGSIZE_1G) + || (hugepage_sz == RTE_PGSIZE_16G)) { + hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va; + hugepg_tbl[i].orig_va = NULL; + continue; + } +#endif + +#ifndef RTE_EAL_SINGLE_FILE_SEGMENTS + else if (vma_len == 0) { + unsigned j, num_pages; + + /* reserve a virtual area for next contiguous + * physical block: count the number of + * contiguous physical pages. */ + for (j = i+1; j < hpi->num_pages[0] ; j++) { +#ifdef RTE_ARCH_PPC_64 + /* The physical addresses are sorted in + * descending order on PPC64 */ + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr - hugepage_sz) + break; +#else + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr + hugepage_sz) + break; +#endif + } + num_pages = j - i; + vma_len = num_pages * hugepage_sz; + + /* get the biggest virtual memory area up to + * vma_len. If it fails, vma_addr is NULL, so + * let the kernel provide the address. */ + vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz); + if (vma_addr == NULL) + vma_len = hugepage_sz; + } +#endif + + /* try to create hugepage file */ + fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + return -1; + } + + /* map the segment, and populate page tables, + * the kernel fills this segment with zeros */ + virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (virtaddr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, + strerror(errno)); + close(fd); + return -1; + } + + if (orig) { + hugepg_tbl[i].orig_va = virtaddr; + } + else { + hugepg_tbl[i].final_va = virtaddr; + } + + /* set shared flock on the file. */ + if (flock(fd, LOCK_SH | LOCK_NB) == -1) { + RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n", + __func__, strerror(errno)); + close(fd); + return -1; + } + + close(fd); + + vma_addr = (char *)vma_addr + hugepage_sz; + vma_len -= hugepage_sz; + } + return 0; +} + +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + +/* + * Remaps all hugepages into single file segments + */ +static int +remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + int fd; + unsigned i = 0, j, num_pages, page_idx = 0; + void *vma_addr = NULL, *old_addr = NULL, *page_addr = NULL; + size_t vma_len = 0; + size_t hugepage_sz = hpi->hugepage_sz; + size_t total_size, offset; + char filepath[MAX_HUGEPAGE_PATH]; + phys_addr_t physaddr; + int socket; + + while (i < hpi->num_pages[0]) { + +#ifndef RTE_ARCH_64 + /* for 32-bit systems, don't remap 1G pages and 16G pages, + * just reuse original map address as final map address. + */ + if ((hugepage_sz == RTE_PGSIZE_1G) + || (hugepage_sz == RTE_PGSIZE_16G)) { + hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va; + hugepg_tbl[i].orig_va = NULL; + i++; + continue; + } +#endif + + /* reserve a virtual area for next contiguous + * physical block: count the number of + * contiguous physical pages. */ + for (j = i+1; j < hpi->num_pages[0] ; j++) { +#ifdef RTE_ARCH_PPC_64 + /* The physical addresses are sorted in descending + * order on PPC64 */ + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr - hugepage_sz) + break; +#else + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr + hugepage_sz) + break; +#endif + } + num_pages = j - i; + vma_len = num_pages * hugepage_sz; + + socket = hugepg_tbl[i].socket_id; + + /* get the biggest virtual memory area up to + * vma_len. If it fails, vma_addr is NULL, so + * let the kernel provide the address. */ + vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz); + + /* If we can't find a big enough virtual area, work out how many pages + * we are going to get */ + if (vma_addr == NULL) + j = i + 1; + else if (vma_len != num_pages * hugepage_sz) { + num_pages = vma_len / hugepage_sz; + j = i + num_pages; + + } + + hugepg_tbl[page_idx].file_id = page_idx; + eal_get_hugefile_path(filepath, + sizeof(filepath), + hpi->hugedir, + hugepg_tbl[page_idx].file_id); + + /* try to create hugepage file */ + fd = open(filepath, O_CREAT | O_RDWR, 0755); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__, strerror(errno)); + return -1; + } + + total_size = 0; + for (;i < j; i++) { + + /* unmap current segment */ + if (total_size > 0) + munmap(vma_addr, total_size); + + /* unmap original page */ + munmap(hugepg_tbl[i].orig_va, hugepage_sz); + unlink(hugepg_tbl[i].filepath); + + total_size += hugepage_sz; + + old_addr = vma_addr; + + /* map new, bigger segment, and populate page tables, + * the kernel fills this segment with zeros */ + vma_addr = mmap(vma_addr, total_size, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); + + if (vma_addr == MAP_FAILED || vma_addr != old_addr) { + RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, strerror(errno)); + close(fd); + return -1; + } + } + + /* set shared flock on the file. */ + if (flock(fd, LOCK_SH | LOCK_NB) == -1) { + RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n", + __func__, strerror(errno)); + close(fd); + return -1; + } + + snprintf(hugepg_tbl[page_idx].filepath, MAX_HUGEPAGE_PATH, "%s", + filepath); + + physaddr = rte_mem_virt2phy(vma_addr); + + if (physaddr == RTE_BAD_PHYS_ADDR) + return -1; + + hugepg_tbl[page_idx].final_va = vma_addr; + + hugepg_tbl[page_idx].physaddr = physaddr; + + hugepg_tbl[page_idx].repeated = num_pages; + + hugepg_tbl[page_idx].socket_id = socket; + + close(fd); + + /* verify the memory segment - that is, check that every VA corresponds + * to the physical address we expect to see + */ + for (offset = 0; offset < vma_len; offset += hugepage_sz) { + uint64_t expected_physaddr; + + expected_physaddr = hugepg_tbl[page_idx].physaddr + offset; + page_addr = RTE_PTR_ADD(vma_addr, offset); + physaddr = rte_mem_virt2phy(page_addr); + + if (physaddr != expected_physaddr) { + RTE_LOG(ERR, EAL, "Segment sanity check failed: wrong physaddr " + "at %p (offset 0x%" PRIx64 ": 0x%" PRIx64 + " (expected 0x%" PRIx64 ")\n", + page_addr, offset, physaddr, expected_physaddr); + return -1; + } + } + + page_idx++; + } + + /* zero out the rest */ + memset(&hugepg_tbl[page_idx], 0, (hpi->num_pages[0] - page_idx) * sizeof(struct hugepage_file)); + return page_idx; +} +#else/* RTE_EAL_SINGLE_FILE_SEGMENTS=n */ + +/* Unmap all hugepages from original mapping */ +static int +unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned i; + for (i = 0; i < hpi->num_pages[0]; i++) { + if (hugepg_tbl[i].orig_va) { + munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz); + hugepg_tbl[i].orig_va = NULL; + } + } + return 0; +} +#endif /* RTE_EAL_SINGLE_FILE_SEGMENTS */ + +/* + * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge + * page. + */ +static int +find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + int socket_id; + char *end, *nodestr; + unsigned i, hp_count = 0; + uint64_t virt_addr; + char buf[BUFSIZ]; + char hugedir_str[PATH_MAX]; + FILE *f; + + f = fopen("/proc/self/numa_maps", "r"); + if (f == NULL) { + RTE_LOG(NOTICE, EAL, "cannot open /proc/self/numa_maps," + " consider that all memory is in socket_id 0\n"); + return 0; + } + + snprintf(hugedir_str, sizeof(hugedir_str), + "%s/%s", hpi->hugedir, internal_config.hugefile_prefix); + + /* parse numa map */ + while (fgets(buf, sizeof(buf), f) != NULL) { + + /* ignore non huge page */ + if (strstr(buf, " huge ") == NULL && + strstr(buf, hugedir_str) == NULL) + continue; + + /* get zone addr */ + virt_addr = strtoull(buf, &end, 16); + if (virt_addr == 0 || end == buf) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* get node id (socket id) */ + nodestr = strstr(buf, " N"); + if (nodestr == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + nodestr += 2; + end = strstr(nodestr, "="); + if (end == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + end[0] = '\0'; + end = NULL; + + socket_id = strtoul(nodestr, &end, 0); + if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* if we find this page in our mappings, set socket_id */ + for (i = 0; i < hpi->num_pages[0]; i++) { + void *va = (void *)(unsigned long)virt_addr; + if (hugepg_tbl[i].orig_va == va) { + hugepg_tbl[i].socket_id = socket_id; + hp_count++; + } + } + } + + if (hp_count < hpi->num_pages[0]) + goto error; + + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +static int +cmp_physaddr(const void *a, const void *b) +{ +#ifndef RTE_ARCH_PPC_64 + const struct hugepage_file *p1 = (const struct hugepage_file *)a; + const struct hugepage_file *p2 = (const struct hugepage_file *)b; +#else + /* PowerPC needs memory sorted in reverse order from x86 */ + const struct hugepage_file *p1 = (const struct hugepage_file *)b; + const struct hugepage_file *p2 = (const struct hugepage_file *)a; +#endif + if (p1->physaddr < p2->physaddr) + return -1; + else if (p1->physaddr > p2->physaddr) + return 1; + else + return 0; +} + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + void *retval; + int fd = open(filename, O_CREAT | O_RDWR, 0666); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +/* + * this copies *active* hugepages from one hugepage table to another. + * destination is typically the shared memory. + */ +static int +copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, + const struct hugepage_file * src, int src_size) +{ + int src_pos, dst_pos = 0; + + for (src_pos = 0; src_pos < src_size; src_pos++) { + if (src[src_pos].final_va != NULL) { + /* error on overflow attempt */ + if (dst_pos == dest_size) + return -1; + memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file)); + dst_pos++; + } + } + return 0; +} + +static int +unlink_hugepage_files(struct hugepage_file *hugepg_tbl, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += + internal_config.hugepage_info[size].num_pages[socket]; + + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + if (hp->final_va != NULL && unlink(hp->filepath)) { + RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + } + } + return 0; +} + +/* + * unmaps hugepages that are not going to be used. since we originally allocate + * ALL hugepages (not just those we need), additional unmapping needs to be done. + */ +static int +unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, + struct hugepage_info *hpi, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += internal_config.hugepage_info[size].num_pages[socket]; + + for (size = 0; size < num_hp_info; size++) { + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + unsigned pages_found = 0; + + /* traverse until we have unmapped all the unused pages */ + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + /* if this page was already cleared */ + if (hp->final_va == NULL) + continue; +#endif + + /* find a page that matches the criteria */ + if ((hp->size == hpi[size].hugepage_sz) && + (hp->socket_id == (int) socket)) { + + /* if we skipped enough pages, unmap the rest */ + if (pages_found == hpi[size].num_pages[socket]) { + uint64_t unmap_len; + +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + unmap_len = hp->size * hp->repeated; +#else + unmap_len = hp->size; +#endif + + /* get start addr and len of the remaining segment */ + munmap(hp->final_va, (size_t) unmap_len); + + hp->final_va = NULL; + if (unlink(hp->filepath) == -1) { + RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + return -1; + } + } +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + /* else, check how much do we need to map */ + else { + int nr_pg_left = + hpi[size].num_pages[socket] - pages_found; + + /* if we need enough memory to fit into the segment */ + if (hp->repeated <= nr_pg_left) { + pages_found += hp->repeated; + } + /* truncate the segment */ + else { + uint64_t final_size = nr_pg_left * hp->size; + uint64_t seg_size = hp->repeated * hp->size; + + void * unmap_va = RTE_PTR_ADD(hp->final_va, + final_size); + int fd; + + munmap(unmap_va, seg_size - final_size); + + fd = open(hp->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + hp->filepath, strerror(errno)); + return -1; + } + if (ftruncate(fd, final_size) < 0) { + RTE_LOG(ERR, EAL, "Cannot truncate %s: %s\n", + hp->filepath, strerror(errno)); + return -1; + } + close(fd); + + pages_found += nr_pg_left; + hp->repeated = nr_pg_left; + } + } +#else + /* else, lock the page and skip */ + else + pages_found++; +#endif + + } /* match page */ + } /* foreach page */ + } /* foreach socket */ + } /* foreach pagesize */ + + return 0; +} + +static inline uint64_t +get_socket_mem_size(int socket) +{ + uint64_t size = 0; + unsigned i; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++){ + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (hpi->hugedir != NULL) + size += hpi->hugepage_sz * hpi->num_pages[socket]; + } + + return size; +} + +/* + * This function is a NUMA-aware equivalent of calc_num_pages. + * It takes in the list of hugepage sizes and the + * number of pages thereof, and calculates the best number of + * pages of each size to fulfill the request for <memory> ram + */ +static int +calc_num_pages_per_socket(uint64_t * memory, + struct hugepage_info *hp_info, + struct hugepage_info *hp_used, + unsigned num_hp_info) +{ + unsigned socket, j, i = 0; + unsigned requested, available; + int total_num_pages = 0; + uint64_t remaining_mem, cur_mem; + uint64_t total_mem = internal_config.memory; + + if (num_hp_info == 0) + return -1; + + /* if specific memory amounts per socket weren't requested */ + if (internal_config.force_sockets == 0) { + int cpu_per_socket[RTE_MAX_NUMA_NODES]; + size_t default_size, total_size; + unsigned lcore_id; + + /* Compute number of cores per socket */ + memset(cpu_per_socket, 0, sizeof(cpu_per_socket)); + RTE_LCORE_FOREACH(lcore_id) { + cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; + } + + /* + * Automatically spread requested memory amongst detected sockets according + * to number of cores from cpu mask present on each socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + + /* Set memory amount per socket */ + default_size = (internal_config.memory * cpu_per_socket[socket]) + / rte_lcore_count(); + + /* Limit to maximum available memory on socket */ + default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); + + /* Update sizes */ + memory[socket] = default_size; + total_size -= default_size; + } + + /* + * If some memory is remaining, try to allocate it by getting all + * available memory from sockets, one after the other + */ + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + /* take whatever is available */ + default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], + total_size); + + /* Update sizes */ + memory[socket] += default_size; + total_size -= default_size; + } + } + + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { + /* skips if the memory on specific socket wasn't requested */ + for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ + hp_used[i].hugedir = hp_info[i].hugedir; + hp_used[i].num_pages[socket] = RTE_MIN( + memory[socket] / hp_info[i].hugepage_sz, + hp_info[i].num_pages[socket]); + + cur_mem = hp_used[i].num_pages[socket] * + hp_used[i].hugepage_sz; + + memory[socket] -= cur_mem; + total_mem -= cur_mem; + + total_num_pages += hp_used[i].num_pages[socket]; + + /* check if we have met all memory requests */ + if (memory[socket] == 0) + break; + + /* check if we have any more pages left at this size, if so + * move on to next size */ + if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket]) + continue; + /* At this point we know that there are more pages available that are + * bigger than the memory we want, so lets see if we can get enough + * from other page sizes. + */ + remaining_mem = 0; + for (j = i+1; j < num_hp_info; j++) + remaining_mem += hp_info[j].hugepage_sz * + hp_info[j].num_pages[socket]; + + /* is there enough other memory, if not allocate another page and quit */ + if (remaining_mem < memory[socket]){ + cur_mem = RTE_MIN(memory[socket], + hp_info[i].hugepage_sz); + memory[socket] -= cur_mem; + total_mem -= cur_mem; + hp_used[i].num_pages[socket]++; + total_num_pages++; + break; /* we are done with this socket*/ + } + } + /* if we didn't satisfy all memory requirements per socket */ + if (memory[socket] > 0) { + /* to prevent icc errors */ + requested = (unsigned) (internal_config.socket_mem[socket] / + 0x100000); + available = requested - + ((unsigned) (memory[socket] / 0x100000)); + RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " + "Requested: %uMB, available: %uMB\n", socket, + requested, available); + return -1; + } + } + + /* if we didn't satisfy total memory requirements */ + if (total_mem > 0) { + requested = (unsigned) (internal_config.memory / 0x100000); + available = requested - (unsigned) (total_mem / 0x100000); + RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," + " available: %uMB\n", requested, available); + return -1; + } + return total_num_pages; +} + +/* + * Prepare physical memory mapping: fill configuration structure with + * these infos, return 0 on success. + * 1. map N huge pages in separate files in hugetlbfs + * 2. find associated physical addr + * 3. find associated NUMA socket ID + * 4. sort all huge pages by physical address + * 5. remap these N huge pages in the correct order + * 6. unmap the first mapping + * 7. fill memsegs in configuration with contiguous zones + */ +int +rte_eal_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + struct hugepage_file *hugepage, *tmp_hp = NULL; + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + + uint64_t memory[RTE_MAX_NUMA_NODES]; + + unsigned hp_offset; + int i, j, new_memseg; + int nr_hugefiles, nr_hugepages = 0; + void *addr; +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + int new_pages_count[MAX_HUGEPAGE_SIZES]; +#endif + + test_proc_pagemap_readable(); + + memset(used_hp, 0, sizeof(used_hp)); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } + mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr; + mcfg->memseg[0].addr = addr; + mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K; + mcfg->memseg[0].len = internal_config.memory; + mcfg->memseg[0].socket_id = 0; + return 0; + } + +/* check if app runs on Xen Dom0 */ + if (internal_config.xen_dom0_support) { +#ifdef RTE_LIBRTE_XEN_DOM0 + /* use dom0_mm kernel driver to init memory */ + if (rte_xen_dom0_memory_init() < 0) + return -1; + else + return 0; +#endif + } + + /* calculate total number of hugepages available. at this point we haven't + * yet started sorting them so they all are on socket 0 */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ + used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; + + nr_hugepages += internal_config.hugepage_info[i].num_pages[0]; + } + + /* + * allocate a memory area for hugepage table. + * this isn't shared memory yet. due to the fact that we need some + * processing done on these pages, shared memory will be created + * at a later stage. + */ + tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file)); + if (tmp_hp == NULL) + goto fail; + + memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file)); + + hp_offset = 0; /* where we start the current page size entries */ + + /* map all hugepages and sort them */ + for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ + struct hugepage_info *hpi; + + /* + * we don't yet mark hugepages as used at this stage, so + * we just map all hugepages available to the system + * all hugepages are still located on socket 0 + */ + hpi = &internal_config.hugepage_info[i]; + + if (hpi->num_pages[0] == 0) + continue; + + /* map all hugepages available */ + if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + /* find physical addresses and sockets for each hugepage */ + if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + qsort(&tmp_hp[hp_offset], hpi->num_pages[0], + sizeof(struct hugepage_file), cmp_physaddr); + +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + /* remap all hugepages into single file segments */ + new_pages_count[i] = remap_all_hugepages(&tmp_hp[hp_offset], hpi); + if (new_pages_count[i] < 0){ + RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + /* we have processed a num of hugepages of this size, so inc offset */ + hp_offset += new_pages_count[i]; +#else + /* remap all hugepages */ + if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + /* unmap original mappings */ + if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0) + goto fail; + + /* we have processed a num of hugepages of this size, so inc offset */ + hp_offset += hpi->num_pages[0]; +#endif + } + +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + nr_hugefiles = 0; + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + nr_hugefiles += new_pages_count[i]; + } +#else + nr_hugefiles = nr_hugepages; +#endif + + + /* clean out the numbers of pages */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + internal_config.hugepage_info[i].num_pages[j] = 0; + + /* get hugepages for each socket */ + for (i = 0; i < nr_hugefiles; i++) { + int socket = tmp_hp[i].socket_id; + + /* find a hugepage info with right size and increment num_pages */ + const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, + (int)internal_config.num_hugepage_sizes); + for (j = 0; j < nb_hpsizes; j++) { + if (tmp_hp[i].size == + internal_config.hugepage_info[j].hugepage_sz) { +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + internal_config.hugepage_info[j].num_pages[socket] += + tmp_hp[i].repeated; +#else + internal_config.hugepage_info[j].num_pages[socket]++; +#endif + } + } + } + + /* make a copy of socket_mem, needed for number of pages calculation */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* calculate final number of pages */ + nr_hugepages = calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes); + + /* error if not enough memory available */ + if (nr_hugepages < 0) + goto fail; + + /* reporting in! */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + if (used_hp[i].num_pages[j] > 0) { + RTE_LOG(DEBUG, EAL, + "Requesting %u pages of size %uMB" + " from socket %i\n", + used_hp[i].num_pages[j], + (unsigned) + (used_hp[i].hugepage_sz / 0x100000), + j); + } + } + } + + /* create shared memory */ + hugepage = create_shared_memory(eal_hugepage_info_path(), + nr_hugefiles * sizeof(struct hugepage_file)); + + if (hugepage == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + goto fail; + } + memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file)); + + /* + * unmap pages that we won't need (looks at used_hp). + * also, sets final_va to NULL on pages that were unmapped. + */ + if (unmap_unneeded_hugepages(tmp_hp, used_hp, + internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n"); + goto fail; + } + + /* + * copy stuff from malloc'd hugepage* to the actual shared memory. + * this procedure only copies those hugepages that have final_va + * not NULL. has overflow protection. + */ + if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, + tmp_hp, nr_hugefiles) < 0) { + RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n"); + goto fail; + } + + /* free the hugepage backing files */ + if (internal_config.hugepage_unlink && + unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); + goto fail; + } + + /* free the temporary hugepage table */ + free(tmp_hp); + tmp_hp = NULL; + + /* find earliest free memseg - this is needed because in case of IVSHMEM, + * segments might have already been initialized */ + for (j = 0; j < RTE_MAX_MEMSEG; j++) + if (mcfg->memseg[j].addr == NULL) { + /* move to previous segment and exit loop */ + j--; + break; + } + + for (i = 0; i < nr_hugefiles; i++) { + new_memseg = 0; + + /* if this is a new section, create a new memseg */ + if (i == 0) + new_memseg = 1; + else if (hugepage[i].socket_id != hugepage[i-1].socket_id) + new_memseg = 1; + else if (hugepage[i].size != hugepage[i-1].size) + new_memseg = 1; + +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start from higher + * virtual address to lower address. Here, both the physical + * address and virtual address are in descending order */ + else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) != + hugepage[i].size) + new_memseg = 1; + else if (((unsigned long)hugepage[i-1].final_va - + (unsigned long)hugepage[i].final_va) != hugepage[i].size) + new_memseg = 1; +#else + else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) != + hugepage[i].size) + new_memseg = 1; + else if (((unsigned long)hugepage[i].final_va - + (unsigned long)hugepage[i-1].final_va) != hugepage[i].size) + new_memseg = 1; +#endif + + if (new_memseg) { + j += 1; + if (j == RTE_MAX_MEMSEG) + break; + + mcfg->memseg[j].phys_addr = hugepage[i].physaddr; + mcfg->memseg[j].addr = hugepage[i].final_va; +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + mcfg->memseg[j].len = hugepage[i].size * hugepage[i].repeated; +#else + mcfg->memseg[j].len = hugepage[i].size; +#endif + mcfg->memseg[j].socket_id = hugepage[i].socket_id; + mcfg->memseg[j].hugepage_sz = hugepage[i].size; + } + /* continuation of previous memseg */ + else { +#ifdef RTE_ARCH_PPC_64 + /* Use the phy and virt address of the last page as segment + * address for IBM Power architecture */ + mcfg->memseg[j].phys_addr = hugepage[i].physaddr; + mcfg->memseg[j].addr = hugepage[i].final_va; +#endif + mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz; + } + hugepage[i].memseg_id = j; + } + + if (i < nr_hugefiles) { + RTE_LOG(ERR, EAL, "Can only reserve %d pages " + "from %d requested\n" + "Current %s=%d is not enough\n" + "Please either increase it or request less amount " + "of memory.\n", + i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG), + RTE_MAX_MEMSEG); + return -ENOMEM; + } + + return 0; + +fail: + free(tmp_hp); + return -1; +} + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +getFileSize(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* + * This creates the memory mappings in the secondary process to match that of + * the server process. It goes through each memory segment in the DPDK runtime + * configuration and finds the hugepages which form that segment, mapping them + * in order to form a contiguous block in the virtual memory space + */ +int +rte_eal_hugepage_attach(void) +{ + const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + const struct hugepage_file *hp = NULL; + unsigned num_hp = 0; + unsigned i, s = 0; /* s used to track the segment number */ + off_t size; + int fd, fd_zero = -1, fd_hugepage = -1; + + if (aslr_enabled() > 0) { + RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " + "(ASLR) is enabled in the kernel.\n"); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory " + "into secondary processes\n"); + } + + test_proc_pagemap_readable(); + + if (internal_config.xen_dom0_support) { +#ifdef RTE_LIBRTE_XEN_DOM0 + if (rte_xen_dom0_memory_attach() < 0) { + RTE_LOG(ERR, EAL,"Failed to attach memory setments of primay " + "process\n"); + return -1; + } + return 0; +#endif + } + + fd_zero = open("/dev/zero", O_RDONLY); + if (fd_zero < 0) { + RTE_LOG(ERR, EAL, "Could not open /dev/zero\n"); + goto error; + } + fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path()); + goto error; + } + + /* map all segments into memory to make sure we get the addrs */ + for (s = 0; s < RTE_MAX_MEMSEG; ++s) { + void *base_addr; + + /* + * the first memory segment with len==0 is the one that + * follows the last valid segment. + */ + if (mcfg->memseg[s].len == 0) + break; + +#ifdef RTE_LIBRTE_IVSHMEM + /* + * if segment has ioremap address set, it's an IVSHMEM segment and + * doesn't need mapping as it was already mapped earlier + */ + if (mcfg->memseg[s].ioremap_addr != 0) + continue; +#endif + + /* + * fdzero is mmapped to get a contiguous block of virtual + * addresses of the appropriate memseg size. + * use mmap to get identical addresses as the primary process. + */ + base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len, + PROT_READ, MAP_PRIVATE, fd_zero, 0); + if (base_addr == MAP_FAILED || + base_addr != mcfg->memseg[s].addr) { + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes " + "in /dev/zero to requested address [%p]: '%s'\n", + (unsigned long long)mcfg->memseg[s].len, + mcfg->memseg[s].addr, strerror(errno)); + if (aslr_enabled() > 0) { + RTE_LOG(ERR, EAL, "It is recommended to " + "disable ASLR in the kernel " + "and retry running both primary " + "and secondary processes\n"); + } + goto error; + } + } + + size = getFileSize(fd_hugepage); + hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); + if (hp == NULL) { + RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path()); + goto error; + } + + num_hp = size / sizeof(struct hugepage_file); + RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); + + s = 0; + while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){ + void *addr, *base_addr; + uintptr_t offset = 0; + size_t mapping_size; +#ifdef RTE_LIBRTE_IVSHMEM + /* + * if segment has ioremap address set, it's an IVSHMEM segment and + * doesn't need mapping as it was already mapped earlier + */ + if (mcfg->memseg[s].ioremap_addr != 0) { + s++; + continue; + } +#endif + /* + * free previously mapped memory so we can map the + * hugepages into the space + */ + base_addr = mcfg->memseg[s].addr; + munmap(base_addr, mcfg->memseg[s].len); + + /* find the hugepages for this segment and map them + * we don't need to worry about order, as the server sorted the + * entries before it did the second mmap of them */ + for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){ + if (hp[i].memseg_id == (int)s){ + fd = open(hp[i].filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + hp[i].filepath); + goto error; + } +#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS + mapping_size = hp[i].size * hp[i].repeated; +#else + mapping_size = hp[i].size; +#endif + addr = mmap(RTE_PTR_ADD(base_addr, offset), + mapping_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + close(fd); /* close file both on success and on failure */ + if (addr == MAP_FAILED || + addr != RTE_PTR_ADD(base_addr, offset)) { + RTE_LOG(ERR, EAL, "Could not mmap %s\n", + hp[i].filepath); + goto error; + } + offset+=mapping_size; + } + } + RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s, + (unsigned long long)mcfg->memseg[s].len); + s++; + } + /* unmap the hugepage config file, since we are done using it */ + munmap((void *)(uintptr_t)hp, size); + close(fd_zero); + close(fd_hugepage); + return 0; + +error: + if (fd_zero >= 0) + close(fd_zero); + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c new file mode 100644 index 00000000..dbf12a84 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c @@ -0,0 +1,762 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <dirent.h> + +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_eal_memconfig.h> +#include <rte_malloc.h> +#include <rte_devargs.h> +#include <rte_memcpy.h> + +#include "eal_filesystem.h" +#include "eal_private.h" +#include "eal_pci_init.h" + +/** + * @file + * PCI probing under linux + * + * This code is used to simulate a PCI probe by parsing information in sysfs. + * When a registered device matches a driver, it is then initialized with + * IGB_UIO driver (or doesn't initialize, if the device wasn't bound to it). + */ + +/* unbind kernel driver for this device */ +int +pci_unbind_kernel_driver(struct rte_pci_device *dev) +{ + int n; + FILE *f; + char filename[PATH_MAX]; + char buf[BUFSIZ]; + struct rte_pci_addr *loc = &dev->addr; + + /* open /sys/bus/pci/devices/AAAA:BB:CC.D/driver */ + snprintf(filename, sizeof(filename), + SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/driver/unbind", + loc->domain, loc->bus, loc->devid, loc->function); + + f = fopen(filename, "w"); + if (f == NULL) /* device was not bound */ + return 0; + + n = snprintf(buf, sizeof(buf), PCI_PRI_FMT "\n", + loc->domain, loc->bus, loc->devid, loc->function); + if ((n < 0) || (n >= (int)sizeof(buf))) { + RTE_LOG(ERR, EAL, "%s(): snprintf failed\n", __func__); + goto error; + } + if (fwrite(buf, n, 1, f) == 0) { + RTE_LOG(ERR, EAL, "%s(): could not write to %s\n", __func__, + filename); + goto error; + } + + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +static int +pci_get_kernel_driver_by_path(const char *filename, char *dri_name) +{ + int count; + char path[PATH_MAX]; + char *name; + + if (!filename || !dri_name) + return -1; + + count = readlink(filename, path, PATH_MAX); + if (count >= PATH_MAX) + return -1; + + /* For device does not have a driver */ + if (count < 0) + return 1; + + path[count] = '\0'; + + name = strrchr(path, '/'); + if (name) { + strncpy(dri_name, name + 1, strlen(name + 1) + 1); + return 0; + } + + return -1; +} + +/* Map pci device */ +int +rte_eal_pci_map_device(struct rte_pci_device *dev) +{ + int ret = -1; + + /* try mapping the NIC resources using VFIO if it exists */ + switch (dev->kdrv) { + case RTE_KDRV_VFIO: +#ifdef VFIO_PRESENT + if (pci_vfio_is_enabled()) + ret = pci_vfio_map_resource(dev); +#endif + break; + case RTE_KDRV_IGB_UIO: + case RTE_KDRV_UIO_GENERIC: + /* map resources for devices that use uio */ + ret = pci_uio_map_resource(dev); + break; + default: + RTE_LOG(DEBUG, EAL, + " Not managed by a supported kernel driver, skipped\n"); + ret = 1; + break; + } + + return ret; +} + +/* Unmap pci device */ +void +rte_eal_pci_unmap_device(struct rte_pci_device *dev) +{ + /* try unmapping the NIC resources using VFIO if it exists */ + switch (dev->kdrv) { + case RTE_KDRV_VFIO: + RTE_LOG(ERR, EAL, "Hotplug doesn't support vfio yet\n"); + break; + case RTE_KDRV_IGB_UIO: + case RTE_KDRV_UIO_GENERIC: + /* unmap resources for devices that use uio */ + pci_uio_unmap_resource(dev); + break; + default: + RTE_LOG(DEBUG, EAL, + " Not managed by a supported kernel driver, skipped\n"); + break; + } +} + +void * +pci_find_max_end_va(void) +{ + const struct rte_memseg *seg = rte_eal_get_physmem_layout(); + const struct rte_memseg *last = seg; + unsigned i = 0; + + for (i = 0; i < RTE_MAX_MEMSEG; i++, seg++) { + if (seg->addr == NULL) + break; + + if (seg->addr > last->addr) + last = seg; + + } + return RTE_PTR_ADD(last->addr, last->len); +} + +/* parse the "resource" sysfs file */ +static int +pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev) +{ + FILE *f; + char buf[BUFSIZ]; + union pci_resource_info { + struct { + char *phys_addr; + char *end_addr; + char *flags; + }; + char *ptrs[PCI_RESOURCE_FMT_NVAL]; + } res_info; + int i; + uint64_t phys_addr, end_addr, flags; + + f = fopen(filename, "r"); + if (f == NULL) { + RTE_LOG(ERR, EAL, "Cannot open sysfs resource\n"); + return -1; + } + + for (i = 0; i<PCI_MAX_RESOURCE; i++) { + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, + "%s(): cannot read resource\n", __func__); + goto error; + } + + if (rte_strsplit(buf, sizeof(buf), res_info.ptrs, 3, ' ') != 3) { + RTE_LOG(ERR, EAL, + "%s(): bad resource format\n", __func__); + goto error; + } + errno = 0; + phys_addr = strtoull(res_info.phys_addr, NULL, 16); + end_addr = strtoull(res_info.end_addr, NULL, 16); + flags = strtoull(res_info.flags, NULL, 16); + if (errno != 0) { + RTE_LOG(ERR, EAL, + "%s(): bad resource format\n", __func__); + goto error; + } + + if (flags & IORESOURCE_MEM) { + dev->mem_resource[i].phys_addr = phys_addr; + dev->mem_resource[i].len = end_addr - phys_addr + 1; + /* not mapped for now */ + dev->mem_resource[i].addr = NULL; + } + } + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +/* Scan one pci sysfs entry, and fill the devices list from it. */ +static int +pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus, + uint8_t devid, uint8_t function) +{ + char filename[PATH_MAX]; + unsigned long tmp; + struct rte_pci_device *dev; + char driver[PATH_MAX]; + int ret; + + dev = malloc(sizeof(*dev)); + if (dev == NULL) + return -1; + + memset(dev, 0, sizeof(*dev)); + dev->addr.domain = domain; + dev->addr.bus = bus; + dev->addr.devid = devid; + dev->addr.function = function; + + /* get vendor id */ + snprintf(filename, sizeof(filename), "%s/vendor", dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->id.vendor_id = (uint16_t)tmp; + + /* get device id */ + snprintf(filename, sizeof(filename), "%s/device", dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->id.device_id = (uint16_t)tmp; + + /* get subsystem_vendor id */ + snprintf(filename, sizeof(filename), "%s/subsystem_vendor", + dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->id.subsystem_vendor_id = (uint16_t)tmp; + + /* get subsystem_device id */ + snprintf(filename, sizeof(filename), "%s/subsystem_device", + dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->id.subsystem_device_id = (uint16_t)tmp; + + /* get max_vfs */ + dev->max_vfs = 0; + snprintf(filename, sizeof(filename), "%s/max_vfs", dirname); + if (!access(filename, F_OK) && + eal_parse_sysfs_value(filename, &tmp) == 0) + dev->max_vfs = (uint16_t)tmp; + else { + /* for non igb_uio driver, need kernel version >= 3.8 */ + snprintf(filename, sizeof(filename), + "%s/sriov_numvfs", dirname); + if (!access(filename, F_OK) && + eal_parse_sysfs_value(filename, &tmp) == 0) + dev->max_vfs = (uint16_t)tmp; + } + + /* get numa node */ + snprintf(filename, sizeof(filename), "%s/numa_node", + dirname); + if (access(filename, R_OK) != 0) { + /* if no NUMA support, set default to 0 */ + dev->numa_node = 0; + } else { + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->numa_node = tmp; + } + + /* parse resources */ + snprintf(filename, sizeof(filename), "%s/resource", dirname); + if (pci_parse_sysfs_resource(filename, dev) < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot parse resource\n", __func__); + free(dev); + return -1; + } + + /* parse driver */ + snprintf(filename, sizeof(filename), "%s/driver", dirname); + ret = pci_get_kernel_driver_by_path(filename, driver); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to get kernel driver\n"); + free(dev); + return -1; + } + + if (!ret) { + if (!strcmp(driver, "vfio-pci")) + dev->kdrv = RTE_KDRV_VFIO; + else if (!strcmp(driver, "igb_uio")) + dev->kdrv = RTE_KDRV_IGB_UIO; + else if (!strcmp(driver, "uio_pci_generic")) + dev->kdrv = RTE_KDRV_UIO_GENERIC; + else + dev->kdrv = RTE_KDRV_UNKNOWN; + } else + dev->kdrv = RTE_KDRV_NONE; + + /* device is valid, add in list (sorted) */ + if (TAILQ_EMPTY(&pci_device_list)) { + TAILQ_INSERT_TAIL(&pci_device_list, dev, next); + } else { + struct rte_pci_device *dev2; + int ret; + + TAILQ_FOREACH(dev2, &pci_device_list, next) { + ret = rte_eal_compare_pci_addr(&dev->addr, &dev2->addr); + if (ret > 0) + continue; + + if (ret < 0) { + TAILQ_INSERT_BEFORE(dev2, dev, next); + } else { /* already registered */ + dev2->kdrv = dev->kdrv; + dev2->max_vfs = dev->max_vfs; + memmove(dev2->mem_resource, dev->mem_resource, + sizeof(dev->mem_resource)); + free(dev); + } + return 0; + } + TAILQ_INSERT_TAIL(&pci_device_list, dev, next); + } + + return 0; +} + +/* + * split up a pci address into its constituent parts. + */ +static int +parse_pci_addr_format(const char *buf, int bufsize, uint16_t *domain, + uint8_t *bus, uint8_t *devid, uint8_t *function) +{ + /* first split on ':' */ + union splitaddr { + struct { + char *domain; + char *bus; + char *devid; + char *function; + }; + char *str[PCI_FMT_NVAL]; /* last element-separator is "." not ":" */ + } splitaddr; + + char *buf_copy = strndup(buf, bufsize); + if (buf_copy == NULL) + return -1; + + if (rte_strsplit(buf_copy, bufsize, splitaddr.str, PCI_FMT_NVAL, ':') + != PCI_FMT_NVAL - 1) + goto error; + /* final split is on '.' between devid and function */ + splitaddr.function = strchr(splitaddr.devid,'.'); + if (splitaddr.function == NULL) + goto error; + *splitaddr.function++ = '\0'; + + /* now convert to int values */ + errno = 0; + *domain = (uint16_t)strtoul(splitaddr.domain, NULL, 16); + *bus = (uint8_t)strtoul(splitaddr.bus, NULL, 16); + *devid = (uint8_t)strtoul(splitaddr.devid, NULL, 16); + *function = (uint8_t)strtoul(splitaddr.function, NULL, 10); + if (errno != 0) + goto error; + + free(buf_copy); /* free the copy made with strdup */ + return 0; +error: + free(buf_copy); + return -1; +} + +/* + * Scan the content of the PCI bus, and the devices in the devices + * list + */ +int +rte_eal_pci_scan(void) +{ + struct dirent *e; + DIR *dir; + char dirname[PATH_MAX]; + uint16_t domain; + uint8_t bus, devid, function; + + dir = opendir(SYSFS_PCI_DEVICES); + if (dir == NULL) { + RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n", + __func__, strerror(errno)); + return -1; + } + + while ((e = readdir(dir)) != NULL) { + if (e->d_name[0] == '.') + continue; + + if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &domain, + &bus, &devid, &function) != 0) + continue; + + snprintf(dirname, sizeof(dirname), "%s/%s", SYSFS_PCI_DEVICES, + e->d_name); + if (pci_scan_one(dirname, domain, bus, devid, function) < 0) + goto error; + } + closedir(dir); + return 0; + +error: + closedir(dir); + return -1; +} + +#ifdef RTE_PCI_CONFIG +/* + * It is deprecated, all its configurations have been moved into + * each PMD respectively. + */ +void +pci_config_space_set(__rte_unused struct rte_pci_device *dev) +{ + RTE_LOG(DEBUG, EAL, "Nothing here, as it is deprecated\n"); +} +#endif + +/* Read PCI config space. */ +int rte_eal_pci_read_config(const struct rte_pci_device *device, + void *buf, size_t len, off_t offset) +{ + const struct rte_intr_handle *intr_handle = &device->intr_handle; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + return pci_uio_read_config(intr_handle, buf, len, offset); + +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + return pci_vfio_read_config(intr_handle, buf, len, offset); +#endif + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } +} + +/* Write PCI config space. */ +int rte_eal_pci_write_config(const struct rte_pci_device *device, + const void *buf, size_t len, off_t offset) +{ + const struct rte_intr_handle *intr_handle = &device->intr_handle; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + return pci_uio_write_config(intr_handle, buf, len, offset); + +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + return pci_vfio_write_config(intr_handle, buf, len, offset); +#endif + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } +} + +#if defined(RTE_ARCH_X86) +static int +pci_ioport_map(struct rte_pci_device *dev, int bar __rte_unused, + struct rte_pci_ioport *p) +{ + uint16_t start, end; + FILE *fp; + char *line = NULL; + char pci_id[16]; + int found = 0; + size_t linesz; + + snprintf(pci_id, sizeof(pci_id), PCI_PRI_FMT, + dev->addr.domain, dev->addr.bus, + dev->addr.devid, dev->addr.function); + + fp = fopen("/proc/ioports", "r"); + if (fp == NULL) { + RTE_LOG(ERR, EAL, "%s(): can't open ioports\n", __func__); + return -1; + } + + while (getdelim(&line, &linesz, '\n', fp) > 0) { + char *ptr = line; + char *left; + int n; + + n = strcspn(ptr, ":"); + ptr[n] = 0; + left = &ptr[n + 1]; + + while (*left && isspace(*left)) + left++; + + if (!strncmp(left, pci_id, strlen(pci_id))) { + found = 1; + + while (*ptr && isspace(*ptr)) + ptr++; + + sscanf(ptr, "%04hx-%04hx", &start, &end); + + break; + } + } + + free(line); + fclose(fp); + + if (!found) + return -1; + + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + p->base = start; + RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%x\n", start); + + return 0; +} +#endif + +int +rte_eal_pci_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ + int ret = -1; + + switch (dev->kdrv) { +#ifdef VFIO_PRESENT + case RTE_KDRV_VFIO: + if (pci_vfio_is_enabled()) + ret = pci_vfio_ioport_map(dev, bar, p); + break; +#endif + case RTE_KDRV_IGB_UIO: + ret = pci_uio_ioport_map(dev, bar, p); + break; + case RTE_KDRV_UIO_GENERIC: +#if defined(RTE_ARCH_X86) + ret = pci_ioport_map(dev, bar, p); +#else + ret = pci_uio_ioport_map(dev, bar, p); +#endif + break; + case RTE_KDRV_NONE: +#if defined(RTE_ARCH_X86) + ret = pci_ioport_map(dev, bar, p); +#endif + break; + default: + break; + } + + if (!ret) + p->dev = dev; + + return ret; +} + +void +rte_eal_pci_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset) +{ + switch (p->dev->kdrv) { +#ifdef VFIO_PRESENT + case RTE_KDRV_VFIO: + pci_vfio_ioport_read(p, data, len, offset); + break; +#endif + case RTE_KDRV_IGB_UIO: + pci_uio_ioport_read(p, data, len, offset); + break; + case RTE_KDRV_UIO_GENERIC: + pci_uio_ioport_read(p, data, len, offset); + break; + case RTE_KDRV_NONE: +#if defined(RTE_ARCH_X86) + pci_uio_ioport_read(p, data, len, offset); +#endif + break; + default: + break; + } +} + +void +rte_eal_pci_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset) +{ + switch (p->dev->kdrv) { +#ifdef VFIO_PRESENT + case RTE_KDRV_VFIO: + pci_vfio_ioport_write(p, data, len, offset); + break; +#endif + case RTE_KDRV_IGB_UIO: + pci_uio_ioport_write(p, data, len, offset); + break; + case RTE_KDRV_UIO_GENERIC: + pci_uio_ioport_write(p, data, len, offset); + break; + case RTE_KDRV_NONE: +#if defined(RTE_ARCH_X86) + pci_uio_ioport_write(p, data, len, offset); +#endif + break; + default: + break; + } +} + +int +rte_eal_pci_ioport_unmap(struct rte_pci_ioport *p) +{ + int ret = -1; + + switch (p->dev->kdrv) { +#ifdef VFIO_PRESENT + case RTE_KDRV_VFIO: + if (pci_vfio_is_enabled()) + ret = pci_vfio_ioport_unmap(p); + break; +#endif + case RTE_KDRV_IGB_UIO: + ret = pci_uio_ioport_unmap(p); + break; + case RTE_KDRV_UIO_GENERIC: +#if defined(RTE_ARCH_X86) + ret = 0; +#else + ret = pci_uio_ioport_unmap(p); +#endif + break; + case RTE_KDRV_NONE: +#if defined(RTE_ARCH_X86) + ret = 0; +#endif + break; + default: + break; + } + + return ret; +} + +/* Init the PCI EAL subsystem */ +int +rte_eal_pci_init(void) +{ + TAILQ_INIT(&pci_driver_list); + TAILQ_INIT(&pci_device_list); + + /* for debug purposes, PCI can be disabled */ + if (internal_config.no_pci) + return 0; + + if (rte_eal_pci_scan() < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot scan PCI bus\n", __func__); + return -1; + } +#ifdef VFIO_PRESENT + pci_vfio_enable(); + + if (pci_vfio_is_enabled()) { + + /* if we are primary process, create a thread to communicate with + * secondary processes. the thread will use a socket to wait for + * requests from secondary process to send open file descriptors, + * because VFIO does not allow multiple open descriptors on a group or + * VFIO container. + */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + pci_vfio_mp_sync_setup() < 0) + return -1; + } +#endif + return 0; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h new file mode 100644 index 00000000..7011753d --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h @@ -0,0 +1,127 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef EAL_PCI_INIT_H_ +#define EAL_PCI_INIT_H_ + +#include "eal_vfio.h" + +/* + * Helper function to map PCI resources right after hugepages in virtual memory + */ +extern void *pci_map_addr; +void *pci_find_max_end_va(void); + +int pci_uio_alloc_resource(struct rte_pci_device *dev, + struct mapped_pci_resource **uio_res); +void pci_uio_free_resource(struct rte_pci_device *dev, + struct mapped_pci_resource *uio_res); +int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx, + struct mapped_pci_resource *uio_res, int map_idx); + +int pci_uio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offs); +int pci_uio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offs); + +int pci_uio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p); +void pci_uio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset); +void pci_uio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset); +int pci_uio_ioport_unmap(struct rte_pci_ioport *p); + +#ifdef VFIO_PRESENT + +#define VFIO_MAX_GROUPS 64 + +int pci_vfio_enable(void); +int pci_vfio_is_enabled(void); +int pci_vfio_mp_sync_setup(void); + +/* access config space */ +int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offs); +int pci_vfio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offs); + +int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p); +void pci_vfio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset); +void pci_vfio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset); +int pci_vfio_ioport_unmap(struct rte_pci_ioport *p); + +/* map VFIO resource prototype */ +int pci_vfio_map_resource(struct rte_pci_device *dev); +int pci_vfio_get_group_fd(int iommu_group_fd); +int pci_vfio_get_container_fd(void); + +/* + * Function prototypes for VFIO multiprocess sync functions + */ +int vfio_mp_sync_send_request(int socket, int req); +int vfio_mp_sync_receive_request(int socket); +int vfio_mp_sync_send_fd(int socket, int fd); +int vfio_mp_sync_receive_fd(int socket); +int vfio_mp_sync_connect_to_primary(void); + +/* socket comm protocol definitions */ +#define SOCKET_REQ_CONTAINER 0x100 +#define SOCKET_REQ_GROUP 0x200 +#define SOCKET_OK 0x0 +#define SOCKET_NO_FD 0x1 +#define SOCKET_ERR 0xFF + +/* + * we don't need to store device fd's anywhere since they can be obtained from + * the group fd via an ioctl() call. + */ +struct vfio_group { + int group_no; + int fd; +}; + +struct vfio_config { + int vfio_enabled; + int vfio_container_fd; + int vfio_container_has_dma; + int vfio_group_idx; + struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; +}; + +#endif + +#endif /* EAL_PCI_INIT_H_ */ diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c new file mode 100644 index 00000000..068694dc --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c @@ -0,0 +1,491 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <dirent.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <linux/pci_regs.h> + +#if defined(RTE_ARCH_X86) +#include <sys/io.h> +#endif + +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_eal_memconfig.h> +#include <rte_common.h> +#include <rte_malloc.h> + +#include "eal_filesystem.h" +#include "eal_pci_init.h" + +void *pci_map_addr = NULL; + +#define OFF_MAX ((uint64_t)(off_t)-1) + +int +pci_uio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offset) +{ + return pread(intr_handle->uio_cfg_fd, buf, len, offset); +} + +int +pci_uio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offset) +{ + return pwrite(intr_handle->uio_cfg_fd, buf, len, offset); +} + +static int +pci_uio_set_bus_master(int dev_fd) +{ + uint16_t reg; + int ret; + + ret = pread(dev_fd, ®, sizeof(reg), PCI_COMMAND); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, + "Cannot read command from PCI config space!\n"); + return -1; + } + + /* return if bus mastering is already on */ + if (reg & PCI_COMMAND_MASTER) + return 0; + + reg |= PCI_COMMAND_MASTER; + + ret = pwrite(dev_fd, ®, sizeof(reg), PCI_COMMAND); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, + "Cannot write command to PCI config space!\n"); + return -1; + } + + return 0; +} + +static int +pci_mknod_uio_dev(const char *sysfs_uio_path, unsigned uio_num) +{ + FILE *f; + char filename[PATH_MAX]; + int ret; + unsigned major, minor; + dev_t dev; + + /* get the name of the sysfs file that contains the major and minor + * of the uio device and read its content */ + snprintf(filename, sizeof(filename), "%s/dev", sysfs_uio_path); + + f = fopen(filename, "r"); + if (f == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n", + __func__); + return -1; + } + + ret = fscanf(f, "%u:%u", &major, &minor); + if (ret != 2) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n", + __func__); + fclose(f); + return -1; + } + fclose(f); + + /* create the char device "mknod /dev/uioX c major minor" */ + snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num); + dev = makedev(major, minor); + ret = mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, dev); + if (f == NULL) { + RTE_LOG(ERR, EAL, "%s(): mknod() failed %s\n", + __func__, strerror(errno)); + return -1; + } + + return ret; +} + +/* + * Return the uioX char device used for a pci device. On success, return + * the UIO number and fill dstbuf string with the path of the device in + * sysfs. On error, return a negative value. In this case dstbuf is + * invalid. + */ +static int +pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf, + unsigned int buflen, int create) +{ + struct rte_pci_addr *loc = &dev->addr; + unsigned int uio_num; + struct dirent *e; + DIR *dir; + char dirname[PATH_MAX]; + + /* depending on kernel version, uio can be located in uio/uioX + * or uio:uioX */ + + snprintf(dirname, sizeof(dirname), + SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/uio", + loc->domain, loc->bus, loc->devid, loc->function); + + dir = opendir(dirname); + if (dir == NULL) { + /* retry with the parent directory */ + snprintf(dirname, sizeof(dirname), + SYSFS_PCI_DEVICES "/" PCI_PRI_FMT, + loc->domain, loc->bus, loc->devid, loc->function); + dir = opendir(dirname); + + if (dir == NULL) { + RTE_LOG(ERR, EAL, "Cannot opendir %s\n", dirname); + return -1; + } + } + + /* take the first file starting with "uio" */ + while ((e = readdir(dir)) != NULL) { + /* format could be uio%d ...*/ + int shortprefix_len = sizeof("uio") - 1; + /* ... or uio:uio%d */ + int longprefix_len = sizeof("uio:uio") - 1; + char *endptr; + + if (strncmp(e->d_name, "uio", 3) != 0) + continue; + + /* first try uio%d */ + errno = 0; + uio_num = strtoull(e->d_name + shortprefix_len, &endptr, 10); + if (errno == 0 && endptr != (e->d_name + shortprefix_len)) { + snprintf(dstbuf, buflen, "%s/uio%u", dirname, uio_num); + break; + } + + /* then try uio:uio%d */ + errno = 0; + uio_num = strtoull(e->d_name + longprefix_len, &endptr, 10); + if (errno == 0 && endptr != (e->d_name + longprefix_len)) { + snprintf(dstbuf, buflen, "%s/uio:uio%u", dirname, uio_num); + break; + } + } + closedir(dir); + + /* No uio resource found */ + if (e == NULL) + return -1; + + /* create uio device if we've been asked to */ + if (internal_config.create_uio_dev && create && + pci_mknod_uio_dev(dstbuf, uio_num) < 0) + RTE_LOG(WARNING, EAL, "Cannot create /dev/uio%u\n", uio_num); + + return uio_num; +} + +void +pci_uio_free_resource(struct rte_pci_device *dev, + struct mapped_pci_resource *uio_res) +{ + rte_free(uio_res); + + if (dev->intr_handle.uio_cfg_fd >= 0) { + close(dev->intr_handle.uio_cfg_fd); + dev->intr_handle.uio_cfg_fd = -1; + } + if (dev->intr_handle.fd) { + close(dev->intr_handle.fd); + dev->intr_handle.fd = -1; + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + } +} + +int +pci_uio_alloc_resource(struct rte_pci_device *dev, + struct mapped_pci_resource **uio_res) +{ + char dirname[PATH_MAX]; + char cfgname[PATH_MAX]; + char devname[PATH_MAX]; /* contains the /dev/uioX */ + int uio_num; + struct rte_pci_addr *loc; + + loc = &dev->addr; + + /* find uio resource */ + uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1); + if (uio_num < 0) { + RTE_LOG(WARNING, EAL, " "PCI_PRI_FMT" not managed by UIO driver, " + "skipping\n", loc->domain, loc->bus, loc->devid, loc->function); + return 1; + } + snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num); + + /* save fd if in primary process */ + dev->intr_handle.fd = open(devname, O_RDWR); + if (dev->intr_handle.fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + devname, strerror(errno)); + goto error; + } + + snprintf(cfgname, sizeof(cfgname), + "/sys/class/uio/uio%u/device/config", uio_num); + dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR); + if (dev->intr_handle.uio_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + cfgname, strerror(errno)); + goto error; + } + + if (dev->kdrv == RTE_KDRV_IGB_UIO) + dev->intr_handle.type = RTE_INTR_HANDLE_UIO; + else { + dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX; + + /* set bus master that is not done by uio_pci_generic */ + if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) { + RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); + goto error; + } + } + + /* allocate the mapping details for secondary processes*/ + *uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0); + if (*uio_res == NULL) { + RTE_LOG(ERR, EAL, + "%s(): cannot store uio mmap details\n", __func__); + goto error; + } + + snprintf((*uio_res)->path, sizeof((*uio_res)->path), "%s", devname); + memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr)); + + return 0; + +error: + pci_uio_free_resource(dev, *uio_res); + return -1; +} + +int +pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx, + struct mapped_pci_resource *uio_res, int map_idx) +{ + int fd; + char devname[PATH_MAX]; /* contains the /dev/uioX */ + void *mapaddr; + struct rte_pci_addr *loc; + struct pci_map *maps; + + loc = &dev->addr; + maps = uio_res->maps; + + /* update devname for mmap */ + snprintf(devname, sizeof(devname), + SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource%d", + loc->domain, loc->bus, loc->devid, + loc->function, res_idx); + + /* allocate memory to keep path */ + maps[map_idx].path = rte_malloc(NULL, strlen(devname) + 1, 0); + if (maps[map_idx].path == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for path: %s\n", + strerror(errno)); + return -1; + } + + /* + * open resource file, to mmap it + */ + fd = open(devname, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + devname, strerror(errno)); + goto error; + } + + /* try mapping somewhere close to the end of hugepages */ + if (pci_map_addr == NULL) + pci_map_addr = pci_find_max_end_va(); + + mapaddr = pci_map_resource(pci_map_addr, fd, 0, + (size_t)dev->mem_resource[res_idx].len, 0); + close(fd); + if (mapaddr == MAP_FAILED) + goto error; + + pci_map_addr = RTE_PTR_ADD(mapaddr, + (size_t)dev->mem_resource[res_idx].len); + + maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr; + maps[map_idx].size = dev->mem_resource[res_idx].len; + maps[map_idx].addr = mapaddr; + maps[map_idx].offset = 0; + strcpy(maps[map_idx].path, devname); + dev->mem_resource[res_idx].addr = mapaddr; + + return 0; + +error: + rte_free(maps[map_idx].path); + return -1; +} + +int +pci_uio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ +#if defined(RTE_ARCH_X86) + char dirname[PATH_MAX]; + char filename[PATH_MAX]; + int uio_num; + unsigned long start; + + uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 0); + if (uio_num < 0) + return -1; + + /* get portio start */ + snprintf(filename, sizeof(filename), + "%s/portio/port%d/start", dirname, bar); + if (eal_parse_sysfs_value(filename, &start) < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot parse portio start\n", + __func__); + return -1; + } + /* ensure we don't get anything funny here, read/write will cast to + * uin16_t */ + if (start > UINT16_MAX) + return -1; + + /* FIXME only for primary process ? */ + if (dev->intr_handle.type == RTE_INTR_HANDLE_UNKNOWN) { + + snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num); + dev->intr_handle.fd = open(filename, O_RDWR); + if (dev->intr_handle.fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + filename, strerror(errno)); + return -1; + } + dev->intr_handle.type = RTE_INTR_HANDLE_UIO; + } + + RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%lx\n", start); + + p->base = start; + return 0; +#else + RTE_SET_USED(dev); + RTE_SET_USED(bar); + RTE_SET_USED(p); + return -1; +#endif +} + +void +pci_uio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset) +{ +#if defined(RTE_ARCH_X86) + uint8_t *d; + int size; + unsigned short reg = p->base + offset; + + for (d = data; len > 0; d += size, reg += size, len -= size) { + if (len >= 4) { + size = 4; + *(uint32_t *)d = inl(reg); + } else if (len >= 2) { + size = 2; + *(uint16_t *)d = inw(reg); + } else { + size = 1; + *d = inb(reg); + } + } +#else + RTE_SET_USED(p); + RTE_SET_USED(data); + RTE_SET_USED(len); + RTE_SET_USED(offset); +#endif +} + +void +pci_uio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset) +{ +#if defined(RTE_ARCH_X86) + const uint8_t *s; + int size; + unsigned short reg = p->base + offset; + + for (s = data; len > 0; s += size, reg += size, len -= size) { + if (len >= 4) { + size = 4; + outl_p(*(const uint32_t *)s, reg); + } else if (len >= 2) { + size = 2; + outw_p(*(const uint16_t *)s, reg); + } else { + size = 1; + outb_p(*s, reg); + } + } +#else + RTE_SET_USED(p); + RTE_SET_USED(data); + RTE_SET_USED(len); + RTE_SET_USED(offset); +#endif +} + +int +pci_uio_ioport_unmap(struct rte_pci_ioport *p) +{ + RTE_SET_USED(p); +#if defined(RTE_ARCH_X86) + /* FIXME close intr fd ? */ + return 0; +#else + return -1; +#endif +} diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c new file mode 100644 index 00000000..10266f8f --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -0,0 +1,1097 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <fcntl.h> +#include <linux/pci_regs.h> +#include <sys/eventfd.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_eal_memconfig.h> +#include <rte_malloc.h> +#include <eal_private.h> + +#include "eal_filesystem.h" +#include "eal_pci_init.h" +#include "eal_vfio.h" + +/** + * @file + * PCI probing under linux (VFIO version) + * + * This code tries to determine if the PCI device is bound to VFIO driver, + * and initialize it (map BARs, set up interrupts) if that's the case. + * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". + */ + +#ifdef VFIO_PRESENT + +#define PAGE_SIZE (sysconf(_SC_PAGESIZE)) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +static struct rte_tailq_elem rte_vfio_tailq = { + .name = "VFIO_RESOURCE_LIST", +}; +EAL_REGISTER_TAILQ(rte_vfio_tailq) + +#define VFIO_DIR "/dev/vfio" +#define VFIO_CONTAINER_PATH "/dev/vfio/vfio" +#define VFIO_GROUP_FMT "/dev/vfio/%u" +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" +#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) +#define VFIO_GET_REGION_IDX(x) (x >> 40) + +/* per-process VFIO config */ +static struct vfio_config vfio_cfg; + +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. + * */ +typedef int (*vfio_dma_func_t)(int); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_func_t dma_map_func; +}; + +static int vfio_type1_dma_map(int); +static int vfio_noiommu_dma_map(int); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map}, + /* IOMMU-less mode */ + { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map}, +}; + +int +vfio_type1_dma_map(int vfio_container_fd) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + int i, ret; + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + struct vfio_iommu_type1_dma_map dma_map; + + if (ms[i].addr == NULL) + break; + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = ms[i].addr_64; + dma_map.size = ms[i].len; + dma_map.iova = ms[i].phys_addr; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +int +vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +int +pci_vfio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offs) +{ + return pread64(intr_handle->vfio_dev_fd, buf, len, + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); +} + +int +pci_vfio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offs) +{ + return pwrite64(intr_handle->vfio_dev_fd, buf, len, + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); +} + +/* get PCI BAR number where MSI-X interrupts are */ +static int +pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, + uint32_t *msix_table_size) +{ + int ret; + uint32_t reg; + uint16_t flags; + uint8_t cap_id, cap_offset; + + /* read PCI capability pointer from config space */ + ret = pread64(fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_CAPABILITY_LIST); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " + "config space!\n"); + return -1; + } + + /* we need first byte */ + cap_offset = reg & 0xFF; + + while (cap_offset) { + + /* read PCI capability ID */ + ret = pread64(fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI " + "config space!\n"); + return -1; + } + + /* we need first byte */ + cap_id = reg & 0xFF; + + /* if we haven't reached MSI-X, check next capability */ + if (cap_id != PCI_CAP_ID_MSIX) { + ret = pread64(fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " + "config space!\n"); + return -1; + } + + /* we need second byte */ + cap_offset = (reg & 0xFF00) >> 8; + + continue; + } + /* else, read table offset */ + else { + /* table offset resides in the next 4 bytes */ + ret = pread64(fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset + 4); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config " + "space!\n"); + return -1; + } + + ret = pread64(fd, &flags, sizeof(flags), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset + 2); + if (ret != sizeof(flags)) { + RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config " + "space!\n"); + return -1; + } + + *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR; + *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; + *msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); + + return 0; + } + } + return 0; +} + +/* set PCI bus mastering */ +static int +pci_vfio_set_bus_master(int dev_fd) +{ + uint16_t reg; + int ret; + + ret = pread64(dev_fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_COMMAND); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n"); + return -1; + } + + /* set the master bit */ + reg |= PCI_COMMAND_MASTER; + + ret = pwrite64(dev_fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_COMMAND); + + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n"); + return -1; + } + + return 0; +} + +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ +static const struct vfio_iommu_type * +pci_vfio_set_iommu_type(int vfio_container_fd) { + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; + } + /* not an error, there may be more supported IOMMU types */ + RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " + "error %i (%s)\n", t->type_id, t->name, errno, + strerror(errno)); + } + /* if we didn't find a suitable IOMMU type, fail */ + return NULL; +} + +/* check if we have any supported extensions */ +static int +pci_vfio_has_supported_extensions(int vfio_container_fd) { + int ret; + unsigned idx, n_extensions = 0; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, + t->type_id); + if (ret < 0) { + RTE_LOG(ERR, EAL, " could not get IOMMU type, " + "error %i (%s)\n", errno, + strerror(errno)); + close(vfio_container_fd); + return -1; + } else if (ret == 1) { + /* we found a supported extension */ + n_extensions++; + } + RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", + t->type_id, t->name, + ret ? "supported" : "not supported"); + } + + /* if we didn't find any supported IOMMU types, fail */ + if (!n_extensions) { + close(vfio_container_fd); + return -1; + } + + return 0; +} + +/* set up interrupt support (but not enable interrupts) */ +static int +pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) +{ + int i, ret, intr_idx; + + /* default to invalid index */ + intr_idx = VFIO_PCI_NUM_IRQS; + + /* get interrupt type from internal config (MSI-X by default, can be + * overriden from the command line + */ + switch (internal_config.vfio_intr_mode) { + case RTE_INTR_MODE_MSIX: + intr_idx = VFIO_PCI_MSIX_IRQ_INDEX; + break; + case RTE_INTR_MODE_MSI: + intr_idx = VFIO_PCI_MSI_IRQ_INDEX; + break; + case RTE_INTR_MODE_LEGACY: + intr_idx = VFIO_PCI_INTX_IRQ_INDEX; + break; + /* don't do anything if we want to automatically determine interrupt type */ + case RTE_INTR_MODE_NONE: + break; + default: + RTE_LOG(ERR, EAL, " unknown default interrupt type!\n"); + return -1; + } + + /* start from MSI-X interrupt type */ + for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) { + struct vfio_irq_info irq = { .argsz = sizeof(irq) }; + int fd = -1; + + /* skip interrupt modes we don't want */ + if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE && + i != intr_idx) + continue; + + irq.index = i; + + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); + if (ret < 0) { + RTE_LOG(ERR, EAL, " cannot get IRQ info, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* if this vector cannot be used with eventfd, fail if we explicitly + * specified interrupt type, otherwise continue */ + if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) { + if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE) { + RTE_LOG(ERR, EAL, + " interrupt vector does not support eventfd!\n"); + return -1; + } else + continue; + } + + /* set up an eventfd for interrupts */ + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + RTE_LOG(ERR, EAL, " cannot set up eventfd, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + dev->intr_handle.fd = fd; + dev->intr_handle.vfio_dev_fd = vfio_dev_fd; + + switch (i) { + case VFIO_PCI_MSIX_IRQ_INDEX: + internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX; + dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX; + break; + case VFIO_PCI_MSI_IRQ_INDEX: + internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI; + dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI; + break; + case VFIO_PCI_INTX_IRQ_INDEX: + internal_config.vfio_intr_mode = RTE_INTR_MODE_LEGACY; + dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY; + break; + default: + RTE_LOG(ERR, EAL, " unknown interrupt type!\n"); + return -1; + } + + return 0; + } + + /* if we're here, we haven't found a suitable interrupt vector */ + return -1; +} + +/* open container fd or get an existing one */ +int +pci_vfio_get_container_fd(void) +{ + int ret, vfio_container_fd; + + /* if we're in a primary process, try to open the container */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot open VFIO container, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* check VFIO API version */ + ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + if (ret < 0) + RTE_LOG(ERR, EAL, " could not get VFIO API version, " + "error %i (%s)\n", errno, strerror(errno)); + else + RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); + close(vfio_container_fd); + return -1; + } + + ret = pci_vfio_has_supported_extensions(vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " no supported IOMMU " + "extensions found!\n"); + return -1; + } + + return vfio_container_fd; + } else { + /* + * if we're in a secondary process, request container fd from the + * primary process via our socket + */ + int socket_fd; + + socket_fd = vfio_mp_sync_connect_to_primary(); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) { + RTE_LOG(ERR, EAL, " cannot request container fd!\n"); + close(socket_fd); + return -1; + } + vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot get container fd!\n"); + close(socket_fd); + return -1; + } + close(socket_fd); + return vfio_container_fd; + } + + return -1; +} + +/* open group fd or get an existing one */ +int +pci_vfio_get_group_fd(int iommu_group_no) +{ + int i; + int vfio_group_fd; + char filename[PATH_MAX]; + + /* check if we already have the group descriptor open */ + for (i = 0; i < vfio_cfg.vfio_group_idx; i++) + if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no) + return vfio_cfg.vfio_groups[i].fd; + + /* if primary, try to open the group */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try regular group format */ + snprintf(filename, sizeof(filename), + VFIO_GROUP_FMT, iommu_group_no); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + /* if file not found, it's not an error */ + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + + /* special case: try no-IOMMU path as well */ + snprintf(filename, sizeof(filename), + VFIO_NOIOMMU_GROUP_FMT, iommu_group_no); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + return 0; + } + /* noiommu group found */ + } + + /* if the fd is valid, create a new group for it */ + if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + close(vfio_group_fd); + return -1; + } + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no; + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd; + return vfio_group_fd; + } + /* if we're in a secondary process, request group fd from the primary + * process via our socket + */ + else { + int socket_fd, ret; + + socket_fd = vfio_mp_sync_connect_to_primary(); + + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) { + RTE_LOG(ERR, EAL, " cannot request container fd!\n"); + close(socket_fd); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) { + RTE_LOG(ERR, EAL, " cannot send group number!\n"); + close(socket_fd); + return -1; + } + ret = vfio_mp_sync_receive_request(socket_fd); + switch (ret) { + case SOCKET_NO_FD: + close(socket_fd); + return 0; + case SOCKET_OK: + vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd); + /* if we got the fd, return it */ + if (vfio_group_fd > 0) { + close(socket_fd); + return vfio_group_fd; + } + /* fall-through on error */ + default: + RTE_LOG(ERR, EAL, " cannot get container fd!\n"); + close(socket_fd); + return -1; + } + } + return -1; +} + +/* parse IOMMU group number for a PCI device + * returns 1 on success, -1 for errors, 0 for non-existent group + */ +static int +pci_vfio_get_group_no(const char *pci_addr, int *iommu_group_no) +{ + char linkname[PATH_MAX]; + char filename[PATH_MAX]; + char *tok[16], *group_tok, *end; + int ret; + + memset(linkname, 0, sizeof(linkname)); + memset(filename, 0, sizeof(filename)); + + /* try to find out IOMMU group for this device */ + snprintf(linkname, sizeof(linkname), + SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr); + + ret = readlink(linkname, filename, sizeof(filename)); + + /* if the link doesn't exist, no VFIO for us */ + if (ret < 0) + return 0; + + ret = rte_strsplit(filename, sizeof(filename), + tok, RTE_DIM(tok), '/'); + + if (ret <= 0) { + RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", pci_addr); + return -1; + } + + /* IOMMU group is always the last token */ + errno = 0; + group_tok = tok[ret - 1]; + end = group_tok; + *iommu_group_no = strtol(group_tok, &end, 10); + if ((end != group_tok && *end != '\0') || errno != 0) { + RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", pci_addr); + return -1; + } + + return 1; +} + +static void +clear_current_group(void) +{ + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0; + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1; +} + + +/* + * map the PCI resources of a PCI device in virtual memory (VFIO version). + * primary and secondary processes follow almost exactly the same path + */ +int +pci_vfio_map_resource(struct rte_pci_device *dev) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + int vfio_group_fd, vfio_dev_fd; + int iommu_group_no; + char pci_addr[PATH_MAX] = {0}; + struct rte_pci_addr *loc = &dev->addr; + int i, ret, msix_bar; + struct mapped_pci_resource *vfio_res = NULL; + struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); + + struct pci_map *maps; + uint32_t msix_table_offset = 0; + uint32_t msix_table_size = 0; + uint32_t ioport_bar; + + dev->intr_handle.fd = -1; + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + + /* store PCI address string */ + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + loc->domain, loc->bus, loc->devid, loc->function); + + /* get group number */ + ret = pci_vfio_get_group_no(pci_addr, &iommu_group_no); + if (ret == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + pci_addr); + return 1; + } + + /* if negative, something failed */ + if (ret < 0) + return -1; + + /* get the actual group fd */ + vfio_group_fd = pci_vfio_get_group_fd(iommu_group_no); + if (vfio_group_fd < 0) + return -1; + + /* store group fd */ + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no; + vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd; + + /* if group_fd == 0, that means the device isn't managed by VFIO */ + if (vfio_group_fd == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + pci_addr); + /* we store 0 as group fd to distinguish between existing but + * unbound VFIO groups, and groups that don't exist at all. + */ + vfio_cfg.vfio_group_idx++; + return 1; + } + + /* + * at this point, we know at least one port on this device is bound to VFIO, + * so we can proceed to try and set this particular port up + */ + + /* check if the group is viable */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get group status, " + "error %i (%s)\n", pci_addr, errno, strerror(errno)); + close(vfio_group_fd); + clear_current_group(); + return -1; + } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", pci_addr); + close(vfio_group_fd); + clear_current_group(); + return -1; + } + + /* + * at this point, we know that this group is viable (meaning, all devices + * are either bound to VFIO or not bound to anything) + */ + + /* check if group does not have a container yet */ + if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + + /* add group to a container */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, + &vfio_cfg.vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " + "error %i (%s)\n", pci_addr, errno, strerror(errno)); + close(vfio_group_fd); + clear_current_group(); + return -1; + } + /* + * at this point we know that this group has been successfully + * initialized, so we increment vfio_group_idx to indicate that we can + * add new groups. + */ + vfio_cfg.vfio_group_idx++; + } + + /* + * pick an IOMMU type and set up DMA mappings for container + * + * needs to be done only once, only when at least one group is assigned to + * a container and only in primary process + */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_cfg.vfio_container_has_dma == 0) { + /* select an IOMMU type which we will be using */ + const struct vfio_iommu_type *t = + pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd); + if (!t) { + RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", pci_addr); + return -1; + } + ret = t->dma_map_func(vfio_cfg.vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s DMA remapping failed, " + "error %i (%s)\n", pci_addr, errno, strerror(errno)); + return -1; + } + vfio_cfg.vfio_container_has_dma = 1; + } + + /* get a file descriptor for the device */ + vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr); + if (vfio_dev_fd < 0) { + /* if we cannot get a device fd, this simply means that this + * particular port is not bound to VFIO + */ + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + pci_addr); + return 1; + } + + /* test and setup the device */ + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &device_info); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device info, " + "error %i (%s)\n", pci_addr, errno, strerror(errno)); + close(vfio_dev_fd); + return -1; + } + + /* get MSI-X BAR, if any (we have to know where it is because we can't + * easily mmap it when using VFIO) */ + msix_bar = -1; + ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar, + &msix_table_offset, &msix_table_size); + if (ret < 0) { + RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", pci_addr); + close(vfio_dev_fd); + return -1; + } + + /* if we're in a primary process, allocate vfio_res and get region info */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0); + if (vfio_res == NULL) { + RTE_LOG(ERR, EAL, + "%s(): cannot store uio mmap details\n", __func__); + close(vfio_dev_fd); + return -1; + } + memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr)); + + /* get number of registers (up to BAR5) */ + vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions, + VFIO_PCI_BAR5_REGION_INDEX + 1); + } else { + /* if we're in a secondary process, just find our tailq entry */ + TAILQ_FOREACH(vfio_res, vfio_res_list, next) { + if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr))) + continue; + break; + } + /* if we haven't found our tailq entry, something's wrong */ + if (vfio_res == NULL) { + RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", + pci_addr); + close(vfio_dev_fd); + return -1; + } + } + + /* map BARs */ + maps = vfio_res->maps; + + for (i = 0; i < (int) vfio_res->nb_maps; i++) { + struct vfio_region_info reg = { .argsz = sizeof(reg) }; + void *bar_addr; + struct memreg { + unsigned long offset, size; + } memreg[2] = {}; + + reg.index = i; + + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); + + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device region info " + "error %i (%s)\n", pci_addr, errno, strerror(errno)); + close(vfio_dev_fd); + if (internal_config.process_type == RTE_PROC_PRIMARY) + rte_free(vfio_res); + return -1; + } + + /* chk for io port region */ + ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_BASE_ADDRESS_0 + i*4); + + if (ret != sizeof(ioport_bar)) { + RTE_LOG(ERR, EAL, + "Cannot read command (%x) from config space!\n", + PCI_BASE_ADDRESS_0 + i*4); + return -1; + } + + if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) { + RTE_LOG(INFO, EAL, + "Ignore mapping IO port bar(%d) addr: %x\n", + i, ioport_bar); + continue; + } + + /* skip non-mmapable BARs */ + if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) + continue; + + if (i == msix_bar) { + /* + * VFIO will not let us map the MSI-X table, + * but we can map around it. + */ + uint32_t table_start = msix_table_offset; + uint32_t table_end = table_start + msix_table_size; + table_end = (table_end + ~PAGE_MASK) & PAGE_MASK; + table_start &= PAGE_MASK; + + if (table_start == 0 && table_end >= reg.size) { + /* Cannot map this BAR */ + RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i); + continue; + } else { + memreg[0].offset = reg.offset; + memreg[0].size = table_start; + memreg[1].offset = table_end; + memreg[1].size = reg.size - table_end; + + RTE_LOG(DEBUG, EAL, + "Trying to map BAR %d that contains the MSI-X " + "table. Trying offsets: " + "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", i, + memreg[0].offset, memreg[0].size, + memreg[1].offset, memreg[1].size); + } + } else { + memreg[0].offset = reg.offset; + memreg[0].size = reg.size; + } + + /* try to figure out an address */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try mapping somewhere close to the end of hugepages */ + if (pci_map_addr == NULL) + pci_map_addr = pci_find_max_end_va(); + + bar_addr = pci_map_addr; + pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); + } else { + bar_addr = maps[i].addr; + } + + /* reserve the address using an inaccessible mapping */ + bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (bar_addr != MAP_FAILED) { + void *map_addr = NULL; + if (memreg[0].size) { + /* actual map of first part */ + map_addr = pci_map_resource(bar_addr, vfio_dev_fd, + memreg[0].offset, + memreg[0].size, + MAP_FIXED); + } + + /* if there's a second part, try to map it */ + if (map_addr != MAP_FAILED + && memreg[1].offset && memreg[1].size) { + void *second_addr = RTE_PTR_ADD(bar_addr, memreg[1].offset); + map_addr = pci_map_resource(second_addr, + vfio_dev_fd, memreg[1].offset, + memreg[1].size, + MAP_FIXED); + } + + if (map_addr == MAP_FAILED || !map_addr) { + munmap(bar_addr, reg.size); + bar_addr = MAP_FAILED; + } + } + + if (bar_addr == MAP_FAILED || + (internal_config.process_type == RTE_PROC_SECONDARY && + bar_addr != maps[i].addr)) { + RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", pci_addr, i, + strerror(errno)); + close(vfio_dev_fd); + if (internal_config.process_type == RTE_PROC_PRIMARY) + rte_free(vfio_res); + return -1; + } + + maps[i].addr = bar_addr; + maps[i].offset = reg.offset; + maps[i].size = reg.size; + maps[i].path = NULL; /* vfio doesn't have per-resource paths */ + dev->mem_resource[i].addr = bar_addr; + } + + /* if secondary process, do not set up interrupts */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) { + RTE_LOG(ERR, EAL, " %s error setting up interrupts!\n", pci_addr); + close(vfio_dev_fd); + rte_free(vfio_res); + return -1; + } + + /* set bus mastering for the device */ + if (pci_vfio_set_bus_master(vfio_dev_fd)) { + RTE_LOG(ERR, EAL, " %s cannot set up bus mastering!\n", pci_addr); + close(vfio_dev_fd); + rte_free(vfio_res); + return -1; + } + + /* Reset the device */ + ioctl(vfio_dev_fd, VFIO_DEVICE_RESET); + } + + if (internal_config.process_type == RTE_PROC_PRIMARY) + TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next); + + return 0; +} + +int +pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ + if (bar < VFIO_PCI_BAR0_REGION_INDEX || + bar > VFIO_PCI_BAR5_REGION_INDEX) { + RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar); + return -1; + } + + p->dev = dev; + p->base = VFIO_GET_REGION_ADDR(bar); + return 0; +} + +void +pci_vfio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset) +{ + const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; + + if (pread64(intr_handle->vfio_dev_fd, data, + len, p->base + offset) <= 0) + RTE_LOG(ERR, EAL, + "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n", + VFIO_GET_REGION_IDX(p->base), (int)offset); +} + +void +pci_vfio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset) +{ + const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; + + if (pwrite64(intr_handle->vfio_dev_fd, data, + len, p->base + offset) <= 0) + RTE_LOG(ERR, EAL, + "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n", + VFIO_GET_REGION_IDX(p->base), (int)offset); +} + +int +pci_vfio_ioport_unmap(struct rte_pci_ioport *p) +{ + RTE_SET_USED(p); + return -1; +} + +int +pci_vfio_enable(void) +{ + /* initialize group list */ + int i; + int vfio_available; + + for (i = 0; i < VFIO_MAX_GROUPS; i++) { + vfio_cfg.vfio_groups[i].fd = -1; + vfio_cfg.vfio_groups[i].group_no = -1; + } + + /* inform the user that we are probing for VFIO */ + RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); + + /* check if vfio-pci module is loaded */ + vfio_available = rte_eal_check_module("vfio_pci"); + + /* return error directly */ + if (vfio_available == -1) { + RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); + return -1; + } + + /* return 0 if VFIO modules not loaded */ + if (vfio_available == 0) { + RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " + "skipping VFIO support...\n"); + return 0; + } + + vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd(); + + /* check if we have VFIO driver enabled */ + if (vfio_cfg.vfio_container_fd != -1) { + RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); + vfio_cfg.vfio_enabled = 1; + } else { + RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); + } + + return 0; +} + +int +pci_vfio_is_enabled(void) +{ + return vfio_cfg.vfio_enabled; +} +#endif diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c new file mode 100644 index 00000000..d9188fde --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c @@ -0,0 +1,405 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <pthread.h> + +/* sys/un.h with __USE_MISC uses strlen, which is unsafe */ +#ifdef __USE_MISC +#define REMOVED_USE_MISC +#undef __USE_MISC +#endif +#include <sys/un.h> +/* make sure we redefine __USE_MISC only if it was previously undefined */ +#ifdef REMOVED_USE_MISC +#define __USE_MISC +#undef REMOVED_USE_MISC +#endif + +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_eal_memconfig.h> +#include <rte_malloc.h> + +#include "eal_filesystem.h" +#include "eal_pci_init.h" +#include "eal_thread.h" + +/** + * @file + * VFIO socket for communication between primary and secondary processes. + * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". + */ + +#ifdef VFIO_PRESENT + +#define SOCKET_PATH_FMT "%s/.%s_mp_socket" +#define CMSGLEN (CMSG_LEN(sizeof(int))) +#define FD_TO_CMSGHDR(fd, chdr) \ + do {\ + (chdr).cmsg_len = CMSGLEN;\ + (chdr).cmsg_level = SOL_SOCKET;\ + (chdr).cmsg_type = SCM_RIGHTS;\ + memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\ + } while (0) +#define CMSGHDR_TO_FD(chdr, fd) \ + memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd)) + +static pthread_t socket_thread; +static int mp_socket_fd; + + +/* get socket path (/var/run if root, $HOME otherwise) */ +static void +get_socket_path(char *buffer, int bufsz) +{ + const char *dir = "/var/run"; + const char *home_dir = getenv("HOME"); + + if (getuid() != 0 && home_dir != NULL) + dir = home_dir; + + /* use current prefix as file path */ + snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir, + internal_config.hugefile_prefix); +} + + + +/* + * data flow for socket comm protocol: + * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP + * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number + * 2. server receives message + * 2a. in case of invalid group, SOCKET_ERR is sent back to client + * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client + * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd + * + * in case of any error, socket is closed. + */ + +/* send a request, return -1 on error */ +int +vfio_mp_sync_send_request(int socket, int req) +{ + struct msghdr hdr; + struct iovec iov; + int buf; + int ret; + + memset(&hdr, 0, sizeof(hdr)); + + buf = req; + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + + ret = sendmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + return 0; +} + +/* receive a request and return it */ +int +vfio_mp_sync_receive_request(int socket) +{ + int buf; + struct msghdr hdr; + struct iovec iov; + int ret, req; + + memset(&hdr, 0, sizeof(hdr)); + + buf = SOCKET_ERR; + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + + ret = recvmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + + req = buf; + + return req; +} + +/* send OK in message, fd in control message */ +int +vfio_mp_sync_send_fd(int socket, int fd) +{ + int buf; + struct msghdr hdr; + struct cmsghdr *chdr; + char chdr_buf[CMSGLEN]; + struct iovec iov; + int ret; + + chdr = (struct cmsghdr *) chdr_buf; + memset(chdr, 0, sizeof(chdr_buf)); + memset(&hdr, 0, sizeof(hdr)); + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + hdr.msg_control = chdr; + hdr.msg_controllen = CMSGLEN; + + buf = SOCKET_OK; + FD_TO_CMSGHDR(fd, *chdr); + + ret = sendmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + return 0; +} + +/* receive OK in message, fd in control message */ +int +vfio_mp_sync_receive_fd(int socket) +{ + int buf; + struct msghdr hdr; + struct cmsghdr *chdr; + char chdr_buf[CMSGLEN]; + struct iovec iov; + int ret, req, fd; + + buf = SOCKET_ERR; + + chdr = (struct cmsghdr *) chdr_buf; + memset(chdr, 0, sizeof(chdr_buf)); + memset(&hdr, 0, sizeof(hdr)); + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + hdr.msg_control = chdr; + hdr.msg_controllen = CMSGLEN; + + ret = recvmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + + req = buf; + + if (req != SOCKET_OK) + return -1; + + CMSGHDR_TO_FD(*chdr, fd); + + return fd; +} + +/* connect socket_fd in secondary process to the primary process's socket */ +int +vfio_mp_sync_connect_to_primary(void) +{ + struct sockaddr_un addr; + socklen_t sockaddr_len; + int socket_fd; + + /* set up a socket */ + socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to create socket!\n"); + return -1; + } + + get_socket_path(addr.sun_path, sizeof(addr.sun_path)); + addr.sun_family = AF_UNIX; + + sockaddr_len = sizeof(struct sockaddr_un); + + if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0) + return socket_fd; + + /* if connect failed */ + close(socket_fd); + return -1; +} + + + +/* + * socket listening thread for primary process + */ +static __attribute__((noreturn)) void * +pci_vfio_mp_sync_thread(void __rte_unused * arg) +{ + int ret, fd, vfio_group_no; + + /* wait for requests on the socket */ + for (;;) { + int conn_sock; + struct sockaddr_un addr; + socklen_t sockaddr_len = sizeof(addr); + + /* this is a blocking call */ + conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr, + &sockaddr_len); + + /* just restart on error */ + if (conn_sock == -1) + continue; + + /* set socket to linger after close */ + struct linger l; + l.l_onoff = 1; + l.l_linger = 60; + setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)); + + ret = vfio_mp_sync_receive_request(conn_sock); + + switch (ret) { + case SOCKET_REQ_CONTAINER: + fd = pci_vfio_get_container_fd(); + if (fd < 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + else + vfio_mp_sync_send_fd(conn_sock, fd); + break; + case SOCKET_REQ_GROUP: + /* wait for group number */ + vfio_group_no = vfio_mp_sync_receive_request(conn_sock); + if (vfio_group_no < 0) { + close(conn_sock); + continue; + } + + fd = pci_vfio_get_group_fd(vfio_group_no); + + if (fd < 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + /* if VFIO group exists but isn't bound to VFIO driver */ + else if (fd == 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD); + /* if group exists and is bound to VFIO driver */ + else { + vfio_mp_sync_send_request(conn_sock, SOCKET_OK); + vfio_mp_sync_send_fd(conn_sock, fd); + } + break; + default: + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + break; + } + close(conn_sock); + } +} + +static int +vfio_mp_sync_socket_setup(void) +{ + int ret, socket_fd; + struct sockaddr_un addr; + socklen_t sockaddr_len; + + /* set up a socket */ + socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to create socket!\n"); + return -1; + } + + get_socket_path(addr.sun_path, sizeof(addr.sun_path)); + addr.sun_family = AF_UNIX; + + sockaddr_len = sizeof(struct sockaddr_un); + + unlink(addr.sun_path); + + ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len); + if (ret) { + RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno)); + close(socket_fd); + return -1; + } + + ret = listen(socket_fd, 50); + if (ret) { + RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno)); + close(socket_fd); + return -1; + } + + /* save the socket in local configuration */ + mp_socket_fd = socket_fd; + + return 0; +} + +/* + * set up a local socket and tell it to listen for incoming connections + */ +int +pci_vfio_mp_sync_setup(void) +{ + int ret; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + if (vfio_mp_sync_socket_setup() < 0) { + RTE_LOG(ERR, EAL, "Failed to set up local socket!\n"); + return -1; + } + + ret = pthread_create(&socket_thread, NULL, + pci_vfio_mp_sync_thread, NULL); + if (ret) { + RTE_LOG(ERR, EAL, + "Failed to create thread for communication with secondary processes!\n"); + close(mp_socket_fd); + return -1; + } + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "pci-vfio-sync"); + ret = rte_thread_setname(socket_thread, thread_name); + if (ret) + RTE_LOG(ERR, EAL, + "Failed to set thread name for secondary processes!\n"); + + return 0; +} + +#endif diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c new file mode 100644 index 00000000..18bd8e04 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -0,0 +1,199 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <pthread.h> +#include <sched.h> +#include <sys/queue.h> +#include <sys/syscall.h> + +#include <rte_debug.h> +#include <rte_atomic.h> +#include <rte_launch.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_per_lcore.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. + */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +/* set affinity for current EAL thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__attribute__((noreturn)) void * +eal_thread_loop(__attribute__((unused)) void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n", + lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "..."); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + return (int)syscall(SYS_gettid); +} diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c new file mode 100644 index 00000000..f2abb7b6 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_timer.c @@ -0,0 +1,305 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2012-2013 6WIND S.A. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <fcntl.h> +#include <inttypes.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <pthread.h> +#include <errno.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_cycles.h> +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_eal.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +enum timer_source eal_timer_source = EAL_TIMER_HPET; + +#ifdef RTE_LIBEAL_USE_HPET + +#define DEV_HPET "/dev/hpet" + +/* Maximum number of counters. */ +#define HPET_TIMER_NUM 3 + +/* General capabilities register */ +#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */ +#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */ + +/** + * HPET timer registers. From the Intel IA-PC HPET (High Precision Event + * Timers) Specification. + */ +struct eal_hpet_regs { + /* Memory-mapped, software visible registers */ + uint64_t capabilities; /**< RO General Capabilities Register. */ + uint64_t reserved0; /**< Reserved for future use. */ + uint64_t config; /**< RW General Configuration Register. */ + uint64_t reserved1; /**< Reserved for future use. */ + uint64_t isr; /**< RW Clear General Interrupt Status. */ + uint64_t reserved2[25]; /**< Reserved for future use. */ + union { + uint64_t counter; /**< RW Main Counter Value Register. */ + struct { + uint32_t counter_l; /**< RW Main Counter Low. */ + uint32_t counter_h; /**< RW Main Counter High. */ + }; + }; + uint64_t reserved3; /**< Reserved for future use. */ + struct { + uint64_t config; /**< RW Timer Config and Capability Reg. */ + uint64_t comp; /**< RW Timer Comparator Value Register. */ + uint64_t fsb; /**< RW FSB Interrupt Route Register. */ + uint64_t reserved4; /**< Reserved for future use. */ + } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */ +}; + +/* Mmap'd hpet registers */ +static volatile struct eal_hpet_regs *eal_hpet = NULL; + +/* Period at which the HPET counter increments in + * femtoseconds (10^-15 seconds). */ +static uint32_t eal_hpet_resolution_fs = 0; + +/* Frequency of the HPET counter in Hz */ +static uint64_t eal_hpet_resolution_hz = 0; + +/* Incremented 4 times during one 32bits hpet full count */ +static uint32_t eal_hpet_msb; + +static pthread_t msb_inc_thread_id; + +/* + * This function runs on a specific thread to update a global variable + * containing used to process MSB of the HPET (unfortunatelly, we need + * this because hpet is 32 bits by default under linux). + */ +static void +hpet_msb_inc(__attribute__((unused)) void *arg) +{ + uint32_t t; + + while (1) { + t = (eal_hpet->counter_l >> 30); + if (t != (eal_hpet_msb & 3)) + eal_hpet_msb ++; + sleep(10); + } +} + +uint64_t +rte_get_hpet_hz(void) +{ + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + return eal_hpet_resolution_hz; +} + +uint64_t +rte_get_hpet_cycles(void) +{ + uint32_t t, msb; + uint64_t ret; + + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + t = eal_hpet->counter_l; + msb = eal_hpet_msb; + ret = (msb + 2 - (t >> 30)) / 4; + ret <<= 32; + ret += t; + return ret; +} + +#endif + +#ifdef RTE_LIBEAL_USE_HPET +/* + * Open and mmap /dev/hpet (high precision event timer) that will + * provide our time reference. + */ +int +rte_eal_hpet_init(int make_default) +{ + int fd, ret; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + if (internal_config.no_hpet) { + RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); + return -1; + } + + fd = open(DEV_HPET, O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n", + strerror(errno)); + internal_config.no_hpet = 1; + return -1; + } + eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0); + if (eal_hpet == MAP_FAILED) { + RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n" + "Please enable CONFIG_HPET_MMAP in your kernel configuration " + "to allow HPET support.\n" + "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n " + "in your build configuration or use '--no-hpet' EAL flag.\n"); + close(fd); + internal_config.no_hpet = 1; + return -1; + } + close(fd); + + eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities & + CLK_PERIOD_MASK) >> + CLK_PERIOD_SHIFT); + + eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) / + (uint64_t)eal_hpet_resolution_fs; + + RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n", + eal_hpet_resolution_hz/1000); + + eal_hpet_msb = (eal_hpet->counter_l >> 30); + + /* create a thread that will increment a global variable for + * msb (hpet is 32 bits by default under linux) */ + ret = pthread_create(&msb_inc_thread_id, NULL, + (void *(*)(void *))hpet_msb_inc, NULL); + if (ret != 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); + internal_config.no_hpet = 1; + return -1; + } + + /* + * Set thread_name for aid in debugging. + */ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "hpet-msb-inc"); + ret = rte_thread_setname(msb_inc_thread_id, thread_name); + if (ret != 0) + RTE_LOG(ERR, EAL, + "ERROR: Cannot set HPET timer thread name!\n"); + + if (make_default) + eal_timer_source = EAL_TIMER_HPET; + return 0; +} +#endif + +static void +check_tsc_flags(void) +{ + char line[512]; + FILE *stream; + + stream = fopen("/proc/cpuinfo", "r"); + if (!stream) { + RTE_LOG(WARNING, EAL, "WARNING: Unable to open /proc/cpuinfo\n"); + return; + } + + while (fgets(line, sizeof line, stream)) { + char *constant_tsc; + char *nonstop_tsc; + + if (strncmp(line, "flags", 5) != 0) + continue; + + constant_tsc = strstr(line, "constant_tsc"); + nonstop_tsc = strstr(line, "nonstop_tsc"); + if (!constant_tsc || !nonstop_tsc) + RTE_LOG(WARNING, EAL, + "WARNING: cpu flags " + "constant_tsc=%s " + "nonstop_tsc=%s " + "-> using unreliable clock cycles !\n", + constant_tsc ? "yes":"no", + nonstop_tsc ? "yes":"no"); + break; + } + + fclose(stream); +} + +uint64_t +get_tsc_freq(void) +{ +#ifdef CLOCK_MONOTONIC_RAW +#define NS_PER_SEC 1E9 + + struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */ + + struct timespec t_start, t_end; + uint64_t tsc_hz; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) { + uint64_t ns, end, start = rte_rdtsc(); + nanosleep(&sleeptime,NULL); + clock_gettime(CLOCK_MONOTONIC_RAW, &t_end); + end = rte_rdtsc(); + ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC); + ns += (t_end.tv_nsec - t_start.tv_nsec); + + double secs = (double)ns/NS_PER_SEC; + tsc_hz = (uint64_t)((end - start)/secs); + return tsc_hz; + } +#endif + return 0; +} + +int +rte_eal_timer_init(void) +{ + + eal_timer_source = EAL_TIMER_TSC; + + set_tsc_freq(); + check_tsc_flags(); + return 0; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h new file mode 100644 index 00000000..f483bf40 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef EAL_VFIO_H_ +#define EAL_VFIO_H_ + +/* + * determine if VFIO is present on the system + */ +#ifdef RTE_EAL_VFIO +#include <linux/version.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#include <linux/vfio.h> + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0) +#define RTE_PCI_MSIX_TABLE_BIR 0x7 +#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8 +#define RTE_PCI_MSIX_FLAGS_QSIZE 0x07ff +#else +#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR +#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET +#define RTE_PCI_MSIX_FLAGS_QSIZE PCI_MSIX_FLAGS_QSIZE +#endif + +#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) +#define RTE_VFIO_NOIOMMU 8 +#else +#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU +#endif + +#define VFIO_PRESENT +#endif /* kernel version */ +#endif /* RTE_EAL_VFIO */ + +#endif /* EAL_VFIO_H_ */ diff --git a/lib/librte_eal/linuxapp/eal/eal_xen_memory.c b/lib/librte_eal/linuxapp/eal/eal_xen_memory.c new file mode 100644 index 00000000..495eef9e --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/eal_xen_memory.c @@ -0,0 +1,369 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <errno.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <stdarg.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/queue.h> +#include <sys/file.h> +#include <unistd.h> +#include <limits.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <sys/time.h> + +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include <exec-env/rte_dom0_common.h> + +#define PAGE_SIZE RTE_PGSIZE_4K +#define DEFAUL_DOM0_NAME "dom0-mem" + +static int xen_fd = -1; +static const char sys_dir_path[] = "/sys/kernel/mm/dom0-mm/memsize-mB"; + +/* + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. Else, retry + * with a smaller zone: decrease *size by mem_size until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of mem_size size. + */ +static void * +xen_get_virtual_area(size_t *size, size_t mem_size) +{ + void *addr; + int fd; + long aligned_addr; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zu bytes\n", *size); + + fd = open("/dev/zero", O_RDONLY); + if (fd < 0){ + RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n"); + return NULL; + } + do { + addr = mmap(NULL, (*size) + mem_size, PROT_READ, + MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) + *size -= mem_size; + } while (addr == MAP_FAILED && *size > 0); + + if (addr == MAP_FAILED) { + close(fd); + RTE_LOG(ERR, EAL, "Cannot get a virtual area\n"); + return NULL; + } + + munmap(addr, (*size) + mem_size); + close(fd); + + /* align addr to a mem_size boundary */ + aligned_addr = (uintptr_t)addr; + aligned_addr = RTE_ALIGN_CEIL(aligned_addr, mem_size); + addr = (void *)(aligned_addr); + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + addr, *size); + + return addr; +} + +/** + * Get memory size configuration from /sys/devices/virtual/misc/dom0_mm + * /memsize-mB/memsize file, and the size unit is mB. + */ +static int +get_xen_memory_size(void) +{ + char path[PATH_MAX]; + unsigned long mem_size = 0; + static const char *file_name; + + file_name = "memsize"; + snprintf(path, sizeof(path), "%s/%s", + sys_dir_path, file_name); + + if (eal_parse_sysfs_value(path, &mem_size) < 0) + return -1; + + if (mem_size == 0) + rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s was not" + " configured.\n",sys_dir_path, file_name); + if (mem_size % 2) + rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s must be" + " even number.\n",sys_dir_path, file_name); + + if (mem_size > DOM0_CONFIG_MEMSIZE) + rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s should not be larger" + " than %d mB\n",sys_dir_path, file_name, DOM0_CONFIG_MEMSIZE); + + return mem_size; +} + +/** + * Based on physical address to caculate MFN in Xen Dom0. + */ +phys_addr_t +rte_xen_mem_phy2mch(uint32_t memseg_id, const phys_addr_t phy_addr) +{ + int mfn_id; + uint64_t mfn, mfn_offset; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg *memseg = mcfg->memseg; + + mfn_id = (phy_addr - memseg[memseg_id].phys_addr) / RTE_PGSIZE_2M; + + /*the MFN is contiguous in 2M */ + mfn_offset = (phy_addr - memseg[memseg_id].phys_addr) % + RTE_PGSIZE_2M / PAGE_SIZE; + mfn = mfn_offset + memseg[memseg_id].mfn[mfn_id]; + + /** return mechine address */ + return mfn * PAGE_SIZE + phy_addr % PAGE_SIZE; +} + +int +rte_xen_dom0_memory_init(void) +{ + void *vir_addr, *vma_addr = NULL; + int err, ret = 0; + uint32_t i, requested, mem_size, memseg_idx, num_memseg = 0; + size_t vma_len = 0; + struct memory_info meminfo; + struct memseg_info seginfo[RTE_MAX_MEMSEG]; + int flags, page_size = getpagesize(); + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg *memseg = mcfg->memseg; + uint64_t total_mem = internal_config.memory; + + memset(seginfo, 0, sizeof(seginfo)); + memset(&meminfo, 0, sizeof(struct memory_info)); + + mem_size = get_xen_memory_size(); + requested = (unsigned) (total_mem / 0x100000); + if (requested > mem_size) + /* if we didn't satisfy total memory requirements */ + rte_exit(EXIT_FAILURE,"Not enough memory available! Requested: %uMB," + " available: %uMB\n", requested, mem_size); + else if (total_mem != 0) + mem_size = requested; + + /* Check FD and open once */ + if (xen_fd < 0) { + xen_fd = open(DOM0_MM_DEV, O_RDWR); + if (xen_fd < 0) { + RTE_LOG(ERR, EAL, "Can not open %s\n",DOM0_MM_DEV); + return -1; + } + } + + meminfo.size = mem_size; + + /* construct memory mangement name for Dom0 */ + snprintf(meminfo.name, DOM0_NAME_MAX, "%s-%s", + internal_config.hugefile_prefix, DEFAUL_DOM0_NAME); + + /* Notify kernel driver to allocate memory */ + ret = ioctl(xen_fd, RTE_DOM0_IOCTL_PREPARE_MEMSEG, &meminfo); + if (ret < 0) { + RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memory\n"); + err = -EIO; + goto fail; + } + + /* Get number of memory segment from driver */ + ret = ioctl(xen_fd, RTE_DOM0_IOCTL_GET_NUM_MEMSEG, &num_memseg); + if (ret < 0) { + RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memseg count.\n"); + err = -EIO; + goto fail; + } + + if(num_memseg > RTE_MAX_MEMSEG){ + RTE_LOG(ERR, EAL, "XEN DOM0: the memseg count %d is greater" + " than max memseg %d.\n",num_memseg, RTE_MAX_MEMSEG); + err = -EIO; + goto fail; + } + + /* get all memory segements information */ + ret = ioctl(xen_fd, RTE_DOM0_IOCTL_GET_MEMSEG_INFO, seginfo); + if (ret < 0) { + RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memseg info.\n"); + err = -EIO; + goto fail; + } + + /* map all memory segments to contiguous user space */ + for (memseg_idx = 0; memseg_idx < num_memseg; memseg_idx++) + { + vma_len = seginfo[memseg_idx].size; + + /** + * get the biggest virtual memory area up to vma_len. If it fails, + * vma_addr is NULL, so let the kernel provide the address. + */ + vma_addr = xen_get_virtual_area(&vma_len, RTE_PGSIZE_2M); + if (vma_addr == NULL) { + flags = MAP_SHARED; + vma_len = RTE_PGSIZE_2M; + } else + flags = MAP_SHARED | MAP_FIXED; + + seginfo[memseg_idx].size = vma_len; + vir_addr = mmap(vma_addr, seginfo[memseg_idx].size, + PROT_READ|PROT_WRITE, flags, xen_fd, + memseg_idx * page_size); + if (vir_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "XEN DOM0:Could not mmap %s\n", + DOM0_MM_DEV); + err = -EIO; + goto fail; + } + + memseg[memseg_idx].addr = vir_addr; + memseg[memseg_idx].phys_addr = page_size * + seginfo[memseg_idx].pfn ; + memseg[memseg_idx].len = seginfo[memseg_idx].size; + for ( i = 0; i < seginfo[memseg_idx].size / RTE_PGSIZE_2M; i++) + memseg[memseg_idx].mfn[i] = seginfo[memseg_idx].mfn[i]; + + /* MFNs are continuous in 2M, so assume that page size is 2M */ + memseg[memseg_idx].hugepage_sz = RTE_PGSIZE_2M; + + memseg[memseg_idx].nchannel = mcfg->nchannel; + memseg[memseg_idx].nrank = mcfg->nrank; + + /* NUMA is not suppoted in Xen Dom0, so only set socket 0*/ + memseg[memseg_idx].socket_id = 0; + } + + return 0; +fail: + if (xen_fd > 0) { + close(xen_fd); + xen_fd = -1; + } + return err; +} + +/* + * This creates the memory mappings in the secondary process to match that of + * the server process. It goes through each memory segment in the DPDK runtime + * configuration, mapping them in order to form a contiguous block in the + * virtual memory space + */ +int +rte_xen_dom0_memory_attach(void) +{ + const struct rte_mem_config *mcfg; + unsigned s = 0; /* s used to track the segment number */ + int xen_fd = -1; + int ret = -1; + void *vir_addr; + char name[DOM0_NAME_MAX] = {0}; + int page_size = getpagesize(); + + mcfg = rte_eal_get_configuration()->mem_config; + + /* Check FD and open once */ + if (xen_fd < 0) { + xen_fd = open(DOM0_MM_DEV, O_RDWR); + if (xen_fd < 0) { + RTE_LOG(ERR, EAL, "Can not open %s\n",DOM0_MM_DEV); + goto error; + } + } + + /* construct memory mangement name for Dom0 */ + snprintf(name, DOM0_NAME_MAX, "%s-%s", + internal_config.hugefile_prefix, DEFAUL_DOM0_NAME); + /* attach to memory segments of primary process */ + ret = ioctl(xen_fd, RTE_DOM0_IOCTL_ATTACH_TO_MEMSEG, name); + if (ret) { + RTE_LOG(ERR, EAL,"attach memory segments fail.\n"); + goto error; + } + + /* map all segments into memory to make sure we get the addrs */ + for (s = 0; s < RTE_MAX_MEMSEG; ++s) { + + /* + * the first memory segment with len==0 is the one that + * follows the last valid segment. + */ + if (mcfg->memseg[s].len == 0) + break; + + vir_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len, + PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, xen_fd, + s * page_size); + if (vir_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes " + "in %s to requested address [%p]\n", + (unsigned long long)mcfg->memseg[s].len, DOM0_MM_DEV, + mcfg->memseg[s].addr); + goto error; + } + } + return 0; + +error: + if (xen_fd >= 0) { + close(xen_fd); + xen_fd = -1; + } + return -1; +} diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h new file mode 100644 index 00000000..d9707780 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h @@ -0,0 +1,108 @@ +/*- + * This file is provided under a dual BSD/LGPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GNU LESSER GENERAL PUBLIC LICENSE + * + * Copyright(c) 2007-2014 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Contact Information: + * Intel Corporation + * + * + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _RTE_DOM0_COMMON_H_ +#define _RTE_DOM0_COMMON_H_ + +#ifdef __KERNEL__ +#include <linux/if.h> +#endif + +#define DOM0_NAME_MAX 256 +#define DOM0_MM_DEV "/dev/dom0_mm" + +#define DOM0_CONTIG_NUM_ORDER 9 /**< order of 2M */ +#define DOM0_NUM_MEMSEG 512 /**< Maximum nb. of memory segment. */ +#define DOM0_MEMBLOCK_SIZE 0x200000 /**< size of memory block(2M). */ +#define DOM0_CONFIG_MEMSIZE 4096 /**< Maximum config memory size(4G). */ +#define DOM0_NUM_MEMBLOCK (DOM0_CONFIG_MEMSIZE / 2) /**< Maximum nb. of 2M memory block. */ + +#define RTE_DOM0_IOCTL_PREPARE_MEMSEG _IOWR(0, 1 , struct memory_info) +#define RTE_DOM0_IOCTL_ATTACH_TO_MEMSEG _IOWR(0, 2 , char *) +#define RTE_DOM0_IOCTL_GET_NUM_MEMSEG _IOWR(0, 3, int) +#define RTE_DOM0_IOCTL_GET_MEMSEG_INFO _IOWR(0, 4, void *) + +/** + * A structure used to store memory information. + */ +struct memory_info { + char name[DOM0_NAME_MAX]; + uint64_t size; +}; + +/** + * A structure used to store memory segment information. + */ +struct memseg_info { + uint32_t idx; + uint64_t pfn; + uint64_t size; + uint64_t mfn[DOM0_NUM_MEMBLOCK]; +}; + +/** + * A structure used to store memory block information. + */ +struct memblock_info { + uint8_t exchange_flag; + uint8_t used; + uint64_t vir_addr; + uint64_t pfn; + uint64_t mfn; +}; +#endif /* _RTE_DOM0_COMMON_H_ */ diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h new file mode 100644 index 00000000..3dacbff8 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h @@ -0,0 +1,228 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_INTERRUPTS_H_ +#error "don't include this file directly, please include generic <rte_interrupts.h>" +#endif + +#ifndef _RTE_LINUXAPP_INTERRUPTS_H_ +#define _RTE_LINUXAPP_INTERRUPTS_H_ + +#define RTE_MAX_RXTX_INTR_VEC_ID 32 +#define RTE_INTR_VEC_ZERO_OFFSET 0 +#define RTE_INTR_VEC_RXTX_OFFSET 1 + +enum rte_intr_handle_type { + RTE_INTR_HANDLE_UNKNOWN = 0, + RTE_INTR_HANDLE_UIO, /**< uio device handle */ + RTE_INTR_HANDLE_UIO_INTX, /**< uio generic handle */ + RTE_INTR_HANDLE_VFIO_LEGACY, /**< vfio device handle (legacy) */ + RTE_INTR_HANDLE_VFIO_MSI, /**< vfio device handle (MSI) */ + RTE_INTR_HANDLE_VFIO_MSIX, /**< vfio device handle (MSIX) */ + RTE_INTR_HANDLE_ALARM, /**< alarm handle */ + RTE_INTR_HANDLE_EXT, /**< external handler */ + RTE_INTR_HANDLE_MAX +}; + +#define RTE_INTR_EVENT_ADD 1UL +#define RTE_INTR_EVENT_DEL 2UL + +typedef void (*rte_intr_event_cb_t)(int fd, void *arg); + +struct rte_epoll_data { + uint32_t event; /**< event type */ + void *data; /**< User data */ + rte_intr_event_cb_t cb_fun; /**< IN: callback fun */ + void *cb_arg; /**< IN: callback arg */ +}; + +enum { + RTE_EPOLL_INVALID = 0, + RTE_EPOLL_VALID, + RTE_EPOLL_EXEC, +}; + +/** interrupt epoll event obj, taken by epoll_event.ptr */ +struct rte_epoll_event { + volatile uint32_t status; /**< OUT: event status */ + int fd; /**< OUT: event fd */ + int epfd; /**< OUT: epoll instance the ev associated with */ + struct rte_epoll_data epdata; +}; + +/** Handle for interrupts. */ +struct rte_intr_handle { + union { + int vfio_dev_fd; /**< VFIO device file descriptor */ + int uio_cfg_fd; /**< UIO config file descriptor + for uio_pci_generic */ + }; + int fd; /**< interrupt event file descriptor */ + enum rte_intr_handle_type type; /**< handle type */ + uint32_t max_intr; /**< max interrupt requested */ + uint32_t nb_efd; /**< number of available efd(event fd) */ + int efds[RTE_MAX_RXTX_INTR_VEC_ID]; /**< intr vectors/efds mapping */ + struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID]; + /**< intr vector epoll event */ + int *intr_vec; /**< intr vector number array */ +}; + +#define RTE_EPOLL_PER_THREAD -1 /**< to hint using per thread epfd */ + +/** + * It waits for events on the epoll instance. + * + * @param epfd + * Epoll instance fd on which the caller wait for events. + * @param events + * Memory area contains the events that will be available for the caller. + * @param maxevents + * Up to maxevents are returned, must greater than zero. + * @param timeout + * Specifying a timeout of -1 causes a block indefinitely. + * Specifying a timeout equal to zero cause to return immediately. + * @return + * - On success, returns the number of available event. + * - On failure, a negative value. + */ +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout); + +/** + * It performs control operations on epoll instance referred by the epfd. + * It requests that the operation op be performed for the target fd. + * + * @param epfd + * Epoll instance fd on which the caller perform control operations. + * @param op + * The operation be performed for the target fd. + * @param fd + * The target fd on which the control ops perform. + * @param event + * Describes the object linked to the fd. + * Note: The caller must take care the object deletion after CTL_DEL. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event); + +/** + * The function returns the per thread epoll instance. + * + * @return + * epfd the epoll instance referred to. + */ +int +rte_intr_tls_epfd(void); + +/** + * @param intr_handle + * Pointer to the interrupt handle. + * @param epfd + * Epoll instance fd which the intr vector associated to. + * @param op + * The operation be performed for the vector. + * Operation type of {ADD, DEL}. + * @param vec + * RX intr vector number added to the epoll instance wait list. + * @param data + * User raw data. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, + int epfd, int op, unsigned int vec, void *data); + +/** + * It enables the packet I/O interrupt event if it's necessary. + * It creates event fd for each interrupt vector when MSIX is used, + * otherwise it multiplexes a single event fd. + * + * @param intr_handle + * Pointer to the interrupt handle. + * @param nb_efd + * Number of interrupt vector trying to enable. + * The value 0 is not allowed. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd); + +/** + * It disables the packet I/O interrupt event. + * It deletes registered eventfds and closes the open fds. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle); + +/** + * The packet I/O interrupt on datapath is enabled or not. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle); + +/** + * The interrupt handle instance allows other causes or not. + * Other causes stand for any none packet I/O interrupts. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle); + +/** + * The multiple interrupt vector capability of interrupt handle instance. + * It returns zero if no multiple interrupt vector support. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle); + +#endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */ diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h new file mode 100644 index 00000000..7e5e5984 --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h @@ -0,0 +1,172 @@ +/*- + * This file is provided under a dual BSD/LGPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GNU LESSER GENERAL PUBLIC LICENSE + * + * Copyright(c) 2007-2014 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Contact Information: + * Intel Corporation + * + * + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _RTE_KNI_COMMON_H_ +#define _RTE_KNI_COMMON_H_ + +#ifdef __KERNEL__ +#include <linux/if.h> +#endif + +/** + * KNI name is part of memzone name. + */ +#define RTE_KNI_NAMESIZE 32 + +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/* + * Request id. + */ +enum rte_kni_req_id { + RTE_KNI_REQ_UNKNOWN = 0, + RTE_KNI_REQ_CHANGE_MTU, + RTE_KNI_REQ_CFG_NETWORK_IF, + RTE_KNI_REQ_MAX, +}; + +/* + * Structure for KNI request. + */ +struct rte_kni_request { + uint32_t req_id; /**< Request id */ + union { + uint32_t new_mtu; /**< New MTU */ + uint8_t if_up; /**< 1: interface up, 0: interface down */ + }; + int32_t result; /**< Result for processing request */ +} __attribute__((__packed__)); + +/* + * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO + * Write and read should wrap around. Fifo is empty when write == read + * Writing should never overwrite the read position + */ +struct rte_kni_fifo { + volatile unsigned write; /**< Next position to be written*/ + volatile unsigned read; /**< Next position to be read */ + unsigned len; /**< Circular buffer length */ + unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ + void * volatile buffer[0]; /**< The buffer contains mbuf pointers */ +}; + +/* + * The kernel image of the rte_mbuf struct, with only the relevant fields. + * Padding is necessary to assure the offsets of these fields + */ +struct rte_kni_mbuf { + void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); + char pad0[10]; + uint16_t data_off; /**< Start address of data in segment buffer. */ + char pad1[4]; + uint64_t ol_flags; /**< Offload features. */ + char pad2[4]; + uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */ + uint16_t data_len; /**< Amount of data in segment buffer. */ + + /* fields on second cache line */ + char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); + void *pool; + void *next; +}; + +/* + * Struct used to create a KNI device. Passed to the kernel in IOCTL call + */ + +struct rte_kni_device_info { + char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */ + + phys_addr_t tx_phys; + phys_addr_t rx_phys; + phys_addr_t alloc_phys; + phys_addr_t free_phys; + + /* Used by Ethtool */ + phys_addr_t req_phys; + phys_addr_t resp_phys; + phys_addr_t sync_phys; + void * sync_va; + + /* mbuf mempool */ + void * mbuf_va; + phys_addr_t mbuf_phys; + + /* PCI info */ + uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */ + uint16_t device_id; /**< Device ID or PCI_ANY_ID. */ + uint8_t bus; /**< Device bus */ + uint8_t devid; /**< Device ID */ + uint8_t function; /**< Device function. */ + + uint16_t group_id; /**< Group ID */ + uint32_t core_id; /**< core ID to bind for kernel thread */ + + uint8_t force_bind : 1; /**< Flag for kernel thread binding */ + + /* mbuf size */ + unsigned mbuf_size; +}; + +#define KNI_DEVICE "kni" + +#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int) +#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info) +#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) + +#endif /* _RTE_KNI_COMMON_H_ */ diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map new file mode 100644 index 00000000..12503efa --- /dev/null +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map @@ -0,0 +1,156 @@ +DPDK_2.0 { + global: + + __rte_panic; + devargs_list; + eal_parse_sysfs_value; + eal_timer_source; + lcore_config; + pci_device_list; + pci_driver_list; + per_lcore__lcore_id; + per_lcore__rte_errno; + rte_calloc; + rte_calloc_socket; + rte_cpu_check_supported; + rte_cpu_get_flag_enabled; + rte_cycles_vmware_tsc_map; + rte_delay_us; + rte_dump_physmem_layout; + rte_dump_registers; + rte_dump_stack; + rte_dump_tailq; + rte_eal_alarm_cancel; + rte_eal_alarm_set; + rte_eal_dev_init; + rte_eal_devargs_add; + rte_eal_devargs_dump; + rte_eal_devargs_type_count; + rte_eal_driver_register; + rte_eal_driver_unregister; + rte_eal_get_configuration; + rte_eal_get_lcore_state; + rte_eal_get_physmem_layout; + rte_eal_get_physmem_size; + rte_eal_has_hugepages; + rte_eal_hpet_init; + rte_eal_init; + rte_eal_iopl_init; + rte_eal_lcore_role; + rte_eal_mp_remote_launch; + rte_eal_mp_wait_lcore; + rte_eal_parse_devargs_str; + rte_eal_pci_dump; + rte_eal_pci_probe; + rte_eal_pci_probe_one; + rte_eal_pci_register; + rte_eal_pci_scan; + rte_eal_pci_unregister; + rte_eal_process_type; + rte_eal_remote_launch; + rte_eal_tailq_lookup; + rte_eal_tailq_register; + rte_eal_vdev_init; + rte_eal_vdev_uninit; + rte_eal_wait_lcore; + rte_exit; + rte_free; + rte_get_hpet_cycles; + rte_get_hpet_hz; + rte_get_log_level; + rte_get_log_type; + rte_get_tsc_hz; + rte_hexdump; + rte_intr_callback_register; + rte_intr_callback_unregister; + rte_intr_disable; + rte_intr_enable; + rte_log; + rte_log_add_in_history; + rte_log_cur_msg_loglevel; + rte_log_cur_msg_logtype; + rte_log_dump_history; + rte_log_set_history; + rte_logs; + rte_malloc; + rte_malloc_dump_stats; + rte_malloc_get_socket_stats; + rte_malloc_set_limit; + rte_malloc_socket; + rte_malloc_validate; + rte_malloc_virt2phy; + rte_mem_lock_page; + rte_mem_phy2mch; + rte_mem_virt2phy; + rte_memdump; + rte_memory_get_nchannel; + rte_memory_get_nrank; + rte_memzone_dump; + rte_memzone_lookup; + rte_memzone_reserve; + rte_memzone_reserve_aligned; + rte_memzone_reserve_bounded; + rte_memzone_walk; + rte_openlog_stream; + rte_realloc; + rte_set_application_usage_hook; + rte_set_log_level; + rte_set_log_type; + rte_socket_id; + rte_strerror; + rte_strsplit; + rte_sys_gettid; + rte_thread_get_affinity; + rte_thread_set_affinity; + rte_vlog; + rte_xen_dom0_memory_attach; + rte_xen_dom0_memory_init; + rte_zmalloc; + rte_zmalloc_socket; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_eal_pci_detach; + rte_eal_pci_read_config; + rte_eal_pci_write_config; + rte_epoll_ctl; + rte_epoll_wait; + rte_intr_allow_others; + rte_intr_dp_is_en; + rte_intr_efd_disable; + rte_intr_efd_enable; + rte_intr_rx_ctl; + rte_intr_tls_epfd; + rte_memzone_free; + +} DPDK_2.0; + +DPDK_2.2 { + global: + + rte_intr_cap_multiple; + rte_keepalive_create; + rte_keepalive_dispatch_pings; + rte_keepalive_mark_alive; + rte_keepalive_register_core; + rte_xen_dom0_supported; + +} DPDK_2.1; + +DPDK_16.04 { + global: + + rte_cpu_get_flag_name; + rte_eal_pci_ioport_map; + rte_eal_pci_ioport_read; + rte_eal_pci_ioport_unmap; + rte_eal_pci_ioport_write; + rte_eal_pci_map_device; + rte_eal_pci_unmap_device; + rte_eal_primary_proc_alive; + +} DPDK_2.2; |