Imported Upstream version 16.04

Change-Id: I77eadcd8538a9122e4773cbe55b24033dc451757 Signed-off-by: C.J. Collier <cjcollier@linuxfoundation.org>
author: C.J. Collier <cjcollier@linuxfoundation.org> 2016-06-14 07:50:17 -0700
committer: C.J. Collier <cjcollier@linuxfoundation.org> 2016-06-14 12:17:54 -0700
commit: 97f17497d162afdb82c8704bf097f0fee3724b2e (patch)
tree: 1c6269614c0c15ffef8451c58ae8f8b30a1bc804 /lib/librte_eal/linuxapp/eal
parent: e04be89c2409570e0055b2cda60bd11395bb93b0 (diff)
23 files changed, 10312 insertions, 0 deletions
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
new file mode 100644
index 00000000..e1093619
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -0,0 +1,139 @@
+#   BSD LICENSE
+#
+#   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the name of Intel Corporation nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+LIB = librte_eal.a
+# ARCH_DIR defaults to RTE_ARCH; sources in that arch directory are found through VPATH
+ARCH_DIR ?= $(RTE_ARCH)
+EXPORT_MAP := rte_eal_version.map
+VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
+
+LIBABIVER := 2
+
+VPATH += $(RTE_SDK)/lib/librte_eal/common
+# header search paths, followed by warning flags and optimisation level
+CFLAGS += -I$(SRCDIR)/include
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
+CFLAGS += -I$(RTE_SDK)/lib/librte_ring
+CFLAGS += -I$(RTE_SDK)/lib/librte_mempool
+CFLAGS += -I$(RTE_SDK)/lib/librte_ivshmem
+CFLAGS += $(WERROR_FLAGS) -O3
+# libraries added to the link line
+LDLIBS += -ldl
+LDLIBS += -lpthread
+LDLIBS += -lgcc_s
+LDLIBS += -lrt
+
+# sources specific to the linuxapp exec-env (Xen dom0 and IVSHMEM files only when configured)
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c
+ifeq ($(CONFIG_RTE_LIBRTE_XEN_DOM0),y)
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_xen_memory.c
+endif
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_uio.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_vfio.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c
+ifeq ($(CONFIG_RTE_LIBRTE_IVSHMEM),y)
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_ivshmem.c
+endif
+
+# sources taken from the common directory (located through VPATH)
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_pci.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_pci_uio.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
+
+# source taken from the arch directory (located through VPATH)
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c
+
+CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
+# per-object flags. NOTE(review): eal_common_whitelist.o has no matching source in SRCS - confirm it is still needed
+CFLAGS_eal.o := -D_GNU_SOURCE
+CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
+CFLAGS_eal_pci_vfio_mp_sync.o := -D_GNU_SOURCE
+CFLAGS_eal_timer.o := -D_GNU_SOURCE
+CFLAGS_eal_lcore.o := -D_GNU_SOURCE
+CFLAGS_eal_thread.o := -D_GNU_SOURCE
+CFLAGS_eal_log.o := -D_GNU_SOURCE
+CFLAGS_eal_common_log.o := -D_GNU_SOURCE
+CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE
+CFLAGS_eal_pci.o := -D_GNU_SOURCE
+CFLAGS_eal_pci_uio.o := -D_GNU_SOURCE
+CFLAGS_eal_pci_vfio.o := -D_GNU_SOURCE
+CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE
+CFLAGS_eal_common_options.o := -D_GNU_SOURCE
+CFLAGS_eal_common_thread.o := -D_GNU_SOURCE
+CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE
+
+# workaround for a gcc bug with noreturn attribute
+# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
+ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
+CFLAGS_eal_thread.o += -Wno-return-type
+endif
+
+INC := rte_interrupts.h rte_kni_common.h rte_dom0_common.h
+# the INC headers are listed for symlinking under include/exec-env
+SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \
+	$(addprefix include/exec-env/,$(INC))
+
+DEPDIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += lib/librte_eal/common
+DEPDIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += lib/librte_eal/common/arch/$(ARCH_DIR)
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
new file mode 100644
index 00000000..8aafd519
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -0,0 +1,936 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2012-2014 6WIND S.A.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <syslog.h>
+#include <getopt.h>
+#include <sys/file.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <errno.h>
+#include <limits.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+#if defined(RTE_ARCH_X86)
+#include <sys/io.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_string_fns.h>
+#include <rte_cpuflags.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_devargs.h>
+#include <rte_common.h>
+#include <rte_version.h>
+#include <rte_atomic.h>
+#include <malloc_heap.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include "eal_hugepages.h"
+#include "eal_options.h"
+
+#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) /* 64MB default when hugetlbfs is off */
+/* a --socket-mem argument of this length or longer is rejected */
+#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10)
+
+/* Allow the application to print its usage message too if set */
+static rte_usage_hook_t	rte_application_usage_hook = NULL;
+
+/* early configuration structure, when memory config is not mmapped */
+static struct rte_mem_config early_mem_config;
+
+/* define fd variable here, because file needs to be kept open for the
+ * duration of the program, as we hold a write lock on it in the primary proc */
+static int mem_cfg_fd = -1;
+/* write lock that spans only the memseg member of struct rte_mem_config */
+static struct flock wr_lock = {
+		.l_type = F_WRLCK,
+		.l_whence = SEEK_SET,
+		.l_start = offsetof(struct rte_mem_config, memseg),
+		.l_len = sizeof(early_mem_config.memseg),
+};
+
+/* Global and public configuration; mem_config starts out on early_mem_config */
+static struct rte_config rte_config = {
+		.mem_config = &early_mem_config,
+};
+
+/* internal configuration (per-core) */
+struct lcore_config lcore_config[RTE_MAX_LCORE];
+
+/* internal configuration */
+struct internal_config internal_config;
+
+/* used by rte_rdtsc() */
+int rte_cycles_vmware_tsc_map;
+
+/* Return a pointer to the file-scope rte_config structure */
+struct rte_config *
+rte_eal_get_configuration(void)
+{
+	return &rte_config;
+}
+
+/* parse a sysfs (or other) file containing one integer value */
+int
+eal_parse_sysfs_value(const char *filename, unsigned long *val)
+{
+	FILE *f;
+	char buf[BUFSIZ];
+	char *end = NULL;
+
+	if ((f = fopen(filename, "r")) == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
+			__func__, filename);
+		return -1;
+	}
+
+	if (fgets(buf, sizeof(buf), f) == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
+			__func__, filename);
+		fclose(f);
+		return -1;
+	}
+	*val = strtoul(buf, &end, 0);
+	if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
+		RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
+				__func__, filename);
+		fclose(f);
+		return -1;
+	}
+	fclose(f);
+	return 0;
+}
+
+
+/* Primary process: create the shared memory configuration file and map it.
+ * A write lock is taken on the memseg area only, which is what lets other
+ * processes auto-detect primary/secondary; the descriptor therefore stays
+ * open for the whole run (it is closed automatically on exit). The rest of
+ * the file is left unlocked so that read-locks on other parts, e.g.
+ * memzones, remain possible in the future. */
+static void
+rte_eal_config_create(void)
+{
+	const char *path = eal_runtime_config_path();
+	void *map_hint = NULL;
+	void *cfg;
+
+	if (internal_config.no_shconf)
+		return;
+
+	/* map the config just below the requested hugepage base address,
+	 * page aligned, so that we don't waste a page */
+	if (internal_config.base_virtaddr != 0)
+		map_hint = (void *)
+			RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
+			sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE));
+
+	if (mem_cfg_fd < 0) {
+		mem_cfg_fd = open(path, O_RDWR | O_CREAT, 0660);
+		if (mem_cfg_fd < 0)
+			rte_panic("Cannot open '%s' for rte_mem_config\n", path);
+	}
+
+	/* size the backing file to hold exactly one rte_mem_config */
+	if (ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config)) < 0) {
+		close(mem_cfg_fd);
+		rte_panic("Cannot resize '%s' for rte_mem_config\n", path);
+	}
+
+	/* F_SETLK does not block: if the lock cannot be taken the call fails
+	 * at once, which is reported as another primary possibly running */
+	if (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0) {
+		close(mem_cfg_fd);
+		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
+				"process running?\n", path);
+	}
+
+	cfg = mmap(map_hint, sizeof(*rte_config.mem_config),
+			PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
+
+	if (cfg == MAP_FAILED) {
+		rte_panic("Cannot mmap memory for rte_config\n");
+	}
+
+	/* seed the shared mapping with the early (pre-mmap) configuration */
+	memcpy(cfg, &early_mem_config, sizeof(early_mem_config));
+	rte_config.mem_config = (struct rte_mem_config *) cfg;
+
+	/* record where the config was mapped, inside the config itself, so
+	 * that secondary processes can map it at this exact location */
+	rte_config.mem_config->mem_cfg_addr = (uintptr_t) cfg;
+}
+
+/* Open the existing shared memory config file and map it read-only */
+static void
+rte_eal_config_attach(void)
+{
+	const char *path = eal_runtime_config_path();
+	void *ro_map;
+
+	if (internal_config.no_shconf)
+		return;
+
+	/* the descriptor may already be open (see eal_proc_type_detect) */
+	if (mem_cfg_fd < 0) {
+		mem_cfg_fd = open(path, O_RDWR);
+		if (mem_cfg_fd < 0)
+			rte_panic("Cannot open '%s' for rte_mem_config\n", path);
+	}
+
+	/* read-only for now; rte_eal_config_reattach() remaps it writable */
+	ro_map = mmap(NULL, sizeof(struct rte_mem_config),
+			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
+	if (ro_map == MAP_FAILED)
+		rte_panic("Cannot mmap memory for rte_config\n");
+
+	rte_config.mem_config = (struct rte_mem_config *) ro_map;
+}
+
+/* Remap the shared config read-write at the address the primary process
+ * recorded inside it, then close the config file descriptor. */
+static void
+rte_eal_config_reattach(void)
+{
+	struct rte_mem_config *mem_config;
+	void *rte_mem_cfg_addr;
+
+	if (internal_config.no_shconf)
+		return;
+
+	/* read the target address before the current mapping goes away */
+	rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
+	munmap(rte_config.mem_config, sizeof(struct rte_mem_config));
+
+	/* the address is only a hint to mmap(), hence the equality check below */
+	mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr,
+			sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED,
+			mem_cfg_fd, 0);
+	close(mem_cfg_fd);
+	/* forget the closed descriptor so later code cannot reuse a stale fd */
+	mem_cfg_fd = -1;
+	if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr)
+		rte_panic("Cannot mmap memory for rte_config\n");
+	rte_config.mem_config = mem_config;
+}
+
+/* Decide whether this process is the primary or a secondary one */
+enum rte_proc_type_t
+eal_proc_type_detect(void)
+{
+	const char *path = eal_runtime_config_path();
+	int secondary = 0;
+
+	/* Config file opens but its write lock cannot be taken: secondary.
+	 * The descriptor is deliberately left open in mem_cfg_fd, to prevent
+	 * a race condition between multiple opens. */
+	mem_cfg_fd = open(path, O_RDWR);
+	if (mem_cfg_fd >= 0)
+		secondary = fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0;
+
+	RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
+			secondary ? "SECONDARY" : "PRIMARY");
+
+	return secondary ? RTE_PROC_SECONDARY : RTE_PROC_PRIMARY;
+}
+
+/* Point rte_config at the shared memory config, creating or attaching to it */
+static void
+rte_config_init(void)
+{
+	const enum rte_proc_type_t type = internal_config.process_type;
+
+	rte_config.process_type = type;
+
+	if (type == RTE_PROC_PRIMARY) {
+		rte_eal_config_create();
+	} else if (type == RTE_PROC_SECONDARY) {
+		/* map read-only, wait on the shared config, remap in place */
+		rte_eal_config_attach();
+		rte_eal_mcfg_wait_complete(rte_config.mem_config);
+		rte_eal_config_reattach();
+	} else if (type == RTE_PROC_AUTO || type == RTE_PROC_INVALID) {
+		/* neither value names a role this function can set up */
+		rte_panic("Invalid process type\n");
+	}
+}
+
+/* Release the hugepage directory locks taken by eal_hugepage_info_init */
+static void
+eal_hugedirs_unlock(void)
+{
+	int idx;
+
+	for (idx = 0; idx < MAX_HUGEPAGE_SIZES; idx++) {
+		struct hugepage_info *hpi = &internal_config.hugepage_info[idx];
+
+		/* a negative descriptor marks an entry that was never set up */
+		if (hpi->lock_descriptor >= 0) {
+			/* unlock, close, then reset the field */
+			flock(hpi->lock_descriptor, LOCK_UN);
+			close(hpi->lock_descriptor);
+			hpi->lock_descriptor = -1;
+		}
+	}
+}
+
+/* Print the EAL usage text, then the application's own if a hook is set */
+static void
+eal_usage(const char *prgname)
+{
+	printf("\nUsage: %s ", prgname);
+	eal_common_usage();
+	printf("EAL Linux options:\n"
+	       "  --"OPT_SOCKET_MEM"        Memory to allocate on sockets (comma separated values)\n"
+	       "  --"OPT_HUGE_DIR"          Directory where hugetlbfs is mounted\n"
+	       "  --"OPT_FILE_PREFIX"       Prefix for hugepage filenames\n"
+	       "  --"OPT_BASE_VIRTADDR"     Base virtual address\n"
+	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
+	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
+	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "\n");
+
+	if (rte_application_usage_hook == NULL)
+		return;
+	printf("===== Application Usage =====\n\n");
+	rte_application_usage_hook(prgname);
+}
+
+/* Install a per-application usage callback.
+ * Returns the callback that was installed before this call. */
+rte_usage_hook_t
+rte_set_application_usage_hook(rte_usage_hook_t usage_func)
+{
+	/* NULL on the very first call, as no hook has been installed yet */
+	const rte_usage_hook_t previous = rte_application_usage_hook;
+
+	rte_application_usage_hook = usage_func;
+
+	return previous;
+}
+
+/* Parse --socket-mem: comma separated per-socket values, each scaled by 1 MiB
+ * into internal_config.socket_mem[]. Returns 0 on success, -1 on bad input. */
+static int
+eal_parse_socket_mem(char *socket_mem)
+{
+	char * arg[RTE_MAX_NUMA_NODES];
+	char *end;
+	int arg_num, i, len;
+	uint64_t total_mem = 0;
+
+	len = strnlen(socket_mem, SOCKET_MEM_STRLEN);
+	if (len == SOCKET_MEM_STRLEN) {
+		RTE_LOG(ERR, EAL, "--socket-mem is too long\n");
+		return -1;
+	}
+
+	/* reject "" first: socket_mem[len-1] would read out of bounds; the cast
+	 * keeps isdigit() defined for any byte. Other errors are caught later */
+	if (len == 0 || !isdigit((unsigned char)socket_mem[len-1]))
+		return -1;
+
+	/* split the optarg into separate socket values */
+	arg_num = rte_strsplit(socket_mem, len,
+			arg, RTE_MAX_NUMA_NODES, ',');
+	/* if split failed, or 0 arguments */
+	if (arg_num <= 0)
+		return -1;
+
+	internal_config.force_sockets = 1;
+
+	/* parse each defined socket option */
+	errno = 0;
+	for (i = 0; i < arg_num; i++) {
+		end = NULL;
+		internal_config.socket_mem[i] = strtoull(arg[i], &end, 10);
+		/* check for invalid input */
+		if ((errno != 0)  ||
+				(arg[i][0] == '\0') || (end == NULL) || (*end != '\0'))
+			return -1;
+		internal_config.socket_mem[i] *= 1024ULL * 1024ULL; /* scale by 1 MiB */
+		total_mem += internal_config.socket_mem[i];
+	}
+
+	/* check if we have a positive amount of total memory */
+	if (total_mem == 0)
+		return -1;
+
+	return 0;
+}
+
+/* Parse --base-virtaddr (hex) into internal_config.base_virtaddr, rounded up
+ * to a 16MB boundary. Returns 0 on success, -1 if the value is rejected. */
+static int
+eal_parse_base_virtaddr(const char *arg)
+{
+	char *tail;
+	uint64_t parsed;
+
+	errno = 0;
+	parsed = strtoull(arg, &tail, 16);
+
+	/* conversion error, empty argument or trailing garbage */
+	if (errno != 0 || arg[0] == '\0' || tail == NULL || *tail != '\0')
+		return -1;
+
+#ifndef RTE_ARCH_64
+	/* make sure we don't exceed 32-bit boundary on 32-bit target */
+	if (parsed >= UINTPTR_MAX)
+		return -1;
+#endif
+
+	/* 16MB is the minimum huge page size on IBM Power, and an address
+	 * aligned to 16MB is aligned to 2MB as well, so the same alignment
+	 * can also be used on x86 */
+	internal_config.base_virtaddr =
+		RTE_PTR_ALIGN_CEIL((uintptr_t)parsed, (size_t)RTE_PGSIZE_16M);
+	return 0;
+}
+
+/* Parse --vfio-intr (legacy|msi|msix): 0 if recognised, -1 otherwise */
+static int
+eal_parse_vfio_intr(const char *mode)
+{
+	static const char * const names[] = { "legacy", "msi", "msix" };
+	static const enum rte_intr_mode modes[] = {
+		RTE_INTR_MODE_LEGACY, RTE_INTR_MODE_MSI, RTE_INTR_MODE_MSIX,
+	};
+	unsigned idx = 0;
+
+	/* names[] and modes[] are parallel tables */
+	while (idx < RTE_DIM(names) && strcmp(mode, names[idx]) != 0)
+		idx++;
+
+	/* ran off the table: the name is not one of the listed modes */
+	if (idx == RTE_DIM(names))
+		return -1;
+
+	internal_config.vfio_intr_mode = modes[idx];
+	return 0;
+}
+
+/* Sum hugepage_sz * num_pages over every size with a hugedir, capped at SIZE_MAX */
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+	uint64_t total = 0;
+	unsigned sz_idx, node;
+
+	for (sz_idx = 0; sz_idx < internal_config.num_hugepage_sizes; sz_idx++) {
+		struct hugepage_info *hpi = &internal_config.hugepage_info[sz_idx];
+
+		if (hpi->hugedir == NULL)
+			continue;	/* no directory recorded for this size */
+		for (node = 0; node < RTE_MAX_NUMA_NODES; node++)
+			total += hpi->hugepage_sz * hpi->num_pages[node];
+	}
+	return (total < SIZE_MAX) ? (size_t)total : SIZE_MAX;
+}
+
+/* Pass over the command line that acts on --log-level only; every other
+ * option is read by getopt but otherwise ignored here. */
+static void
+eal_log_level_parse(int argc, char **argv)
+{
+	/* getopt state saved here and put back before returning */
+	const int saved_optind = optind;
+	const int saved_optopt = optopt;
+	char * const saved_optarg = optarg;
+	int longindex;
+	int opt;
+
+	optind = 1;
+
+	eal_reset_internal_config(&internal_config);
+
+	for (;;) {
+		opt = getopt_long(argc, argv, eal_short_options,
+				  eal_long_options, &longindex);
+
+		/* end of options, or getopt is not happy: stop right now */
+		if (opt == EOF || opt == '?')
+			break;
+
+		/* every option other than --log-level is skipped in this pass */
+		if (opt != OPT_LOG_LEVEL_NUM)
+			continue;
+
+		/* common parser is not happy */
+		if (eal_parse_common_option(opt, optarg, &internal_config) < 0)
+			break;
+	}
+
+	/* restore getopt lib */
+	optind = saved_optind;
+	optopt = saved_optopt;
+	optarg = saved_optarg;
+}
+
+/* Parse the EAL command line: returns optind-1 on success, -1 on error */
+static int
+eal_parse_args(int argc, char **argv)
+{
+	int opt, ret;
+	char **argvopt;
+	int option_index;
+	char *prgname = argv[0];
+	const int old_optind = optind;
+	const int old_optopt = optopt;
+	char * const old_optarg = optarg;
+
+	argvopt = argv;
+	optind = 1;
+
+	while ((opt = getopt_long(argc, argvopt, eal_short_options,
+				  eal_long_options, &option_index)) != EOF) {
+
+		/* getopt is not happy, stop right now */
+		if (opt == '?') {
+			eal_usage(prgname);
+			ret = -1;
+			goto out;
+		}
+
+		ret = eal_parse_common_option(opt, optarg, &internal_config);
+		/* common parser is not happy */
+		if (ret < 0) {
+			eal_usage(prgname);
+			ret = -1;
+			goto out;
+		}
+		/* common parser handled this option */
+		if (ret == 0)
+			continue;
+		/* ret > 0: not handled by the common parser, try the Linux options */
+		switch (opt) {
+		case 'h':
+			eal_usage(prgname);
+			exit(EXIT_SUCCESS);
+
+		/* long options */
+		case OPT_XEN_DOM0_NUM:
+#ifdef RTE_LIBRTE_XEN_DOM0
+			internal_config.xen_dom0_support = 1;
+#else
+			RTE_LOG(ERR, EAL, "Can't support DPDK app "
+				"running on Dom0, please configure"
+				" RTE_LIBRTE_XEN_DOM0=y\n");
+			ret = -1;
+			goto out;
+#endif
+			break;
+
+		case OPT_HUGE_DIR_NUM:
+			internal_config.hugepage_dir = optarg;
+			break;
+
+		case OPT_FILE_PREFIX_NUM:
+			internal_config.hugefile_prefix = optarg;
+			break;
+
+		case OPT_SOCKET_MEM_NUM:
+			if (eal_parse_socket_mem(optarg) < 0) {
+				RTE_LOG(ERR, EAL, "invalid parameters for --"
+						OPT_SOCKET_MEM "\n");
+				eal_usage(prgname);
+				ret = -1;
+				goto out;
+			}
+			break;
+
+		case OPT_BASE_VIRTADDR_NUM:
+			if (eal_parse_base_virtaddr(optarg) < 0) {
+				RTE_LOG(ERR, EAL, "invalid parameter for --"
+						OPT_BASE_VIRTADDR "\n");
+				eal_usage(prgname);
+				ret = -1;
+				goto out;
+			}
+			break;
+
+		case OPT_VFIO_INTR_NUM:
+			if (eal_parse_vfio_intr(optarg) < 0) {
+				RTE_LOG(ERR, EAL, "invalid parameters for --"
+						OPT_VFIO_INTR "\n");
+				eal_usage(prgname);
+				ret = -1;
+				goto out;
+			}
+			break;
+
+		case OPT_CREATE_UIO_DEV_NUM:
+			internal_config.create_uio_dev = 1;
+			break;
+
+		default:
+			if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
+				RTE_LOG(ERR, EAL, "Option %c is not supported "
+					"on Linux\n", opt);
+			} else if (opt >= OPT_LONG_MIN_NUM &&
+				   opt < OPT_LONG_MAX_NUM) {
+				RTE_LOG(ERR, EAL, "Option %s is not supported "
+					"on Linux\n",
+					eal_long_options[option_index].name);
+			} else {
+				RTE_LOG(ERR, EAL, "Option %d is not supported "
+					"on Linux\n", opt);
+			}
+			eal_usage(prgname);
+			ret = -1;
+			goto out;
+		}
+	}
+
+	if (eal_adjust_config(&internal_config) != 0) {
+		ret = -1;
+		goto out;
+	}
+
+	/* sanity checks */
+	if (eal_check_common_options(&internal_config) != 0) {
+		eal_usage(prgname);
+		ret = -1;
+		goto out;
+	}
+
+	/* --xen-dom0 doesn't make sense with --socket-mem */
+	if (internal_config.xen_dom0_support && internal_config.force_sockets == 1) {
+		RTE_LOG(ERR, EAL, "Options --"OPT_SOCKET_MEM" cannot be specified "
+			"together with --"OPT_XEN_DOM0"\n");
+		eal_usage(prgname);
+		ret = -1;
+		goto out;
+	}
+	/* NOTE(review): optind == 0 passes this guard and would write argv[-1]; presumably it should be optind > 0 - confirm */
+	if (optind >= 0)
+		argv[optind-1] = prgname;
+	ret = optind-1;
+
+out:
+	/* restore getopt lib */
+	optind = old_optind;
+	optopt = old_optopt;
+	optarg = old_optarg;
+
+	return ret;
+}
+
+/* Log a warning when none of the memory segments with a non-zero length
+ * belongs to the socket of the master lcore. */
+static void
+eal_check_mem_on_local_socket(void)
+{
+	const int local = rte_lcore_to_socket_id(rte_config.master_lcore);
+	const struct rte_memseg *seg = rte_eal_get_physmem_layout();
+	const struct rte_memseg *const last = seg + RTE_MAX_MEMSEG;
+
+	/* one non-empty segment on the local socket is enough */
+	for (; seg != last; seg++) {
+		if (seg->len > 0 && seg->socket_id == local)
+			return;
+	}
+
+	RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
+			"memory on local socket!\n");
+}
+
+static int
+sync_func(__attribute__((unused)) void *arg)
+{
+	return 0;	/* no work: rte_eal_init() launches this dummy on the slave lcores */
+}
+
+inline static void
+rte_eal_mcfg_complete(void)
+{
+	if (rte_config.process_type != RTE_PROC_PRIMARY)
+		return;	/* only the primary process sets the magic value */
+	rte_config.mem_config->magic = RTE_MAGIC;
+}
+
+/*
+ * Request I/O privilege level 3 through iopl().
+ * Returns 0 on success and -1 when the privilege could not be obtained.
+ * iopl() is only called on x86. ARM and ARM64 builds return 0 without
+ * calling it; every other architecture returns -1.
+ */
+int
+rte_eal_iopl_init(void)
+{
+#if defined(RTE_ARCH_X86)
+	/* any non-zero result from iopl() is reported as -1 */
+	return (iopl(3) == 0) ? 0 : -1;
+#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+	return 0; /* iopl syscall not supported for ARM/ARM64 */
+#else
+	return -1;
+#endif
+}
+
+/* Initialise the EAL and launch the lcore threads; called at application init(). */
+int
+rte_eal_init(int argc, char **argv)
+{
+	int i, fctret, ret;
+	pthread_t thread_id;
+	static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
+	const char *logid;
+	char cpuset[RTE_CPU_AFFINITY_STR_LEN];
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+	if (!rte_atomic32_test_and_set(&run_once))
+		return -1;	/* run_once was already set by an earlier call */
+
+	logid = strrchr(argv[0], '/');
+	logid = strdup(logid ? logid + 1: argv[0]);
+	/* NOTE(review): the strdup() result is not checked for NULL - confirm rte_eal_log_init() copes with that */
+	thread_id = pthread_self();
+
+	if (rte_eal_log_early_init() < 0)
+		rte_panic("Cannot init early logs\n");
+
+	eal_log_level_parse(argc, argv);
+
+	/* set log level as early as possible */
+	rte_set_log_level(internal_config.log_level);
+
+	if (rte_eal_cpu_init() < 0)
+		rte_panic("Cannot detect lcores\n");
+
+	fctret = eal_parse_args(argc, argv);
+	if (fctret < 0)
+		exit(1);
+
+	if (internal_config.no_hugetlbfs == 0 &&
+			internal_config.process_type != RTE_PROC_SECONDARY &&
+			internal_config.xen_dom0_support == 0 &&
+			eal_hugepage_info_init() < 0)
+		rte_panic("Cannot get hugepage information\n");
+
+	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
+		if (internal_config.no_hugetlbfs)
+			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
+		else
+			internal_config.memory = eal_get_hugepage_mem_size();
+	}
+
+	if (internal_config.vmware_tsc_map == 1) {
+#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
+		rte_cycles_vmware_tsc_map = 1;
+		RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
+				"you must have monitor_control.pseudo_perfctr = TRUE\n");
+#else
+		RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
+				"RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
+#endif
+	}
+
+	rte_srand(rte_rdtsc());
+
+	rte_config_init();
+
+	if (rte_eal_pci_init() < 0)
+		rte_panic("Cannot init PCI\n");
+
+#ifdef RTE_LIBRTE_IVSHMEM
+	if (rte_eal_ivshmem_init() < 0)
+		rte_panic("Cannot init IVSHMEM\n");
+#endif
+
+	if (rte_eal_memory_init() < 0)
+		rte_panic("Cannot init memory\n");
+
+	/* the directories are locked during eal_hugepage_info_init */
+	eal_hugedirs_unlock();
+
+	if (rte_eal_memzone_init() < 0)
+		rte_panic("Cannot init memzone\n");
+
+	if (rte_eal_tailqs_init() < 0)
+		rte_panic("Cannot init tail queues for objects\n");
+
+#ifdef RTE_LIBRTE_IVSHMEM
+	if (rte_eal_ivshmem_obj_init() < 0)
+		rte_panic("Cannot init IVSHMEM objects\n");
+#endif
+
+	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0)
+		rte_panic("Cannot init logs\n");
+
+	if (rte_eal_alarm_init() < 0)
+		rte_panic("Cannot init interrupt-handling thread\n");
+
+	if (rte_eal_timer_init() < 0)
+		rte_panic("Cannot init HPET or TSC timers\n");
+
+	eal_check_mem_on_local_socket();
+
+	if (eal_plugins_init() < 0)
+		rte_panic("Cannot init plugins\n");
+
+	eal_thread_init_master(rte_config.master_lcore);
+
+	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
+
+	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
+		rte_config.master_lcore, (int)thread_id, cpuset,
+		ret == 0 ? "" : "...");
+
+	if (rte_eal_dev_init() < 0)
+		rte_panic("Cannot init pmd devices\n");
+
+	if (rte_eal_intr_init() < 0)
+		rte_panic("Cannot init interrupt-handling thread\n");
+
+	RTE_LCORE_FOREACH_SLAVE(i) {
+
+		/*
+		 * create communication pipes between master thread
+		 * and children
+		 */
+		if (pipe(lcore_config[i].pipe_master2slave) < 0)
+			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_slave2master) < 0)
+			rte_panic("Cannot create pipe\n");
+
+		lcore_config[i].state = WAIT;
+
+		/* create a thread for each lcore */
+		ret = pthread_create(&lcore_config[i].thread_id, NULL,
+				     eal_thread_loop, NULL);
+		if (ret != 0)
+			rte_panic("Cannot create thread\n");
+
+		/* Set thread_name for aid in debugging. */
+		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+			"lcore-slave-%d", i);
+		ret = rte_thread_setname(lcore_config[i].thread_id,
+						thread_name);
+		if (ret != 0)
+			RTE_LOG(ERR, EAL,
+				"Cannot set name for lcore thread\n");
+	}
+
+	/*
+	 * Launch a dummy function on all slave lcores, so that master lcore
+	 * knows they are all ready when this function returns.
+	 */
+	rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
+	rte_eal_mp_wait_lcore();
+
+	/* Probe & Initialize PCI devices */
+	if (rte_eal_pci_probe())
+		rte_panic("Cannot probe PCI\n");
+
+	rte_eal_mcfg_complete();
+
+	return fctret;	/* the value eal_parse_args() returned */
+}
+
+/* get core role; an lcore_id of RTE_MAX_LCORE or more yields ROLE_OFF */
+enum rte_lcore_role_t rte_eal_lcore_role(unsigned lcore_id)
+{
+	return lcore_id < RTE_MAX_LCORE ?
+		rte_config.lcore_role[lcore_id] : ROLE_OFF;
+}
+
+enum rte_proc_type_t
+rte_eal_process_type(void)
+{
+	return rte_config.process_type;	/* assigned in rte_config_init() */
+}
+
+int rte_eal_has_hugepages(void)
+{
+	return internal_config.no_hugetlbfs == 0;	/* 1 unless no_hugetlbfs is set */
+}
+
+/* Look for /sys/module/<module_name>: returns 1 if it exists, 0 if it does
+ * not, -1 on error (NULL name, /sys/module missing, path does not fit). */
+int
+rte_eal_check_module(const char *module_name)
+{
+	char sysfs_mod_name[PATH_MAX];
+	struct stat st;
+	int n;
+
+	if (NULL == module_name)
+		return -1;
+
+	/* Check if there is sysfs mounted */
+	if (stat("/sys/module", &st) != 0) {
+		RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n",
+			errno, strerror(errno));
+		return -1;
+	}
+
+	/* A module might be built-in, therefore try sysfs */
+	n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name);
+	if (n < 0 || n >= PATH_MAX) {	/* n excludes the NUL: n == PATH_MAX is truncated */
+		RTE_LOG(DEBUG, EAL, "Could not format module path\n");
+		return -1;
+	}
+
+	if (stat(sysfs_mod_name, &st) != 0) {
+		RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n",
+		        sysfs_mod_name, errno, strerror(errno));
+		return 0;
+	}
+	return 1;	/* Module has been found */
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c b/lib/librte_eal/linuxapp/eal/eal_alarm.c
new file mode 100644
index 00000000..8b042abc
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_alarm.c
@@ -0,0 +1,273 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+#include <sys/timerfd.h>
+
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_interrupts.h>
+#include <rte_alarm.h>
+#include <rte_common.h>
+#include <rte_per_lcore.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_spinlock.h>
+#include <eal_private.h>
+
+#ifndef	TFD_NONBLOCK
+#include <fcntl.h>
+#define	TFD_NONBLOCK	O_NONBLOCK
+#endif
+
+#define NS_PER_US 1000
+#define US_PER_MS 1000
+#define MS_PER_S 1000
+#define US_PER_S (US_PER_MS * MS_PER_S)
+
+#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
+#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW
+#else
+#define CLOCK_TYPE_ID CLOCK_MONOTONIC
+#endif
+
+/* One pending alarm; entries are linked into alarm_list. */
+struct alarm_entry {
+	LIST_ENTRY(alarm_entry) next;
+	/* absolute expiry time, computed from CLOCK_TYPE_ID in rte_eal_alarm_set() */
+	struct timeval time;
+	/* user callback and the argument handed to it */
+	rte_eal_alarm_callback cb_fn;
+	void *cb_arg;
+	/* set to 1 by eal_alarm_callback() right before cb_fn is invoked */
+	volatile uint8_t executing;
+	/* thread that invokes cb_fn; only written together with 'executing' */
+	volatile pthread_t executing_id;
+};
+
+/* pending alarms; rte_eal_alarm_set() inserts them in ascending expiry order */
+static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER();
+/* taken around every access to alarm_list in this file */
+static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER;
+
+/* interrupt handle wrapping the timerfd created by rte_eal_alarm_init() */
+static struct rte_intr_handle intr_handle = {.fd = -1 };
+/* non-zero once eal_alarm_callback has been registered on intr_handle */
+static int handler_registered = 0;
+static void eal_alarm_callback(struct rte_intr_handle *hdl, void *arg);
+
+/*
+ * Prepare the alarm subsystem: tag the file-local interrupt handle as an
+ * alarm source and back it with a non-blocking timerfd.
+ *
+ * Returns 0 on success. On failure returns -1 and stores the errno left by
+ * timerfd_create() in rte_errno.
+ */
+int
+rte_eal_alarm_init(void)
+{
+	intr_handle.type = RTE_INTR_HANDLE_ALARM;
+
+	/* the timer itself is only armed once an alarm gets queued */
+	intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+	if (intr_handle.fd != -1)
+		return 0;
+
+	rte_errno = errno;
+	return -1;
+}
+
+/*
+ * Interrupt callback registered on the alarm timerfd handle.
+ *
+ * Runs every alarm at the head of alarm_list whose expiry time is not later
+ * than the current CLOCK_TYPE_ID time, then re-arms the timerfd for the next
+ * pending alarm, if any. Both parameters are unused.
+ */
+static void
+eal_alarm_callback(struct rte_intr_handle *hdl __rte_unused,
+		void *arg __rte_unused)
+{
+	struct timespec now;
+	struct alarm_entry *ap;
+
+	rte_spinlock_lock(&alarm_list_lk);
+	/*
+	 * Only the head needs checking because the list is kept sorted by
+	 * expiry. The clock is re-read on every iteration, so alarms that
+	 * expire while an earlier callback runs are handled in the same pass.
+	 */
+	while ((ap = LIST_FIRST(&alarm_list)) !=NULL &&
+			clock_gettime(CLOCK_TYPE_ID, &now) == 0 &&
+			(ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec &&
+						(ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) {
+		/* flag the entry so rte_eal_alarm_cancel() will not free it */
+		ap->executing = 1;
+		ap->executing_id = pthread_self();
+		/* the user callback runs without the list lock held */
+		rte_spinlock_unlock(&alarm_list_lk);
+
+		ap->cb_fn(ap->cb_arg);
+
+		rte_spinlock_lock(&alarm_list_lk);
+
+		/* the entry is single-shot: unlink and release it */
+		LIST_REMOVE(ap, next);
+		rte_free(ap);
+	}
+
+	/*
+	 * Re-arm the timer with the remaining delay (head expiry - now).
+	 * 'now' holds the value read by the last loop condition check.
+	 * NOTE(review): if that clock_gettime() call failed, 'now' is not
+	 * guaranteed to hold a valid time here -- confirm this is acceptable.
+	 */
+	if (!LIST_EMPTY(&alarm_list)) {
+		struct itimerspec atime = { .it_interval = { 0, 0 } };
+
+		ap = LIST_FIRST(&alarm_list);
+		atime.it_value.tv_sec = ap->time.tv_sec;
+		atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US;
+		/* perform borrow for subtraction if necessary */
+		if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US))
+			atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US;
+
+		atime.it_value.tv_sec -= now.tv_sec;
+		atime.it_value.tv_nsec -= now.tv_nsec;
+		/* flags == 0: the value is a relative timeout; result is ignored */
+		timerfd_settime(intr_handle.fd, 0, &atime, NULL);
+	}
+	rte_spinlock_unlock(&alarm_list_lk);
+}
+
+/*
+ * Schedule cb_fn(cb_arg) to be called once, about 'us' microseconds from now.
+ *
+ * The alarm is inserted into alarm_list in ascending expiry order; when it
+ * becomes the earliest pending alarm the timerfd is re-armed for it.
+ *
+ * Returns 0 on success, -EINVAL for a zero/too large 'us' or a NULL cb_fn,
+ * -ENOMEM if the entry cannot be allocated, or the non-zero result of
+ * rte_intr_callback_register() / timerfd_settime() when one of them fails.
+ * If the interrupt handler cannot be registered, nothing is queued.
+ */
+int
+rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg)
+{
+	struct timespec now;
+	int ret = 0;
+	struct alarm_entry *ap, *new_alarm;
+
+	/* Check parameters, including that us won't cause a uint64_t overflow */
+	if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL)
+		return -EINVAL;
+
+	new_alarm = rte_zmalloc(NULL, sizeof(*new_alarm), 0);
+	if (new_alarm == NULL)
+		return -ENOMEM;
+
+	/* use current time to calculate absolute time of alarm */
+	clock_gettime(CLOCK_TYPE_ID, &now);
+
+	new_alarm->cb_fn = cb_fn;
+	new_alarm->cb_arg = cb_arg;
+	new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S;
+	new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S);
+
+	rte_spinlock_lock(&alarm_list_lk);
+	if (!handler_registered) {
+		ret = rte_intr_callback_register(&intr_handle,
+				eal_alarm_callback, NULL);
+		if (ret != 0) {
+			/*
+			 * Without the handler nobody services the timerfd.
+			 * Do not queue the alarm: the caller is told the
+			 * call failed and must not see cb_fn fire later.
+			 */
+			rte_spinlock_unlock(&alarm_list_lk);
+			rte_free(new_alarm);
+			return ret;
+		}
+		handler_registered = 1;
+	}
+
+	/* keep the list sorted: insert before the first later alarm */
+	if (LIST_EMPTY(&alarm_list))
+		LIST_INSERT_HEAD(&alarm_list, new_alarm, next);
+	else {
+		LIST_FOREACH(ap, &alarm_list, next) {
+			if (ap->time.tv_sec > new_alarm->time.tv_sec ||
+					(ap->time.tv_sec == new_alarm->time.tv_sec &&
+							ap->time.tv_usec > new_alarm->time.tv_usec)){
+				LIST_INSERT_BEFORE(ap, new_alarm, next);
+				break;
+			}
+			/* reached the tail: the new alarm is the latest one */
+			if (LIST_NEXT(ap, next) == NULL) {
+				LIST_INSERT_AFTER(ap, new_alarm, next);
+				break;
+			}
+		}
+	}
+
+	/* the new alarm is now the earliest one: re-arm the timer for it */
+	if (LIST_FIRST(&alarm_list) == new_alarm) {
+		struct itimerspec alarm_time = {
+			.it_interval = {0, 0},
+			.it_value = {
+				.tv_sec = us / US_PER_S,
+				.tv_nsec = (us % US_PER_S) * NS_PER_US,
+			},
+		};
+		ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL);
+	}
+	rte_spinlock_unlock(&alarm_list_lk);
+
+	return ret;
+}
+
+/*
+ * Remove every pending alarm whose callback is cb_fn and whose argument is
+ * cb_arg; a cb_arg of (void *)-1 matches any argument.
+ *
+ * Alarms whose callback is currently running on another thread cannot be
+ * removed; the whole scan is repeated until none of them is left. An alarm
+ * that is running on the calling thread is left alone and reported through
+ * rte_errno = EINPROGRESS.
+ *
+ * Returns the number of alarms removed, or -1 with rte_errno = EINVAL when
+ * cb_fn is NULL. When nothing was removed and no EINPROGRESS condition was
+ * seen, rte_errno is set to ENOENT.
+ */
+int
+rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg)
+{
+	struct alarm_entry *ap, *ap_prev;
+	int count = 0;
+	int err = 0;
+	int executing;
+
+	if (!cb_fn) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	do {
+		/* counts matches seen running on other threads in this pass */
+		executing = 0;
+		rte_spinlock_lock(&alarm_list_lk);
+		/* remove any matches at the start of the list */
+		while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
+				cb_fn == ap->cb_fn &&
+				(cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
+
+			if (ap->executing == 0) {
+				LIST_REMOVE(ap, next);
+				rte_free(ap);
+				count++;
+			} else {
+				/* If calling from other context, mark that alarm is executing
+				 * so loop can spin till it finish. Otherwise we are trying to
+				 * cancel our self - mark it by EINPROGRESS */
+				if (pthread_equal(ap->executing_id, pthread_self()) == 0)
+					executing++;
+				else
+					err = EINPROGRESS;
+
+				break;
+			}
+		}
+		/* ap is now the surviving list head (or NULL for an empty list) */
+		ap_prev = ap;
+
+		/* now go through list, removing entries not at start */
+		LIST_FOREACH(ap, &alarm_list, next) {
+			/* this won't be true first time through */
+			if (cb_fn == ap->cb_fn &&
+					(cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
+
+				if (ap->executing == 0) {
+					LIST_REMOVE(ap, next);
+					rte_free(ap);
+					count++;
+					/* step back so LIST_FOREACH advances from a
+					 * live entry instead of the freed one */
+					ap = ap_prev;
+				} else if (pthread_equal(ap->executing_id, pthread_self()) == 0)
+					executing++;
+				else
+					err = EINPROGRESS;
+			}
+			ap_prev = ap;
+		}
+		rte_spinlock_unlock(&alarm_list_lk);
+	} while (executing != 0);
+
+	if (count == 0 && err == 0)
+		rte_errno = ENOENT;
+	else if (err)
+		rte_errno = err;
+
+	return count;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c
new file mode 100644
index 00000000..907fbfa7
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_debug.c
@@ -0,0 +1,119 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <execinfo.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_common.h>
+
+#define BACKTRACE_SIZE 256
+
+/*
+ * Log the backtrace of the calling thread at ERR level, one frame per line,
+ * starting with the last entry returned by backtrace() and counting down.
+ */
+void rte_dump_stack(void)
+{
+	void *frames[BACKTRACE_SIZE];
+	char **names;
+	int depth, idx;
+
+	depth = backtrace(frames, BACKTRACE_SIZE);
+	names = backtrace_symbols(frames, depth);
+	if (names == NULL)
+		return;
+
+	for (idx = depth; idx > 0; idx--)
+		rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL,
+			"%d: [%s]\n", idx, names[idx - 1]);
+
+	/* backtrace_symbols() hands back a single malloc()ed array */
+	free(names);
+}
+
+/* Register dump is not implemented in this environment: intentionally empty. */
+void rte_dump_registers(void)
+{
+}
+
+/*
+ * Log a CRIT "PANIC in <funcname>()" banner followed by the formatted
+ * message, dump the stack and registers, then abort() (which produces a
+ * core dump if enabled). Never returns.
+ */
+void __rte_panic(const char *funcname, const char *format, ...)
+{
+	va_list args;
+
+	/* disable history */
+	rte_log_set_history(0);
+
+	rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
+
+	va_start(args, format);
+	rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, args);
+	va_end(args);
+
+	rte_dump_stack();
+	rte_dump_registers();
+	abort();
+}
+
+/*
+ * Terminate the application after logging the formatted message at CRIT
+ * level. For a non-zero exit_code an "Error - exiting" prefix is logged
+ * first. Unlike rte_panic, the default build neither prints a traceback nor
+ * generates a core dump; builds with RTE_EAL_ALWAYS_PANIC_ON_ERROR dump the
+ * stack and abort() instead of calling exit().
+ */
+void
+rte_exit(int exit_code, const char *format, ...)
+{
+	va_list args;
+
+	/* disable history */
+	rte_log_set_history(0);
+
+	if (exit_code != 0)
+		RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
+				"  Cause: ", exit_code);
+
+	va_start(args, format);
+	rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, args);
+	va_end(args);
+
+#ifdef RTE_EAL_ALWAYS_PANIC_ON_ERROR
+	rte_dump_stack();
+	rte_dump_registers();
+	abort();
+#else
+	exit(exit_code);
+#endif
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
new file mode 100644
index 00000000..18858e2d
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -0,0 +1,365 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <sys/types.h>
+#include <sys/file.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/queue.h>
+
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_debug.h>
+#include <rte_log.h>
+#include <rte_common.h>
+#include "rte_string_fns.h"
+#include "eal_internal_cfg.h"
+#include "eal_hugepages.h"
+#include "eal_filesystem.h"
+
+static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
+
+/*
+ * Return the number of usable hugepages reported for one size directory
+ * (subdir) under sys_dir_path: free_hugepages minus resv_hugepages, floored
+ * at 0 and capped at UINT32_MAX. Returns 0 if either sysfs value cannot be
+ * parsed.
+ *
+ * this function is only called from eal_hugepage_info_init which itself
+ * is only called from a primary process
+ */
+static uint32_t
+get_num_hugepages(const char *subdir)
+{
+	const char *free_file = "free_hugepages";
+	const char *resv_file = "resv_hugepages";
+	char path[PATH_MAX];
+	long unsigned resv_pages, num_pages = 0;
+
+	/* reserved pages are read first, then the free ones */
+	snprintf(path, sizeof(path), "%s/%s/%s",
+			sys_dir_path, subdir, resv_file);
+	if (eal_parse_sysfs_value(path, &resv_pages) < 0)
+		return 0;
+
+	snprintf(path, sizeof(path), "%s/%s/%s",
+			sys_dir_path, subdir, free_file);
+	if (eal_parse_sysfs_value(path, &num_pages) < 0)
+		return 0;
+
+	if (num_pages == 0)
+		RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+				subdir);
+
+	/* reserved pages are not available to us; never go below zero */
+	num_pages = (num_pages >= resv_pages) ? num_pages - resv_pages : 0;
+
+	/* the result must fit the uint32_t return type */
+	return (num_pages > UINT32_MAX) ? UINT32_MAX : num_pages;
+}
+
+/*
+ * Read the default hugepage size from the "Hugepagesize:" line of
+ * /proc/meminfo. Panics if the file cannot be opened or no non-zero size
+ * is found.
+ */
+static uint64_t
+get_default_hp_size(void)
+{
+	const char proc_meminfo[] = "/proc/meminfo";
+	const char str_hugepagesz[] = "Hugepagesize:";
+	unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1;
+	char line[256];
+	unsigned long long size = 0;
+	FILE *meminfo;
+
+	meminfo = fopen(proc_meminfo, "r");
+	if (meminfo == NULL)
+		rte_panic("Cannot open %s\n", proc_meminfo);
+
+	while (fgets(line, sizeof(line), meminfo) != NULL) {
+		if (strncmp(line, str_hugepagesz, hugepagesz_len) != 0)
+			continue;
+
+		/* the value follows right after the key */
+		size = rte_str_to_size(&line[hugepagesz_len]);
+		break;
+	}
+	fclose(meminfo);
+
+	if (size == 0)
+		rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo);
+	return size;
+}
+
+/*
+ * Search /proc/mounts for a hugetlbfs mount serving pages of hugepage_sz
+ * bytes. A mount without a "pagesize=" option is taken to use the default
+ * hugepage size. When internal_config.hugepage_dir is set, only that mount
+ * point is considered.
+ *
+ * Returns a strdup()ed copy of the mount point, or NULL when no mount
+ * matches or a line cannot be split into the expected fields. Panics if
+ * /proc/mounts cannot be opened.
+ */
+static const char *
+get_hugepage_dir(uint64_t hugepage_sz)
+{
+	enum proc_mount_fieldnames {
+		DEVICE = 0,
+		MOUNTPT,
+		FSTYPE,
+		OPTIONS,
+		_FIELDNAME_MAX
+	};
+	/* looked up once and reused by later calls */
+	static uint64_t default_size = 0;
+	const char proc_mounts[] = "/proc/mounts";
+	const char hugetlbfs_str[] = "hugetlbfs";
+	const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;
+	const char pagesize_opt[] = "pagesize=";
+	const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;
+	const char split_tok = ' ';
+	char *fields[_FIELDNAME_MAX];
+	char line[BUFSIZ];
+	char *mountpt = NULL;
+	FILE *mounts;
+
+	mounts = fopen(proc_mounts, "r");
+	if (mounts == NULL)
+		rte_panic("Cannot open %s\n", proc_mounts);
+
+	if (default_size == 0)
+		default_size = get_default_hp_size();
+
+	while (fgets(line, sizeof(line), mounts) != NULL) {
+		const char *pagesz_str;
+		uint64_t mount_pagesz;
+
+		if (rte_strsplit(line, sizeof(line), fields, _FIELDNAME_MAX,
+				split_tok) != _FIELDNAME_MAX) {
+			RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);
+			break; /* return NULL */
+		}
+
+		/* we have a specified --huge-dir option, only examine that dir */
+		if (internal_config.hugepage_dir != NULL &&
+				strcmp(fields[MOUNTPT], internal_config.hugepage_dir) != 0)
+			continue;
+
+		/* only hugetlbfs mounts are of interest */
+		if (strncmp(fields[FSTYPE], hugetlbfs_str, htlbfs_str_len) != 0)
+			continue;
+
+		/* explicit page size if given, the default one otherwise */
+		pagesz_str = strstr(fields[OPTIONS], pagesize_opt);
+		if (pagesz_str == NULL)
+			mount_pagesz = default_size;
+		else
+			mount_pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);
+
+		if (mount_pagesz == hugepage_sz) {
+			mountpt = strdup(fields[MOUNTPT]);
+			break;
+		}
+	}
+
+	fclose(mounts);
+	return mountpt;
+}
+
+/*
+ * Clear the hugepage directory of whatever hugepage files
+ * there are. Checks if the file is locked (i.e.
+ * if it's in use by another DPDK process).
+ *
+ * Only entries matching the "*map_*" pattern are considered; an entry is
+ * unlinked only if an exclusive, non-blocking flock() on it succeeds.
+ * Entries that cannot be opened or locked are left in place.
+ *
+ * Returns 0 on success, -1 if the directory cannot be opened or read.
+ */
+static int
+clear_hugedir(const char * hugedir)
+{
+	DIR *dir;
+	struct dirent *dirent;
+	int dir_fd, fd, lck_result;
+	const char filter[] = "*map_*"; /* matches hugepage files */
+
+	/* open directory */
+	dir = opendir(hugedir);
+	if (!dir) {
+		RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n",
+				hugedir);
+		goto error;
+	}
+	dir_fd = dirfd(dir);
+
+	dirent = readdir(dir);
+	if (!dirent) {
+		RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n",
+				hugedir);
+		goto error;
+	}
+
+	for (; dirent != NULL; dirent = readdir(dir)) {
+		/*
+		 * skip files that don't match the hugepage pattern; fnmatch()
+		 * returns 0 only for a match, and any other value (no match
+		 * or an error) must never lead to the file being removed
+		 */
+		if (fnmatch(filter, dirent->d_name, 0) != 0)
+			continue;
+
+		/* try and lock the file */
+		fd = openat(dir_fd, dirent->d_name, O_RDONLY);
+
+		/* skip to next file */
+		if (fd == -1)
+			continue;
+
+		/* non-blocking lock */
+		lck_result = flock(fd, LOCK_EX | LOCK_NB);
+
+		/* if lock succeeds, unlock and remove the file */
+		if (lck_result != -1) {
+			flock(fd, LOCK_UN);
+			unlinkat(dir_fd, dirent->d_name, 0);
+		}
+		close (fd);
+	}
+
+	closedir(dir);
+	return 0;
+
+error:
+	if (dir)
+		closedir(dir);
+
+	RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n",
+		strerror(errno));
+
+	return -1;
+}
+
+/*
+ * qsort() comparator ordering struct hugepage_info entries by hugepage_sz,
+ * largest first: returns a negative value when a is larger than b, a
+ * positive value when it is smaller, and 0 when both sizes are equal.
+ *
+ * The sizes are compared explicitly rather than subtracted: the difference
+ * of two 64-bit sizes does not fit the int return value and, once
+ * truncated, can carry the wrong sign or collapse to 0.
+ */
+static int
+compare_hpi(const void *a, const void *b)
+{
+	const struct hugepage_info *hpi_a = a;
+	const struct hugepage_info *hpi_b = b;
+
+	if (hpi_a->hugepage_sz < hpi_b->hugepage_sz)
+		return 1;
+	if (hpi_a->hugepage_sz > hpi_b->hugepage_sz)
+		return -1;
+	return 0;
+}
+
+/*
+ * Fill internal_config.hugepage_info[] from the "hugepages-<size>"
+ * directories found under sys_dir_path.
+ *
+ * when we initialize the hugepage info, everything goes
+ * to socket 0 by default. it will later get sorted by memory
+ * initialization procedure.
+ *
+ * Returns 0 if at least one page size has a mounted hugetlbfs and a
+ * non-zero page count, -1 otherwise or if the directory scan was cut short.
+ * Panics if sys_dir_path cannot be opened.
+ */
+int
+eal_hugepage_info_init(void)
+{
+	const char dirent_start_text[] = "hugepages-";
+	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
+	unsigned i, num_sizes = 0;
+	DIR *dir;
+	struct dirent *dirent;
+
+	dir = opendir(sys_dir_path);
+	if (dir == NULL)
+		rte_panic("Cannot open directory %s to read system hugepage "
+			  "info\n", sys_dir_path);
+
+	for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
+		struct hugepage_info *hpi;
+
+		/* only "hugepages-<size>" entries describe a page size */
+		if (strncmp(dirent->d_name, dirent_start_text,
+			    dirent_start_len) != 0)
+			continue;
+
+		/* table is full; leaving here is reported as an error below */
+		if (num_sizes >= MAX_HUGEPAGE_SIZES)
+			break;
+
+		/* the slot is only kept (num_sizes++) if a mount point exists */
+		hpi = &internal_config.hugepage_info[num_sizes];
+		hpi->hugepage_sz =
+			rte_str_to_size(&dirent->d_name[dirent_start_len]);
+		hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);
+
+		/* first, check if we have a mountpoint */
+		if (hpi->hugedir == NULL) {
+			uint32_t num_pages;
+
+			num_pages = get_num_hugepages(dirent->d_name);
+			if (num_pages > 0)
+				RTE_LOG(NOTICE, EAL,
+					"%" PRIu32 " hugepages of size "
+					"%" PRIu64 " reserved, but no mounted "
+					"hugetlbfs found for that size\n",
+					num_pages, hpi->hugepage_sz);
+			continue;
+		}
+
+		/* try to obtain a writelock */
+		/* NOTE(review): open() is not checked; a failure only shows up
+		 * as a failed flock() on the invalid descriptor below */
+		hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
+
+		/* if blocking lock failed */
+		if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
+			RTE_LOG(CRIT, EAL,
+				"Failed to lock hugepage directory!\n");
+			break;
+		}
+		/* clear out the hugepages dir from unused pages */
+		if (clear_hugedir(hpi->hugedir) == -1)
+			break;
+
+		/* for now, put all pages into socket 0,
+		 * later they will be sorted */
+		hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+		/* for 32-bit systems, limit number of hugepages to
+		 * 1GB per page size */
+		hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+					    RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+
+		num_sizes++;
+	}
+	closedir(dir);
+
+	/* something went wrong, and we broke from the for loop above */
+	/* (a complete scan ends with readdir() returning NULL) */
+	if (dirent != NULL)
+		return -1;
+
+	internal_config.num_hugepage_sizes = num_sizes;
+
+	/* sort the page directory entries by size, largest to smallest */
+	qsort(&internal_config.hugepage_info[0], num_sizes,
+	      sizeof(internal_config.hugepage_info[0]), compare_hpi);
+
+	/* now we have all info, check we have at least one valid size */
+	for (i = 0; i < num_sizes; i++)
+		if (internal_config.hugepage_info[i].hugedir != NULL &&
+		    internal_config.hugepage_info[i].num_pages[0] > 0)
+			return 0;
+
+	/* no valid hugepage mounts available, return error */
+	return -1;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
new file mode 100644
index 00000000..06b26a9e
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -0,0 +1,1224 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/queue.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <sys/epoll.h>
+#include <sys/signalfd.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <assert.h>
+
+#include <rte_common.h>
+#include <rte_interrupts.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_ring.h>
+#include <rte_debug.h>
+#include <rte_log.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+
+#include "eal_private.h"
+#include "eal_vfio.h"
+#include "eal_thread.h"
+
+#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
+#define NB_OTHER_INTR               1
+
+static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
+
+/**
+ * union for pipe fds.
+ *
+ * Lets the same two descriptors be accessed either as the pipefd[] array
+ * or by name: readfd overlays pipefd[0] and writefd overlays pipefd[1].
+ */
+union intr_pipefds{
+	struct {
+		int pipefd[2];
+	};
+	struct {
+		int readfd;
+		int writefd;
+	};
+};
+
+/**
+ * union buffer for reading on different devices
+ *
+ * One member per kind of interrupt source; being a union, the buffer is as
+ * large as its largest member.
+ */
+union rte_intr_read_buffer {
+	int uio_intr_count;              /* for uio device */
+#ifdef VFIO_PRESENT
+	uint64_t vfio_intr_count;        /* for vfio device */
+#endif
+	uint64_t timerfd_num;            /* for timerfd */
+	char charbuf[16];                /* for others */
+};
+
+TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
+TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
+
+/* One user callback; linked into a struct rte_intr_cb_list. */
+struct rte_intr_callback {
+	TAILQ_ENTRY(rte_intr_callback) next;
+	rte_intr_callback_fn cb_fn;  /**< callback address */
+	void *cb_arg;                /**< parameter for callback */
+};
+
+/* One interrupt source with its callbacks; linked into a struct rte_intr_source_list. */
+struct rte_intr_source {
+	TAILQ_ENTRY(rte_intr_source) next;
+	struct rte_intr_handle intr_handle; /**< interrupt handle */
+	struct rte_intr_cb_list callbacks;  /**< user callbacks */
+	/* NOTE(review): presumably non-zero while callbacks of this source
+	 * are being run -- confirm against the users of this field */
+	uint32_t active;
+};
+
+/* global spinlock for interrupt data operation */
+static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* union buffer for pipe read/write */
+static union intr_pipefds intr_pipe;
+
+/* interrupt sources list */
+static struct rte_intr_source_list intr_sources;
+
+/* interrupt handling thread */
+static pthread_t intr_thread;
+
+/* VFIO interrupts */
+#ifdef VFIO_PRESENT
+
+#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
+/* irq set buffer length for queue interrupts and LSC interrupt */
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
+
+/*
+ * Enable legacy (INTx) interrupts on a VFIO device: first attach the
+ * handle's fd as trigger for the INTx index, then unmask the interrupt.
+ * Returns 0 on success, -1 if either ioctl fails.
+ */
+static int
+vfio_enable_intx(struct rte_intr_handle *intr_handle) {
+	char irq_set_buf[IRQ_SET_BUF_LEN];
+	struct vfio_irq_set *irq_set = (struct vfio_irq_set *) irq_set_buf;
+	int *fd_ptr = (int *) &irq_set->data;
+
+	/* enable INTx: the request carries one fd after the header */
+	irq_set->argsz = sizeof(irq_set_buf);
+	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set->start = 0;
+	irq_set->count = 1;
+	*fd_ptr = intr_handle->fd;
+
+	if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set) != 0) {
+		RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
+						intr_handle->fd);
+		return -1;
+	}
+
+	/* unmask INTx after enabling: header only, no data */
+	memset(irq_set, 0, sizeof(irq_set_buf));
+	irq_set->argsz = sizeof(struct vfio_irq_set);
+	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set->start = 0;
+	irq_set->count = 1;
+
+	if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set) != 0) {
+		RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
+						intr_handle->fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Disable legacy (INTx) interrupts on a VFIO device: mask the interrupt
+ * first, then remove its trigger by issuing a TRIGGER request with
+ * DATA_NONE and a count of 0.
+ * Returns 0 on success, -1 if either ioctl fails.
+ */
+static int
+vfio_disable_intx(struct rte_intr_handle *intr_handle) {
+	struct vfio_irq_set *irq_set;
+	char irq_set_buf[IRQ_SET_BUF_LEN];
+	int len, ret;
+
+	/* both requests carry the header only, no data */
+	len = sizeof(struct vfio_irq_set);
+
+	/* mask interrupts before disabling */
+	irq_set = (struct vfio_irq_set *) irq_set_buf;
+	irq_set->argsz = len;
+	irq_set->count = 1;
+	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
+	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set->start = 0;
+
+	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+	if (ret) {
+		RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
+						intr_handle->fd);
+		return -1;
+	}
+
+	/* disable INTx*/
+	memset(irq_set, 0, len);
+	irq_set->argsz = len;
+	irq_set->count = 0;
+	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set->start = 0;
+
+	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+	if (ret) {
+		RTE_LOG(ERR, EAL,
+			"Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Enable MSI interrupts on a VFIO device by attaching the handle's fd as
+ * trigger for the MSI index. Returns 0 on success, -1 if the ioctl fails.
+ */
+static int
+vfio_enable_msi(struct rte_intr_handle *intr_handle) {
+	char irq_set_buf[IRQ_SET_BUF_LEN];
+	struct vfio_irq_set *irq_set = (struct vfio_irq_set *) irq_set_buf;
+	int *fd_ptr = (int *) &irq_set->data;
+
+	/* the request carries one fd after the header */
+	irq_set->argsz = sizeof(irq_set_buf);
+	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
+	irq_set->start = 0;
+	irq_set->count = 1;
+	*fd_ptr = intr_handle->fd;
+
+	if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set) != 0) {
+		RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
+						intr_handle->fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Disable MSI interrupts on a VFIO device by issuing a TRIGGER request with
+ * DATA_NONE and a count of 0 for the MSI index.
+ * Returns the ioctl result unchanged (0 on success).
+ */
+static int
+vfio_disable_msi(struct rte_intr_handle *intr_handle) {
+	char irq_set_buf[IRQ_SET_BUF_LEN];
+	struct vfio_irq_set *irq_set = (struct vfio_irq_set *) irq_set_buf;
+	int rc;
+
+	/* header only, no data */
+	irq_set->argsz = sizeof(struct vfio_irq_set);
+	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
+	irq_set->start = 0;
+	irq_set->count = 0;
+
+	rc = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+	if (rc != 0)
+		RTE_LOG(ERR, EAL,
+			"Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
+
+	return rc;
+}
+
+/*
+ * Enable MSI-X interrupts on a VFIO device.
+ *
+ * max_intr of the handle is normalised in place (0 becomes 1, values above
+ * RTE_MAX_RXTX_INTR_VEC_ID become RTE_MAX_RXTX_INTR_VEC_ID + 1) and used as
+ * the vector count. The fd array holds the handle's own fd at
+ * RTE_INTR_VEC_ZERO_OFFSET followed by nb_efd entries of efds[] starting at
+ * RTE_INTR_VEC_RXTX_OFFSET.
+ * Returns 0 on success, -1 if the ioctl fails.
+ */
+static int
+vfio_enable_msix(struct rte_intr_handle *intr_handle) {
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+	struct vfio_irq_set *irq_set = (struct vfio_irq_set *) irq_set_buf;
+	int *fd_ptr = (int *) &irq_set->data;
+
+	if (intr_handle->max_intr == 0)
+		intr_handle->max_intr = 1;
+	else if (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID)
+		intr_handle->max_intr = RTE_MAX_RXTX_INTR_VEC_ID + 1;
+
+	irq_set->argsz = sizeof(irq_set_buf);
+	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+	irq_set->start = 0;
+	irq_set->count = intr_handle->max_intr;
+
+	/* INTR vector offset 0 reserve for non-efds mapping */
+	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
+	memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
+		sizeof(*intr_handle->efds) * intr_handle->nb_efd);
+
+	if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set) != 0) {
+		RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
+						intr_handle->fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Disable MSI-X interrupts on a VFIO device.
+ *
+ * Issues VFIO_DEVICE_SET_IRQS on the MSI-X index with count 0 and no data
+ * payload. Returns the ioctl() result unchanged; a non-zero result is logged.
+ */
+static int
+vfio_disable_msix(struct rte_intr_handle *intr_handle)
+{
+	char req_buf[MSIX_IRQ_SET_BUF_LEN];
+	struct vfio_irq_set *req = (struct vfio_irq_set *) req_buf;
+	int rc;
+
+	/* only the fixed-size header is passed, nothing follows it */
+	req->argsz = sizeof(struct vfio_irq_set);
+	req->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+	req->index = VFIO_PCI_MSIX_IRQ_INDEX;
+	req->start = 0;
+	req->count = 0;
+
+	rc = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, req);
+	if (rc != 0)
+		RTE_LOG(ERR, EAL,
+			"Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
+
+	return rc;
+}
+#endif
+
+/*
+ * Disable INTx through the UIO config file descriptor (uio_pci_generic).
+ *
+ * Reads the single byte at offset 5 of uio_cfg_fd, sets bit 0x4 and writes
+ * the byte back. Presumably this is the upper byte of the PCI command
+ * register -- TODO confirm. Returns 0 on success, -1 on a short read/write.
+ */
+static int
+uio_intx_intr_disable(struct rte_intr_handle *intr_handle)
+{
+	const int cfg_fd = intr_handle->uio_cfg_fd;
+	unsigned char cmd_hi;
+
+	if (pread(cfg_fd, &cmd_hi, 1, 5) != 1) {
+		RTE_LOG(ERR, EAL,
+			"Error reading interrupts status for fd %d\n",
+			intr_handle->uio_cfg_fd);
+		return -1;
+	}
+
+	/* set the bit: interrupts off */
+	cmd_hi |= 0x4;
+
+	if (pwrite(cfg_fd, &cmd_hi, 1, 5) != 1) {
+		RTE_LOG(ERR, EAL,
+			"Error disabling interrupts for fd %d\n",
+			intr_handle->uio_cfg_fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Enable INTx through the UIO config file descriptor (uio_pci_generic).
+ *
+ * Reads the single byte at offset 5 of uio_cfg_fd, clears bit 0x4 and writes
+ * the byte back. Presumably this is the upper byte of the PCI command
+ * register -- TODO confirm. Returns 0 on success, -1 on a short read/write.
+ */
+static int
+uio_intx_intr_enable(struct rte_intr_handle *intr_handle)
+{
+	const int cfg_fd = intr_handle->uio_cfg_fd;
+	unsigned char cmd_hi;
+
+	if (pread(cfg_fd, &cmd_hi, 1, 5) != 1) {
+		RTE_LOG(ERR, EAL,
+			"Error reading interrupts status for fd %d\n",
+			intr_handle->uio_cfg_fd);
+		return -1;
+	}
+
+	/* clear the bit: interrupts on */
+	cmd_hi &= ~0x4;
+
+	if (pwrite(cfg_fd, &cmd_hi, 1, 5) != 1) {
+		RTE_LOG(ERR, EAL,
+			"Error enabling interrupts for fd %d\n",
+			intr_handle->uio_cfg_fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Disable interrupts on a UIO device by writing the int value 0 to its fd.
+ * Returns 0 on success, -1 (after logging errno) when write() fails.
+ */
+static int
+uio_intr_disable(struct rte_intr_handle *intr_handle)
+{
+	const int off = 0;
+
+	if (write(intr_handle->fd, &off, sizeof(off)) >= 0)
+		return 0;
+
+	RTE_LOG(ERR, EAL,
+		"Error disabling interrupts for fd %d (%s)\n",
+		intr_handle->fd, strerror(errno));
+	return -1;
+}
+
+/*
+ * Enable interrupts on a UIO device by writing the int value 1 to its fd.
+ * Returns 0 on success, -1 (after logging errno) when write() fails.
+ */
+static int
+uio_intr_enable(struct rte_intr_handle *intr_handle)
+{
+	const int on = 1;
+
+	if (write(intr_handle->fd, &on, sizeof(on)) >= 0)
+		return 0;
+
+	RTE_LOG(ERR, EAL,
+		"Error enabling interrupts for fd %d (%s)\n",
+		intr_handle->fd, strerror(errno));
+	return -1;
+}
+
+/*
+ * Attach a callback to the interrupt source identified by intr_handle->fd.
+ *
+ * A callback record is allocated first. If intr_sources already holds a
+ * source with the same fd, the record is appended to it; otherwise a new
+ * source holding a copy of *intr_handle is created and queued. Whenever the
+ * source had no callbacks before (or is new), one byte is written to
+ * intr_pipe.writefd so that the thread waiting in epoll_wait rebuilds its
+ * wait list.
+ *
+ * Returns 0 on success, -EINVAL for invalid parameters, -ENOMEM when an
+ * allocation fails and -EPIPE when the wake-up write fails.
+ */
+int
+rte_intr_callback_register(struct rte_intr_handle *intr_handle,
+			rte_intr_callback_fn cb, void *cb_arg)
+{
+	int ret, wake_thread;
+	struct rte_intr_source *src;
+	struct rte_intr_callback *callback;
+
+	wake_thread = 0;
+
+	/* first do parameter checking */
+	if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
+		RTE_LOG(ERR, EAL,
+			"Registering with invalid input parameter\n");
+		return -EINVAL;
+	}
+
+	/* allocate a new interrupt callback entity */
+	callback = rte_zmalloc("interrupt callback list",
+				sizeof(*callback), 0);
+	if (callback == NULL) {
+		RTE_LOG(ERR, EAL, "Can not allocate memory\n");
+		return -ENOMEM;
+	}
+	callback->cb_fn = cb;
+	callback->cb_arg = cb_arg;
+
+	rte_spinlock_lock(&intr_lock);
+
+	/* check if there is at least one callback registered for the fd */
+	TAILQ_FOREACH(src, &intr_sources, next) {
+		if (src->intr_handle.fd == intr_handle->fd) {
+			/*
+			 * The source exists but has no callbacks: the wait
+			 * list has to be rebuilt. The condition carries its
+			 * own parentheses so it does not depend on how the
+			 * TAILQ_EMPTY macro happens to expand.
+			 */
+			if (TAILQ_EMPTY(&src->callbacks))
+				wake_thread = 1;
+
+			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
+			ret = 0;
+			break;
+		}
+	}
+
+	/* no existing callbacks for this - add new source */
+	if (src == NULL) {
+		src = rte_zmalloc("interrupt source list", sizeof(*src), 0);
+		if (src == NULL) {
+			RTE_LOG(ERR, EAL, "Can not allocate memory\n");
+			rte_free(callback);
+			ret = -ENOMEM;
+		} else {
+			src->intr_handle = *intr_handle;
+			TAILQ_INIT(&src->callbacks);
+			TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
+			TAILQ_INSERT_TAIL(&intr_sources, src, next);
+			wake_thread = 1;
+			ret = 0;
+		}
+	}
+
+	rte_spinlock_unlock(&intr_lock);
+
+	/**
+	 * check if need to notify the pipe fd waited by epoll_wait to
+	 * rebuild the wait list.
+	 */
+	if (wake_thread)
+		if (write(intr_pipe.writefd, "1", 1) < 0)
+			return -EPIPE;
+
+	return ret;
+}
+
+/*
+ * Remove callbacks from the interrupt source identified by intr_handle->fd.
+ *
+ * Every callback whose function equals cb_fn and whose argument equals
+ * cb_arg is removed; cb_arg == (void *)-1 matches any argument. A source
+ * left without callbacks is unlinked and freed. Unless an error occurred,
+ * one byte is written to intr_pipe.writefd so the wait list gets rebuilt.
+ *
+ * Returns the number of callbacks removed, -EINVAL for invalid parameters,
+ * -ENOENT when no source is registered for the fd, -EAGAIN while the source
+ * is marked active, -EPIPE when the wake-up write fails.
+ */
+int
+rte_intr_callback_unregister(struct rte_intr_handle *intr_handle,
+			rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+	struct rte_intr_source *src;
+	struct rte_intr_callback *cb, *cb_next;
+	int ret;
+
+	if (intr_handle == NULL || intr_handle->fd < 0) {
+		RTE_LOG(ERR, EAL,
+		"Unregistering with invalid input parameter\n");
+		return -EINVAL;
+	}
+
+	rte_spinlock_lock(&intr_lock);
+
+	/* look up the interrupt source registered for this fd */
+	TAILQ_FOREACH(src, &intr_sources, next) {
+		if (src->intr_handle.fd == intr_handle->fd)
+			break;
+	}
+
+	if (src == NULL) {
+		/* nothing registered for the fd */
+		ret = -ENOENT;
+	} else if (src->active != 0) {
+		/* callbacks of this source are being run right now */
+		ret = -EAGAIN;
+	} else {
+		ret = 0;
+
+		/* remove every match; grab the successor before freeing */
+		cb = TAILQ_FIRST(&src->callbacks);
+		while (cb != NULL) {
+			cb_next = TAILQ_NEXT(cb, next);
+
+			if (cb->cb_fn == cb_fn &&
+			    (cb_arg == (void *)-1 || cb->cb_arg == cb_arg)) {
+				TAILQ_REMOVE(&src->callbacks, cb, next);
+				rte_free(cb);
+				ret++;
+			}
+
+			cb = cb_next;
+		}
+
+		/* drop the source once its last callback is gone */
+		if (TAILQ_EMPTY(&src->callbacks)) {
+			TAILQ_REMOVE(&intr_sources, src, next);
+			rte_free(src);
+		}
+	}
+
+	rte_spinlock_unlock(&intr_lock);
+
+	if (ret < 0)
+		return ret;
+
+	/* notify the pipe fd waited by epoll_wait to rebuild the wait list */
+	if (write(intr_pipe.writefd, "1", 1) < 0)
+		return -EPIPE;
+
+	return ret;
+}
+
+/*
+ * Enable the interrupt described by intr_handle.
+ *
+ * Dispatches on intr_handle->type to the matching UIO or VFIO helper.
+ * Returns 0 on success and -1 when the handle is NULL, when fd or
+ * uio_cfg_fd is negative, when the type is RTE_INTR_HANDLE_ALARM or
+ * unknown, or when the helper reports failure.
+ */
+int
+rte_intr_enable(struct rte_intr_handle *intr_handle)
+{
+	int rc;
+
+	if (intr_handle == NULL)
+		return -1;
+	if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+		return -1;
+
+	switch (intr_handle->type) {
+	/* write to the uio fd to enable the interrupt */
+	case RTE_INTR_HANDLE_UIO:
+		rc = uio_intr_enable(intr_handle);
+		break;
+	case RTE_INTR_HANDLE_UIO_INTX:
+		rc = uio_intx_intr_enable(intr_handle);
+		break;
+	/* not used at this moment */
+	case RTE_INTR_HANDLE_ALARM:
+		rc = -1;
+		break;
+#ifdef VFIO_PRESENT
+	case RTE_INTR_HANDLE_VFIO_MSIX:
+		rc = vfio_enable_msix(intr_handle);
+		break;
+	case RTE_INTR_HANDLE_VFIO_MSI:
+		rc = vfio_enable_msi(intr_handle);
+		break;
+	case RTE_INTR_HANDLE_VFIO_LEGACY:
+		rc = vfio_enable_intx(intr_handle);
+		break;
+#endif
+	/* unknown handle type */
+	default:
+		RTE_LOG(ERR, EAL,
+			"Unknown handle type of fd %d\n",
+					intr_handle->fd);
+		rc = -1;
+		break;
+	}
+
+	/* any non-zero helper result is reported as -1 */
+	return rc ? -1 : 0;
+}
+
+/*
+ * Disable the interrupt described by intr_handle.
+ *
+ * Dispatches on intr_handle->type to the matching UIO or VFIO helper.
+ * Returns 0 on success and -1 when the handle is NULL, when fd or
+ * uio_cfg_fd is negative, when the type is RTE_INTR_HANDLE_ALARM or
+ * unknown, or when the helper reports failure.
+ */
+int
+rte_intr_disable(struct rte_intr_handle *intr_handle)
+{
+	int rc;
+
+	if (intr_handle == NULL)
+		return -1;
+	if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+		return -1;
+
+	switch (intr_handle->type) {
+	/* write to the uio fd to disable the interrupt */
+	case RTE_INTR_HANDLE_UIO:
+		rc = uio_intr_disable(intr_handle);
+		break;
+	case RTE_INTR_HANDLE_UIO_INTX:
+		rc = uio_intx_intr_disable(intr_handle);
+		break;
+	/* not used at this moment */
+	case RTE_INTR_HANDLE_ALARM:
+		rc = -1;
+		break;
+#ifdef VFIO_PRESENT
+	case RTE_INTR_HANDLE_VFIO_MSIX:
+		rc = vfio_disable_msix(intr_handle);
+		break;
+	case RTE_INTR_HANDLE_VFIO_MSI:
+		rc = vfio_disable_msi(intr_handle);
+		break;
+	case RTE_INTR_HANDLE_VFIO_LEGACY:
+		rc = vfio_disable_intx(intr_handle);
+		break;
+#endif
+	/* unknown handle type */
+	default:
+		RTE_LOG(ERR, EAL,
+			"Unknown handle type of fd %d\n",
+					intr_handle->fd);
+		rc = -1;
+		break;
+	}
+
+	/* any non-zero helper result is reported as -1 */
+	return rc ? -1 : 0;
+}
+
+/*
+ * Dispatch the events returned by epoll_wait to the registered callbacks.
+ *
+ * For each event the matching source in intr_sources is marked active,
+ * the fd is read (except for RTE_INTR_HANDLE_EXT) to clear its readable
+ * state, and every callback of the source is invoked with intr_lock
+ * released. The active mark is cleared again on every path that leaves
+ * the source, including an interrupted or would-block read; otherwise
+ * rte_intr_callback_unregister() would keep answering -EAGAIN for it.
+ *
+ * @param events
+ *  events filled in by epoll_wait.
+ * @param nfds
+ *  number of valid entries in events.
+ *
+ * @return
+ *  -1 when the notification pipe fired and the wait list must be rebuilt,
+ *  0 otherwise.
+ */
+static int
+eal_intr_process_interrupts(struct epoll_event *events, int nfds)
+{
+	int n, bytes_read;
+	struct rte_intr_source *src;
+	struct rte_intr_callback *cb;
+	union rte_intr_read_buffer buf;
+	struct rte_intr_callback active_cb;
+
+	for (n = 0; n < nfds; n++) {
+
+		/**
+		 * if the pipe fd is ready to read, return out to
+		 * rebuild the wait list.
+		 */
+		if (events[n].data.fd == intr_pipe.readfd){
+			int r = read(intr_pipe.readfd, buf.charbuf,
+					sizeof(buf.charbuf));
+			RTE_SET_USED(r);
+			return -1;
+		}
+		rte_spinlock_lock(&intr_lock);
+		TAILQ_FOREACH(src, &intr_sources, next)
+			if (src->intr_handle.fd ==
+					events[n].data.fd)
+				break;
+		if (src == NULL){
+			rte_spinlock_unlock(&intr_lock);
+			continue;
+		}
+
+		/* mark this interrupt source as active and release the lock. */
+		src->active = 1;
+		rte_spinlock_unlock(&intr_lock);
+
+		/* set the length to be read for different handle type */
+		switch (src->intr_handle.type) {
+		case RTE_INTR_HANDLE_UIO:
+		case RTE_INTR_HANDLE_UIO_INTX:
+			bytes_read = sizeof(buf.uio_intr_count);
+			break;
+		case RTE_INTR_HANDLE_ALARM:
+			bytes_read = sizeof(buf.timerfd_num);
+			break;
+#ifdef VFIO_PRESENT
+		case RTE_INTR_HANDLE_VFIO_MSIX:
+		case RTE_INTR_HANDLE_VFIO_MSI:
+		case RTE_INTR_HANDLE_VFIO_LEGACY:
+			bytes_read = sizeof(buf.vfio_intr_count);
+			break;
+#endif
+		case RTE_INTR_HANDLE_EXT:
+		default:
+			bytes_read = 1;
+			break;
+		}
+
+		if (src->intr_handle.type != RTE_INTR_HANDLE_EXT) {
+			/**
+			 * read out to clear the ready-to-be-read flag
+			 * for epoll_wait.
+			 */
+			bytes_read = read(events[n].data.fd, &buf, bytes_read);
+			if (bytes_read < 0) {
+				if (errno == EINTR || errno == EWOULDBLOCK) {
+					/*
+					 * Nothing was consumed. Release the
+					 * source before skipping it, so it is
+					 * not left marked active.
+					 */
+					rte_spinlock_lock(&intr_lock);
+					src->active = 0;
+					rte_spinlock_unlock(&intr_lock);
+					continue;
+				}
+
+				RTE_LOG(ERR, EAL, "Error reading from file "
+					"descriptor %d: %s\n",
+					events[n].data.fd,
+					strerror(errno));
+			} else if (bytes_read == 0)
+				RTE_LOG(ERR, EAL, "Read nothing from file "
+					"descriptor %d\n", events[n].data.fd);
+		}
+
+		/* grab a lock, again to call callbacks and update status. */
+		rte_spinlock_lock(&intr_lock);
+
+		/* callbacks only run when the read delivered data */
+		if (bytes_read > 0) {
+
+			/* Finally, call all callbacks. */
+			TAILQ_FOREACH(cb, &src->callbacks, next) {
+
+				/* make a copy and unlock. */
+				active_cb = *cb;
+				rte_spinlock_unlock(&intr_lock);
+
+				/* call the actual callback */
+				active_cb.cb_fn(&src->intr_handle,
+					active_cb.cb_arg);
+
+				/*get the lock back. */
+				rte_spinlock_lock(&intr_lock);
+			}
+		}
+
+		/* we done with that interrupt source, release it. */
+		src->active = 0;
+		rte_spinlock_unlock(&intr_lock);
+	}
+
+	return 0;
+}
+
+/**
+ * Wait on the epoll instance and dispatch whatever becomes ready, until
+ * either epoll_wait fails or the dispatcher asks for a wait-list rebuild.
+ *
+ * @param pfd
+ *  epoll file descriptor.
+ * @param totalfds
+ *  The number of file descriptors added in epoll.
+ *
+ * @return
+ *  void
+ */
+static void
+eal_intr_handle_interrupts(int pfd, unsigned totalfds)
+{
+	struct epoll_event events[totalfds];
+	int nready;
+
+	while (1) {
+		nready = epoll_wait(pfd, events, totalfds,
+			EAL_INTR_EPOLL_WAIT_FOREVER);
+
+		if (nready > 0) {
+			/* at least one fd is ready to read */
+			if (eal_intr_process_interrupts(events, nready) < 0)
+				return;
+			continue;
+		}
+
+		/* timeout: not expected with an infinite wait */
+		if (nready == 0)
+			continue;
+
+		/* epoll_wait failed; an interrupted call is simply retried */
+		if (errno == EINTR)
+			continue;
+
+		RTE_LOG(ERR, EAL,
+			"epoll_wait returns with fail\n");
+		return;
+	}
+}
+
+/**
+ * Main loop of the interrupt thread: create an epoll instance, add the
+ * notification pipe and the fd of every source that has callbacks, serve
+ * interrupts until a rebuild is requested, then start over.
+ *
+ * @param arg
+ *  pointer. (unused)
+ *
+ * @return
+ *  never return;
+ */
+static __attribute__((noreturn)) void *
+eal_intr_thread_main(__rte_unused void *arg)
+{
+	struct epoll_event ev;
+
+	/* host thread, never break out */
+	while (1) {
+		static struct epoll_event pipe_event = {
+			.events = EPOLLIN | EPOLLPRI,
+		};
+		struct rte_intr_source *src;
+		unsigned nb_fds = 0;
+		int epfd;
+
+		epfd = epoll_create(1);
+		if (epfd < 0)
+			rte_panic("Cannot create epoll instance\n");
+
+		/* the pipe is what wakes us up to rebuild the wait list */
+		pipe_event.data.fd = intr_pipe.readfd;
+		if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_pipe.readfd,
+						&pipe_event) < 0)
+			rte_panic("Error adding fd to %d epoll_ctl, %s\n",
+					intr_pipe.readfd, strerror(errno));
+		nb_fds++;
+
+		rte_spinlock_lock(&intr_lock);
+
+		TAILQ_FOREACH(src, &intr_sources, next) {
+			/* sources without callbacks are not waited on */
+			if (TAILQ_EMPTY(&src->callbacks))
+				continue;
+
+			ev.events = EPOLLIN | EPOLLPRI;
+			ev.data.fd = src->intr_handle.fd;
+
+			/* add the device file descriptor to the wait list */
+			if (epoll_ctl(epfd, EPOLL_CTL_ADD,
+					src->intr_handle.fd, &ev) < 0)
+				rte_panic("Error adding fd %d epoll_ctl, %s\n",
+					src->intr_handle.fd, strerror(errno));
+			nb_fds++;
+		}
+
+		rte_spinlock_unlock(&intr_lock);
+
+		/* serve the interrupt */
+		eal_intr_handle_interrupts(epfd, nb_fds);
+
+		/* returning here means the fd list has to be rebuilt */
+		close(epfd);
+	}
+}
+
+/*
+ * Set up interrupt handling: initialise the source list, create the
+ * notification pipe and start the interrupt thread.
+ *
+ * Returns -1 when pipe() fails, the negated pthread_create() result when
+ * the thread cannot be started, 0 otherwise. Failing to name the thread
+ * is logged but does not change the return value.
+ */
+int
+rte_eal_intr_init(void)
+{
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+	int rc;
+
+	/* init the global interrupt source head */
+	TAILQ_INIT(&intr_sources);
+
+	/* pipe waited on by epoll; writing to it triggers a list rebuild */
+	if (pipe(intr_pipe.pipefd) < 0)
+		return -1;
+
+	/* create the host thread to wait/handle the interrupt */
+	rc = pthread_create(&intr_thread, NULL,
+			eal_intr_thread_main, NULL);
+	if (rc != 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to create thread for interrupt handling\n");
+		return -rc;
+	}
+
+	/* Set thread_name for aid in debugging. */
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+		"eal-intr-thread");
+	if (rte_thread_setname(intr_thread, thread_name) != 0)
+		RTE_LOG(ERR, EAL,
+		"Failed to set thread name for interrupt handling\n");
+
+	return 0;
+}
+
+/*
+ * Drain one pending notification from an rx/tx interrupt fd so that it
+ * stops being reported as readable by epoll_wait.
+ *
+ * The read size depends on intr_handle->type. A read that fails with
+ * EINTR, EWOULDBLOCK or EAGAIN is retried; any other failure or a
+ * zero-length read is logged and ends the attempt.
+ */
+static void
+eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
+{
+	union rte_intr_read_buffer buf;
+	int want;
+	int got;
+
+	switch (intr_handle->type) {
+	case RTE_INTR_HANDLE_UIO:
+	case RTE_INTR_HANDLE_UIO_INTX:
+		want = sizeof(buf.uio_intr_count);
+		break;
+#ifdef VFIO_PRESENT
+	case RTE_INTR_HANDLE_VFIO_MSIX:
+	case RTE_INTR_HANDLE_VFIO_MSI:
+	case RTE_INTR_HANDLE_VFIO_LEGACY:
+		want = sizeof(buf.vfio_intr_count);
+		break;
+#endif
+	default:
+		want = 1;
+		RTE_LOG(INFO, EAL, "unexpected intr type\n");
+		break;
+	}
+
+	for (;;) {
+		got = read(fd, &buf, want);
+
+		if (got > 0)
+			return;
+
+		if (got == 0) {
+			RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
+			return;
+		}
+
+		/* transient failure: try the read again */
+		if (errno == EINTR || errno == EWOULDBLOCK ||
+		    errno == EAGAIN)
+			continue;
+
+		RTE_LOG(ERR, EAL,
+			"Error reading from fd %d: %s\n",
+			fd, strerror(errno));
+		return;
+	}
+}
+
+/*
+ * Translate raw epoll events into rte_epoll_event records.
+ *
+ * evs holds n events from epoll_wait whose data.ptr is read as a
+ * struct rte_epoll_event pointer. Entries with a NULL pointer, or whose
+ * status cannot be switched from RTE_EPOLL_VALID to RTE_EPOLL_EXEC, are
+ * skipped. For the others the fields are copied into events[], the
+ * optional cb_fun is invoked, and the status is set back to
+ * RTE_EPOLL_VALID.
+ *
+ * Returns the number of entries written to events[].
+ */
+static int
+eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
+			struct rte_epoll_event *events)
+{
+	unsigned int i, count = 0;
+	struct rte_epoll_event *rev;
+
+	for (i = 0; i < n; i++) {
+		rev = evs[i].data.ptr;
+		/* claim the entry; skip it if it is not currently VALID */
+		if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
+						 RTE_EPOLL_EXEC))
+			continue;
+
+		events[count].status        = RTE_EPOLL_VALID;
+		events[count].fd            = rev->fd;
+		events[count].epfd          = rev->epfd;
+		events[count].epdata.event  = rev->epdata.event;
+		events[count].epdata.data   = rev->epdata.data;
+		if (rev->epdata.cb_fun)
+			rev->epdata.cb_fun(rev->fd,
+					   rev->epdata.cb_arg);
+
+		/* keep the compiler from moving the release above the work */
+		rte_compiler_barrier();
+		rev->status = RTE_EPOLL_VALID;
+		count++;
+	}
+	return count;
+}
+
+/*
+ * Create a new epoll instance.
+ * Returns its file descriptor, or -1 (after logging) on failure.
+ */
+static inline int
+eal_init_tls_epfd(void)
+{
+	const int epfd = epoll_create(255);
+
+	if (epfd >= 0)
+		return epfd;
+
+	RTE_LOG(ERR, EAL,
+		"Cannot create epoll instance\n");
+	return -1;
+}
+
+/*
+ * Return the per-lcore epoll fd, creating it on first use (while the
+ * stored value is -1).
+ */
+int
+rte_intr_tls_epfd(void)
+{
+	if (RTE_PER_LCORE(_epfd) != -1)
+		return RTE_PER_LCORE(_epfd);
+
+	RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
+	return RTE_PER_LCORE(_epfd);
+}
+
+/*
+ * Wait for events on an epoll instance and report them as rte_epoll_event
+ * records.
+ *
+ * epfd may be RTE_EPOLL_PER_THREAD to use the per-lcore instance. A wait
+ * interrupted by a signal (EINTR) is restarted.
+ *
+ * Returns the number of records written to events, 0 on timeout, -1 when
+ * events is NULL or epoll_wait fails.
+ */
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout)
+{
+	struct epoll_event evs[maxevents];
+	int rc;
+
+	if (events == NULL) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	for (;;) {
+		rc = epoll_wait(epfd, evs, maxevents, timeout);
+
+		/* at least one fd is ready to read */
+		if (likely(rc > 0))
+			return eal_epoll_process_event(evs, rc, events);
+
+		/* timed out */
+		if (rc == 0)
+			return 0;
+
+		if (errno == EINTR)
+			continue;
+
+		/* epoll_wait fail */
+		RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
+			strerror(errno));
+		return -1;
+	}
+}
+
+/*
+ * Invalidate an rte_epoll_event and wipe its contents.
+ *
+ * Spins until the status can be switched from RTE_EPOLL_VALID to
+ * RTE_EPOLL_INVALID, then zeroes epdata and sets fd and epfd to -1.
+ * NOTE(review): if the status is already RTE_EPOLL_INVALID on entry the
+ * loops below never terminate -- callers appear to check for that first;
+ * confirm.
+ */
+static inline void
+eal_epoll_data_safe_free(struct rte_epoll_event *ev)
+{
+	/* retry the swap; between attempts wait for the entry to become VALID */
+	while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
+				    RTE_EPOLL_INVALID))
+		while (ev->status != RTE_EPOLL_VALID)
+			rte_pause();
+	memset(&ev->epdata, 0, sizeof(ev->epdata));
+	ev->fd = -1;
+	ev->epfd = -1;
+}
+
+/*
+ * Wrapper around epoll_ctl() that keeps a struct rte_epoll_event in sync
+ * with the kernel registration.
+ *
+ * epfd may be RTE_EPOLL_PER_THREAD to use the per-lcore instance. On
+ * EPOLL_CTL_ADD the event is marked RTE_EPOLL_VALID and records fd and
+ * epfd; the mark is rolled back if the kernel call fails. After a
+ * successful EPOLL_CTL_DEL of a not-yet-invalid event its data is wiped
+ * with eal_epoll_data_safe_free().
+ *
+ * Returns 0 on success, -1 when event is NULL or epoll_ctl() fails.
+ */
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event)
+{
+	struct epoll_event ev;
+
+	if (!event) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	if (op == EPOLL_CTL_ADD) {
+		event->status = RTE_EPOLL_VALID;
+		event->fd = fd;  /* ignore fd in event */
+		event->epfd = epfd;
+	}
+
+	/*
+	 * Always pass a defined user-data pointer. EPOLL_CTL_MOD replaces the
+	 * data stored for the fd, and eal_epoll_process_event() dereferences
+	 * whatever comes back, so it must never be left uninitialised.
+	 */
+	ev.data.ptr = (void *)event;
+	ev.events = event->epdata.event;
+	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
+			op, fd, strerror(errno));
+		if (op == EPOLL_CTL_ADD)
+			/* rollback status when CTL_ADD fail */
+			event->status = RTE_EPOLL_INVALID;
+		return -1;
+	}
+
+	if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
+		eal_epoll_data_safe_free(event);
+
+	return 0;
+}
+
+/*
+ * Add or remove the eventfd of one rx interrupt vector to/from an epoll
+ * instance.
+ *
+ * vec is mapped to an index into intr_handle->efds: values at or above
+ * RTE_INTR_VEC_RXTX_OFFSET have the offset subtracted, smaller values are
+ * used as they are.
+ *
+ * Returns 0 on success, -EEXIST when adding an event that is already
+ * registered, -EPERM for a bad vector, an unknown op, deleting an event
+ * that is not registered, or a failing rte_epoll_ctl().
+ */
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
+		int op, unsigned int vec, void *data)
+{
+	struct rte_epoll_event *rev;
+	struct rte_epoll_data *epdata;
+	unsigned int efd_idx = vec;
+	int rc;
+
+	if (vec >= RTE_INTR_VEC_RXTX_OFFSET)
+		efd_idx = vec - RTE_INTR_VEC_RXTX_OFFSET;
+
+	if (!intr_handle || intr_handle->nb_efd == 0 ||
+	    efd_idx >= intr_handle->nb_efd) {
+		RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
+		return -EPERM;
+	}
+
+	if (op == RTE_INTR_EVENT_ADD) {
+		rev = &intr_handle->elist[efd_idx];
+		if (rev->status != RTE_EPOLL_INVALID) {
+			RTE_LOG(INFO, EAL, "Event already been added.\n");
+			return -EEXIST;
+		}
+
+		/* attach to intr vector fd */
+		epdata = &rev->epdata;
+		epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
+		epdata->data   = data;
+		epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
+		epdata->cb_arg = (void *)intr_handle;
+
+		rc = rte_epoll_ctl(epfd, EPOLL_CTL_ADD,
+				   intr_handle->efds[efd_idx], rev);
+		if (rc != 0)
+			return -EPERM;
+
+		RTE_LOG(DEBUG, EAL,
+			"efd %d associated with vec %d added on epfd %d"
+			"\n", rev->fd, vec, epfd);
+		return 0;
+	}
+
+	if (op == RTE_INTR_EVENT_DEL) {
+		rev = &intr_handle->elist[efd_idx];
+		if (rev->status == RTE_EPOLL_INVALID) {
+			RTE_LOG(INFO, EAL, "Event does not exist.\n");
+			return -EPERM;
+		}
+
+		/* remove from the epoll instance it was registered on */
+		rc = rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev);
+		return rc ? -EPERM : 0;
+	}
+
+	RTE_LOG(ERR, EAL, "event op type mismatch\n");
+	return -EPERM;
+}
+
+/*
+ * Prepare the per-vector event file descriptors of an interrupt handle.
+ *
+ * For RTE_INTR_HANDLE_VFIO_MSIX, up to RTE_MAX_RXTX_INTR_VEC_ID eventfds
+ * are created and stored in intr_handle->efds. For every other type the
+ * handle's own fd is reused as the single entry.
+ *
+ * @param intr_handle
+ *  handle to set up.
+ * @param nb_efd
+ *  number of event fds requested; must not be 0.
+ *
+ * @return
+ *  0 on success, -1 when an eventfd cannot be created. In that case the
+ *  descriptors created by this call are closed again and nb_efd/max_intr
+ *  are left untouched.
+ */
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+	uint32_t i;
+	int fd;
+	uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+
+	assert(nb_efd != 0);
+
+	if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
+		for (i = 0; i < n; i++) {
+			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+			if (fd < 0) {
+				/* log first: close() below may change errno */
+				RTE_LOG(ERR, EAL,
+					"can't setup eventfd, error %i (%s)\n",
+					errno, strerror(errno));
+				/* do not leak the eventfds created so far */
+				while (i-- > 0)
+					close(intr_handle->efds[i]);
+				return -1;
+			}
+			intr_handle->efds[i] = fd;
+		}
+		intr_handle->nb_efd   = n;
+		intr_handle->max_intr = NB_OTHER_INTR + n;
+	} else {
+		intr_handle->efds[0]  = intr_handle->fd;
+		intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
+		intr_handle->max_intr = NB_OTHER_INTR;
+	}
+
+	return 0;
+}
+
+/*
+ * Undo rte_intr_efd_enable(): unregister every still-valid epoll event of
+ * the handle, close the event fds when max_intr exceeds nb_efd, and reset
+ * nb_efd and max_intr to 0.
+ */
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+	struct rte_epoll_event *rev;
+	uint32_t idx;
+
+	for (idx = 0; idx < intr_handle->nb_efd; idx++) {
+		rev = &intr_handle->elist[idx];
+
+		if (rev->status != RTE_EPOLL_INVALID &&
+		    rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
+			/* force free if the entry valid */
+			eal_epoll_data_safe_free(rev);
+			rev->status = RTE_EPOLL_INVALID;
+		}
+	}
+
+	if (intr_handle->max_intr > intr_handle->nb_efd) {
+		for (idx = 0; idx < intr_handle->nb_efd; idx++)
+			close(intr_handle->efds[idx]);
+	}
+
+	intr_handle->nb_efd = 0;
+	intr_handle->max_intr = 0;
+}
+
+/* Return 1 when the handle has at least one event fd (nb_efd != 0), else 0. */
+int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+	return intr_handle->nb_efd != 0;
+}
+
+/*
+ * Return 1 when the handle has no event fds at all, or when max_intr
+ * differs from nb_efd; 0 otherwise.
+ */
+int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+	if (rte_intr_dp_is_en(intr_handle))
+		return (intr_handle->max_intr - intr_handle->nb_efd) != 0;
+
+	return 1;
+}
+
+/* Return 1 when the handle type is RTE_INTR_HANDLE_VFIO_MSIX, else 0. */
+int
+rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
+{
+	return intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_ivshmem.c b/lib/librte_eal/linuxapp/eal/eal_ivshmem.c
new file mode 100644
index 00000000..07aec694
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_ivshmem.c
@@ -0,0 +1,955 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef RTE_LIBRTE_IVSHMEM /* hide it from coverage */
+
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/file.h>
+#include <string.h>
+#include <sys/queue.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_string_fns.h>
+#include <rte_errno.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_common.h>
+#include <rte_ivshmem.h>
+
+#include "eal_internal_cfg.h"
+#include "eal_private.h"
+
+/* PCI vendor/device ID pair that is_ivshmem_device() matches against */
+#define PCI_VENDOR_ID_IVSHMEM 0x1Af4
+#define PCI_DEVICE_ID_IVSHMEM 0x1110
+
+/* value the metadata magic_number field is compared with */
+#define IVSHMEM_MAGIC 0x0BADC0DE
+
+/* printf-style path templates */
+#define IVSHMEM_RESOURCE_PATH "/sys/bus/pci/devices/%04x:%02x:%02x.%x/resource2"
+#define IVSHMEM_CONFIG_PATH "/var/run/.%s_ivshmem_config"
+
+/* bit flags returned by overlap() and adjacent(), one per address space */
+#define PHYS 0x1
+#define VIRT 0x2
+#define IOREMAP 0x4
+#define FULL (PHYS|VIRT|IOREMAP)
+
+/* size of the metadata structure rounded up to a multiple of pagesz */
+#define METADATA_SIZE_ALIGNED \
+	(RTE_ALIGN_CEIL(sizeof(struct rte_ivshmem_metadata),pagesz))
+
+/* true when y.addr_64 lies in [x.addr_64, x.addr_64 + x.len) */
+#define CONTAINS(x,y)\
+	(((y).addr_64 >= (x).addr_64) && ((y).addr_64 < (x).addr_64 + (x).len))
+
+/* element count of an array (must not be applied to a pointer) */
+#define DIM(x) (sizeof(x)/sizeof(x[0]))
+
+/* one IVSHMEM PCI device: a path and an ioremap address */
+struct ivshmem_pci_device {
+	char path[PATH_MAX];
+	phys_addr_t ioremap_addr;
+};
+
+/* data type to store in config */
+struct ivshmem_segment {
+	struct rte_ivshmem_metadata_entry entry; /* copied from device metadata */
+	uint64_t align;
+	char path[PATH_MAX]; /* path passed to read_metadata() for this entry */
+};
+/* table of all segments and PCI devices */
+struct ivshmem_shared_config {
+	struct ivshmem_segment segment[RTE_MAX_MEMSEG];
+	uint32_t segment_idx; /* next free slot in segment[] */
+	struct ivshmem_pci_device pci_devs[RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS];
+	uint32_t pci_devs_idx; /* presumably next free slot in pci_devs[] -- TODO confirm */
+};
+static struct ivshmem_shared_config * ivshmem_config;
+static int memseg_idx;
+/* alignment used by METADATA_SIZE_ALIGNED; presumably the page size -- TODO confirm */
+static int pagesz;
+
+/* Tailq heads to add rings to */
+TAILQ_HEAD(rte_ring_list, rte_tailq_entry);
+
+/*
+ * Utility functions
+ */
+
+/* Return 1 when the PCI device carries the IVSHMEM vendor and device IDs. */
+static int
+is_ivshmem_device(struct rte_pci_device * dev)
+{
+	if (dev->id.vendor_id != PCI_VENDOR_ID_IVSHMEM)
+		return 0;
+
+	return dev->id.device_id == PCI_DEVICE_ID_IVSHMEM;
+}
+
+/*
+ * Map the metadata structure stored at the end of an IVSHMEM file.
+ *
+ * The mapping is sizeof(struct rte_ivshmem_metadata) bytes long and starts
+ * METADATA_SIZE_ALIGNED bytes before offset len. Returns the mmap()
+ * result, i.e. MAP_FAILED on error.
+ */
+static void *
+map_metadata(int fd, uint64_t len)
+{
+	const size_t tail_len = METADATA_SIZE_ALIGNED;
+
+	return mmap(NULL, sizeof(struct rte_ivshmem_metadata),
+			PROT_READ | PROT_WRITE, MAP_SHARED, fd,
+			len - tail_len);
+}
+
+/* Release a mapping obtained from map_metadata(); the result is ignored. */
+static void
+unmap_metadata(void * ptr)
+{
+	munmap(ptr, sizeof(struct rte_ivshmem_metadata));
+}
+
+/*
+ * Check whether the file behind fd ends with IVSHMEM metadata.
+ *
+ * Returns 1 when the mapped magic number equals IVSHMEM_MAGIC, 0 when it
+ * does not, -1 when the metadata cannot be mapped.
+ */
+static int
+has_ivshmem_metadata(int fd, uint64_t len)
+{
+	struct rte_ivshmem_metadata snapshot;
+	void * mapping = map_metadata(fd, len);
+
+	if (mapping == MAP_FAILED)
+		return -1;
+
+	/* take a private copy so the mapping can be dropped right away */
+	snapshot = *(struct rte_ivshmem_metadata*) (mapping);
+	unmap_metadata(mapping);
+
+	return snapshot.magic_number == IVSHMEM_MAGIC;
+}
+
+/*
+ * Delete element idx from a table of len segments: the elements after it
+ * move down by one and the last slot is zeroed.
+ */
+static void
+remove_segment(struct ivshmem_segment * ms, int len, int idx)
+{
+	/* shift the tail down in one go; regions overlap, hence memmove */
+	if (idx < len - 1)
+		memmove(&ms[idx], &ms[idx + 1],
+			(size_t)(len - 1 - idx) * sizeof(struct ivshmem_segment));
+
+	memset(&ms[len-1], 0, sizeof(struct ivshmem_segment));
+}
+
+/*
+ * Report in which address spaces two memzones overlap.
+ *
+ * Returns a bitwise OR of VIRT, PHYS and IOREMAP; 0 means no overlap was
+ * detected.
+ *
+ * NOTE(review): only the second virtual-address test accepts equal start
+ * addresses (>=); every other test is strict. Two zones with identical
+ * physical or ioremap start are therefore not reported as overlapping in
+ * that space -- confirm whether this asymmetry is intended.
+ */
+static int
+overlap(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
+{
+	/* virtual ranges */
+	const uint64_t va1 = mz1->addr_64, va1_end = va1 + mz1->len;
+	const uint64_t va2 = mz2->addr_64, va2_end = va2 + mz2->len;
+	/* physical ranges */
+	const uint64_t pa1 = mz1->phys_addr, pa1_end = pa1 + mz1->len;
+	const uint64_t pa2 = mz2->phys_addr, pa2_end = pa2 + mz2->len;
+	/* ioremap ranges */
+	const uint64_t io1 = mz1->ioremap_addr, io1_end = io1 + mz1->len;
+	const uint64_t io2 = mz2->ioremap_addr, io2_end = io2 + mz2->len;
+	int spaces = 0;
+
+	if ((va1 > va2 && va1 < va2_end) ||
+	    (va2 >= va1 && va2 < va1_end))
+		spaces |= VIRT;
+
+	if ((pa1 > pa2 && pa1 < pa2_end) ||
+	    (pa2 > pa1 && pa2 < pa1_end))
+		spaces |= PHYS;
+
+	if ((io1 > io2 && io1 < io2_end) ||
+	    (io2 > io1 && io2 < io1_end))
+		spaces |= IOREMAP;
+
+	return spaces;
+}
+
+/*
+ * Report in which address spaces two memzones are adjacent, i.e. one
+ * starts exactly where the other ends.
+ *
+ * Returns a bitwise OR of VIRT, PHYS and IOREMAP; 0 means not adjacent in
+ * any space.
+ */
+static int
+adjacent(const struct rte_memzone * mz1, const struct rte_memzone * mz2)
+{
+	/* virtual ranges */
+	const uint64_t va1 = mz1->addr_64, va1_end = va1 + mz1->len;
+	const uint64_t va2 = mz2->addr_64, va2_end = va2 + mz2->len;
+	/* physical ranges */
+	const uint64_t pa1 = mz1->phys_addr, pa1_end = pa1 + mz1->len;
+	const uint64_t pa2 = mz2->phys_addr, pa2_end = pa2 + mz2->len;
+	/* ioremap ranges */
+	const uint64_t io1 = mz1->ioremap_addr, io1_end = io1 + mz1->len;
+	const uint64_t io2 = mz2->ioremap_addr, io2_end = io2 + mz2->len;
+	int spaces = 0;
+
+	if (va1 == va2_end || va2 == va1_end)
+		spaces |= VIRT;
+
+	if (pa1 == pa2_end || pa2 == pa1_end)
+		spaces |= PHYS;
+
+	if (io1 == io2_end || io2 == io1_end)
+		spaces |= IOREMAP;
+
+	return spaces;
+}
+
+/*
+ * Return 1 when any two of the first len segments are adjacent in all
+ * three address spaces, 0 otherwise. Partial adjacency is ignored because
+ * such segments can coexist.
+ */
+static int
+has_adjacent_segments(struct ivshmem_segment * ms, int len)
+{
+	int first, second;
+
+	for (first = 0; first < len; first++) {
+		for (second = first + 1; second < len; second++) {
+			if (adjacent(&ms[first].entry.mz,
+					&ms[second].entry.mz) != FULL)
+				continue;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Return 1 when any two of the first len segments overlap in at least one
+ * address space, 0 otherwise.
+ */
+static int
+has_overlapping_segments(struct ivshmem_segment * ms, int len)
+{
+	int first, second;
+
+	for (first = 0; first < len; first++) {
+		for (second = first + 1; second < len; second++) {
+			if (overlap(&ms[first].entry.mz,
+					&ms[second].entry.mz) != 0)
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * qsort() comparator for struct ivshmem_segment.
+ *
+ * Segments whose memzone address is NULL (unallocated) sort after all
+ * others and compare equal to each other. Allocated segments are ordered
+ * by ascending physical address.
+ *
+ * The result is a proper three-way value (<0, 0, >0). Returning only 0 or
+ * 1 would claim "equal" for a < b while claiming "greater" for b > a,
+ * which violates the ordering contract qsort() relies on.
+ */
+static int
+seg_compare(const void * a, const void * b)
+{
+	const struct ivshmem_segment * s1 = (const struct ivshmem_segment*) a;
+	const struct ivshmem_segment * s2 = (const struct ivshmem_segment*) b;
+	phys_addr_t p1, p2;
+
+	/* move unallocated zones to the end */
+	if (s1->entry.mz.addr == NULL && s2->entry.mz.addr == NULL)
+		return 0;
+	if (s1->entry.mz.addr == NULL)
+		return 1;
+	if (s2->entry.mz.addr == NULL)
+		return -1;
+
+	p1 = s1->entry.mz.phys_addr;
+	p2 = s2->entry.mz.phys_addr;
+
+	/* -1, 0 or 1 without risking overflow from a subtraction */
+	return (p1 > p2) - (p1 < p2);
+}
+
+#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
+/*
+ * Log the virtual, physical and ioremap ranges, the length and the offset
+ * of one metadata entry at DEBUG level.
+ */
+static void
+entry_dump(struct rte_ivshmem_metadata_entry *e)
+{
+	const struct rte_memzone *mz = &e->mz;
+
+	RTE_LOG(DEBUG, EAL, "\tvirt: %p-%p\n", mz->addr,
+			RTE_PTR_ADD(mz->addr, mz->len));
+	RTE_LOG(DEBUG, EAL, "\tphys: 0x%" PRIx64 "-0x%" PRIx64 "\n",
+			mz->phys_addr,
+			mz->phys_addr + mz->len);
+	RTE_LOG(DEBUG, EAL, "\tio: 0x%" PRIx64 "-0x%" PRIx64 "\n",
+			mz->ioremap_addr,
+			mz->ioremap_addr + mz->len);
+	RTE_LOG(DEBUG, EAL, "\tlen: 0x%" PRIx64 "\n", mz->len);
+	RTE_LOG(DEBUG, EAL, "\toff: 0x%" PRIx64 "\n", e->offset);
+}
+#endif
+
+
+
+/*
+ * Actual useful code
+ */
+
+/*
+ * Read the metadata mapped from an IVSHMEM device and append its entries
+ * to ivshmem_config->segment[].
+ *
+ * Entries are taken in order until one with a zero-length memzone is
+ * found or RTE_LIBRTE_IVSHMEM_MAX_ENTRIES have been read. Each stored
+ * segment also records the device path.
+ *
+ * @param path
+ *  path of the device the metadata belongs to.
+ * @param path_len
+ *  size of the buffer behind path.
+ * @param fd
+ *  descriptor of the file holding the metadata.
+ * @param flen
+ *  length of that file.
+ *
+ * @return
+ *  0 on success, -1 when the metadata cannot be mapped or when a further
+ *  entry exists but the segment table is already full. On failure
+ *  segment_idx is left unchanged.
+ */
+static int
+read_metadata(char * path, int path_len, int fd, uint64_t flen)
+{
+	struct rte_ivshmem_metadata metadata;
+	struct rte_ivshmem_metadata_entry * entry;
+	struct ivshmem_segment * seg;
+	int idx, i;
+	void * ptr;
+
+	ptr = map_metadata(fd, flen);
+
+	if (ptr == MAP_FAILED)
+		return -1;
+
+	metadata = *(struct rte_ivshmem_metadata*) (ptr);
+
+	unmap_metadata(ptr);
+
+	RTE_LOG(DEBUG, EAL, "Parsing metadata for \"%s\"\n", metadata.name);
+
+	idx = ivshmem_config->segment_idx;
+
+	for (i = 0; i < RTE_LIBRTE_IVSHMEM_MAX_ENTRIES; i++) {
+
+		entry = &metadata.entry[i];
+
+		/* stop on uninitialized memzone */
+		if (entry->mz.len == 0)
+			break;
+
+		/*
+		 * Only complain about capacity once there really is another
+		 * entry to store; a table that is exactly full is fine.
+		 */
+		if (idx >= RTE_MAX_MEMSEG) {
+			RTE_LOG(ERR, EAL, "Not enough memory segments!\n");
+			return -1;
+		}
+
+		seg = &ivshmem_config->segment[idx];
+
+		/* copy metadata entry */
+		memcpy(&seg->entry, entry,
+				sizeof(struct rte_ivshmem_metadata_entry));
+
+		/* copy path, never writing past the destination buffer */
+		snprintf(seg->path,
+				RTE_MIN((size_t)path_len, sizeof(seg->path)),
+				"%s", path);
+
+		idx++;
+	}
+	ivshmem_config->segment_idx = idx;
+
+	return 0;
+}
+
+/* Merge segments that fully overlap or adjoin; return new count, or -1. */
+static int
+cleanup_segments(struct ivshmem_segment * ms, int tbl_len)
+{
+	struct ivshmem_segment * s, * tmp;
+	int i, j, concat, seg_adjacent, seg_overlapping;
+	uint64_t start1, start2, end1, end2, p_start1, p_start2, i_start1, i_start2;
+	/* sort with seg_compare before scanning pairs (i, j > i) for merges */
+	qsort(ms, tbl_len, sizeof(struct ivshmem_segment),
+				seg_compare);
+	/* each pass merges at most one pair, then both has_*() checks are redone */
+	while (has_overlapping_segments(ms, tbl_len) ||
+			has_adjacent_segments(ms, tbl_len)) {
+
+		for (i = 0; i < tbl_len; i++) {
+			s = &ms[i];
+
+			concat = 0;
+
+			for (j = i + 1; j < tbl_len; j++) {
+				tmp = &ms[j];
+
+				/* check if this segment is overlapping with existing segment,
+				 * or is adjacent to existing segment */
+				seg_overlapping = overlap(&s->entry.mz, &tmp->entry.mz);
+				seg_adjacent = adjacent(&s->entry.mz, &tmp->entry.mz);
+
+				/* check if segments fully overlap or are fully adjacent */
+				if ((seg_adjacent == FULL) || (seg_overlapping == FULL)) {
+
+#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
+					RTE_LOG(DEBUG, EAL, "Concatenating segments\n");
+					RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
+					entry_dump(&s->entry);
+					RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
+					entry_dump(&tmp->entry);
+#endif
+
+					start1 = s->entry.mz.addr_64;
+					start2 = tmp->entry.mz.addr_64;
+					p_start1 = s->entry.mz.phys_addr;
+					p_start2 = tmp->entry.mz.phys_addr;
+					i_start1 = s->entry.mz.ioremap_addr;
+					i_start2 = tmp->entry.mz.ioremap_addr;
+					end1 = s->entry.mz.addr_64 + s->entry.mz.len;
+					end2 = tmp->entry.mz.addr_64 + tmp->entry.mz.len;
+
+					/* settle for minimum start address and maximum length */
+					s->entry.mz.addr_64 = RTE_MIN(start1, start2);
+					s->entry.mz.phys_addr = RTE_MIN(p_start1, p_start2);
+					s->entry.mz.ioremap_addr = RTE_MIN(i_start1, i_start2);
+					s->entry.offset = RTE_MIN(s->entry.offset, tmp->entry.offset);
+					s->entry.mz.len = RTE_MAX(end1, end2) - s->entry.mz.addr_64;
+					concat = 1;
+
+#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
+					RTE_LOG(DEBUG, EAL, "Resulting segment:\n");
+					entry_dump(&s->entry);
+
+#endif
+				}
+				/* if segments not fully overlap, we have an error condition.
+				 * adjacent segments can coexist.
+				 */
+				else if (seg_overlapping > 0) {
+					RTE_LOG(ERR, EAL, "Segments %i and %i overlap!\n", i, j);
+#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
+					RTE_LOG(DEBUG, EAL, "Segment %i:\n", i);
+					entry_dump(&s->entry);
+					RTE_LOG(DEBUG, EAL, "Segment %i:\n", j);
+					entry_dump(&tmp->entry);
+#endif
+					return -1;
+				}
+				if (concat)
+					break;
+			}
+			/* if we concatenated, remove segment at j */
+			if (concat) {
+				remove_segment(ms, tbl_len, j);
+				tbl_len--;
+				break;
+			}
+		}
+	}
+
+	return tbl_len;
+}
+
+/* Create, size, map and zero the shared config file, leaving ivshmem_config
+ * pointing at it. Returns 0 on success; on failure returns -1 with the
+ * descriptor closed and ivshmem_config never left equal to MAP_FAILED. */
+static int
+create_shared_config(void)
+{
+	char path[PATH_MAX];
+	int fd;
+
+	snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
+			internal_config.hugefile_prefix);
+	fd = open(path, O_CREAT | O_RDWR, 0600);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Could not open %s: %s\n", path, strerror(errno));
+		return -1;
+	}
+
+	/* try ex-locking first - if the file is locked, we have a problem */
+	if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
+		RTE_LOG(ERR, EAL, "Locking %s failed: %s\n", path, strerror(errno));
+		goto fail;
+	}
+	if (ftruncate(fd, sizeof(struct ivshmem_shared_config)) < 0) {
+		RTE_LOG(ERR, EAL, "ftruncate failed: %s\n", strerror(errno));
+		goto fail;
+	}
+
+	ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
+			PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (ivshmem_config == MAP_FAILED) {
+		ivshmem_config = NULL;	/* other code only tests for NULL */
+		goto fail;
+	}
+	memset(ivshmem_config, 0, sizeof(struct ivshmem_shared_config));
+
+	/* change the exclusive lock we got earlier to a shared lock */
+	if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+		RTE_LOG(ERR, EAL, "Locking %s failed: %s \n", path, strerror(errno));
+		goto fail;
+	}
+	/* NOTE(review): closing fd also drops the flock() lock - confirm intent */
+	close(fd);
+	return 0;
+fail:
+	close(fd);
+	return -1;
+}
+
+/* open shared config file and, if present, map the config.
+ * having no config file is not an error condition, as we later check if
+ * ivshmem_config is NULL (if it is, that means nothing was mapped).
+ * Returns 0 on success, -1 on error; the descriptor is closed on every path
+ * and ivshmem_config is reset to NULL when the mapping fails. */
+static int
+open_shared_config(void)
+{
+	char path[PATH_MAX];
+	int fd;
+
+	/* build ivshmem config file path */
+	snprintf(path, sizeof(path), IVSHMEM_CONFIG_PATH,
+			internal_config.hugefile_prefix);
+
+	fd = open(path, O_RDONLY);
+	/* if the file doesn't exist, just return success */
+	if (fd < 0 && errno == ENOENT)
+		return 0;
+	else if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
+				path, strerror(errno));
+		return -1;
+	}
+
+	/* try ex-locking first - if the lock *does* succeed, this means it's a
+	 * stray config file, so it should be deleted. */
+	if (flock(fd, LOCK_EX | LOCK_NB) != -1) {
+		/* if we can't remove the file, something is wrong */
+		if (unlink(path) < 0) {
+			RTE_LOG(ERR, EAL, "Could not remove %s: %s\n", path,
+					strerror(errno));
+			goto fail;
+		}
+		/* release the lock */
+		flock(fd, LOCK_UN);
+		close(fd);
+
+		/* return success as having a stray config file is equivalent to not
+		 * having config file at all. */
+		return 0;
+	}
+
+	ivshmem_config = mmap(NULL, sizeof(struct ivshmem_shared_config),
+			PROT_READ, MAP_SHARED, fd, 0);
+	if (ivshmem_config == MAP_FAILED) {
+		ivshmem_config = NULL;	/* other code only tests for NULL */
+		goto fail;
+	}
+
+	/* place a shared lock on config file */
+	if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+		RTE_LOG(ERR, EAL, "Locking %s failed: %s \n", path, strerror(errno));
+		goto fail;
+	}
+
+	close(fd);
+	return 0;
+
+fail:
+	close(fd);
+	return -1;
+}
+
+/*
+ * This function does the following:
+ *
+ * 1) Builds a table of ivshmem_segments with proper offset alignment
+ * 2) Cleans up that table so that we don't have any overlapping or adjacent
+ *    memory segments
+ * 3) Creates memsegs from this table and maps them into memory.
+ * Returns 0 on success and -1 on any failure.
+ */
+static inline int
+map_all_segments(void)
+{
+	struct ivshmem_segment ms_tbl[RTE_MAX_MEMSEG];
+	struct ivshmem_pci_device * pci_dev;
+	struct rte_mem_config * mcfg;
+	struct ivshmem_segment * seg;
+	int fd, fd_zero;
+	int ret = -1;
+	unsigned i, j;
+	struct rte_memzone mz;
+	struct rte_memseg ms;
+	void * base_addr;
+	uint64_t align, len;
+	phys_addr_t ioremap_addr;
+
+	memset(ms_tbl, 0, sizeof(ms_tbl));
+	memset(&mz, 0, sizeof(struct rte_memzone));
+	memset(&ms, 0, sizeof(struct rte_memseg));
+
+	/* first, build a table of memsegs to map, to avoid failed mmaps due to
+	 * overlaps
+	 */
+	for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMSEG; i++) {
+		if (i == RTE_MAX_MEMSEG) {
+			RTE_LOG(ERR, EAL, "Too many segments requested!\n");
+			return -1;
+		}
+
+		seg = &ivshmem_config->segment[i];
+
+		/* copy segment to table */
+		memcpy(&ms_tbl[i], seg, sizeof(struct ivshmem_segment));
+
+		/* find ioremap addr; restart from 0 for every segment so an address
+		 * matched for an earlier segment is never reused by mistake */
+		ioremap_addr = 0;
+		for (j = 0; j < DIM(ivshmem_config->pci_devs); j++) {
+			pci_dev = &ivshmem_config->pci_devs[j];
+			if (!strncmp(pci_dev->path, seg->path, sizeof(pci_dev->path))) {
+				ioremap_addr = pci_dev->ioremap_addr;
+				break;
+			}
+		}
+		if (ioremap_addr == 0) {
+			RTE_LOG(ERR, EAL, "Cannot find ioremap addr!\n");
+			return -1;
+		}
+
+		/* work out alignments */
+		align = seg->entry.mz.addr_64 -
+				RTE_ALIGN_FLOOR(seg->entry.mz.addr_64, 0x1000);
+		len = RTE_ALIGN_CEIL(seg->entry.mz.len + align, 0x1000);
+
+		/* save original alignments */
+		ms_tbl[i].align = align;
+
+		/* create a memory zone */
+		mz.addr_64 = seg->entry.mz.addr_64 - align;
+		mz.len = len;
+		mz.hugepage_sz = seg->entry.mz.hugepage_sz;
+		mz.phys_addr = seg->entry.mz.phys_addr - align;
+
+		/* find true physical address */
+		mz.ioremap_addr = ioremap_addr + seg->entry.offset - align;
+
+		ms_tbl[i].entry.offset = seg->entry.offset - align;
+
+		memcpy(&ms_tbl[i].entry.mz, &mz, sizeof(struct rte_memzone));
+	}
+
+	/* clean up the segments */
+	memseg_idx = cleanup_segments(ms_tbl, ivshmem_config->segment_idx);
+	if (memseg_idx < 0)
+		return -1;
+
+	mcfg = rte_eal_get_configuration()->mem_config;
+
+	fd_zero = open("/dev/zero", O_RDWR);
+	if (fd_zero < 0) {
+		RTE_LOG(ERR, EAL, "Cannot open /dev/zero: %s\n", strerror(errno));
+		return -1;
+	}
+
+	/* create memsegs and put them into DPDK memory */
+	for (i = 0; i < (unsigned) memseg_idx; i++) {
+		seg = &ms_tbl[i];
+
+		ms.addr_64 = seg->entry.mz.addr_64;
+		ms.hugepage_sz = seg->entry.mz.hugepage_sz;
+		ms.len = seg->entry.mz.len;
+		ms.nchannel = rte_memory_get_nchannel();
+		ms.nrank = rte_memory_get_nrank();
+		ms.phys_addr = seg->entry.mz.phys_addr;
+		ms.ioremap_addr = seg->entry.mz.ioremap_addr;
+		ms.socket_id = seg->entry.mz.socket_id;
+
+		base_addr = mmap(ms.addr, ms.len,
+				PROT_READ | PROT_WRITE, MAP_PRIVATE, fd_zero, 0);
+		if (base_addr == MAP_FAILED || base_addr != ms.addr) {
+			RTE_LOG(ERR, EAL, "Cannot map /dev/zero!\n");
+			goto out;
+		}
+
+		fd = open(seg->path, O_RDWR);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", seg->path,
+					strerror(errno));
+			goto out;
+		}
+
+		munmap(ms.addr, ms.len);
+
+		base_addr = mmap(ms.addr, ms.len,
+				PROT_READ | PROT_WRITE, MAP_SHARED, fd,
+				seg->entry.offset);
+		if (base_addr == MAP_FAILED || base_addr != ms.addr) {
+			RTE_LOG(ERR, EAL, "Cannot map segment into memory: "
+					"expected %p got %p (%s)\n", ms.addr, base_addr,
+					strerror(errno));
+			close(fd);
+			goto out;
+		}
+
+		RTE_LOG(DEBUG, EAL, "Memory segment mapped: %p (len %" PRIx64 ") at "
+				"offset 0x%" PRIx64 "\n",
+				ms.addr, ms.len, seg->entry.offset);
+
+		/* put the pointers back into their real positions using original
+		 * alignment */
+		ms.addr_64 += seg->align;
+		ms.phys_addr += seg->align;
+		ms.ioremap_addr += seg->align;
+		ms.len -= seg->align;
+
+		/* at this point, the rest of DPDK memory is not initialized, so we
+		 * expect memsegs to be empty */
+		memcpy(&mcfg->memseg[i], &ms,
+				sizeof(struct rte_memseg));
+
+		close(fd);
+
+		RTE_LOG(DEBUG, EAL, "IVSHMEM segment found, size: 0x%lx\n",
+				ms.len);
+	}
+	ret = 0;
+
+out:
+	/* the /dev/zero descriptor is only needed while probing addresses */
+	close(fd_zero);
+	return ret;
+}
+
+/* this happens at a later stage, after general EAL memory initialization.
+ * Registers IVSHMEM memzones and rings; returns 0 on success, -1 on error. */
+int
+rte_eal_ivshmem_obj_init(void)
+{
+	struct rte_ring_list* ring_list = NULL;
+	struct rte_mem_config * mcfg;
+	struct ivshmem_segment * seg;
+	struct rte_memzone * mz;
+	struct rte_ring * r;
+	struct rte_tailq_entry *te;
+	unsigned i, ms, idx;
+	uint64_t offset;
+
+	/* secondary process would not need any object discovery - it'll all
+	 * already be in shared config */
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY || ivshmem_config == NULL)
+		return 0;
+
+	/* check that we have an initialised ring tail queue */
+	ring_list = RTE_TAILQ_LOOKUP(RTE_TAILQ_RING_NAME, rte_ring_list);
+	if (ring_list == NULL) {
+		RTE_LOG(ERR, EAL, "No rte_ring tailq found!\n");
+		return -1;
+	}
+
+	mcfg = rte_eal_get_configuration()->mem_config;
+
+	/* create memzones */
+	for (i = 0; i < ivshmem_config->segment_idx && i <= RTE_MAX_MEMZONE; i++) {
+		seg = &ivshmem_config->segment[i];
+
+		/* add memzone */
+		if (mcfg->memzone_cnt == RTE_MAX_MEMZONE) {
+			RTE_LOG(ERR, EAL, "No more memory zones available!\n");
+			return -1;
+		}
+
+		idx = mcfg->memzone_cnt;
+
+		RTE_LOG(DEBUG, EAL, "Found memzone: '%s' at %p (len 0x%" PRIx64 ")\n",
+				seg->entry.mz.name, seg->entry.mz.addr, seg->entry.mz.len);
+
+		memcpy(&mcfg->memzone[idx], &seg->entry.mz,
+				sizeof(struct rte_memzone));
+
+		/* find ioremap address */
+		for (ms = 0; ms <= RTE_MAX_MEMSEG; ms++) {
+			if (ms == RTE_MAX_MEMSEG) {
+				RTE_LOG(ERR, EAL, "Physical address of segment not found!\n");
+				return -1;
+			}
+			if (CONTAINS(mcfg->memseg[ms], mcfg->memzone[idx])) {
+				offset = mcfg->memzone[idx].addr_64 -
+								mcfg->memseg[ms].addr_64;
+				mcfg->memzone[idx].ioremap_addr = mcfg->memseg[ms].ioremap_addr +
+						offset;
+				break;
+			}
+		}
+
+		mcfg->memzone_cnt++;
+	}
+
+	rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
+
+	/* find rings */
+	for (i = 0; i < mcfg->memzone_cnt; i++) {
+		mz = &mcfg->memzone[i];
+
+		/* check if memzone has a ring prefix */
+		if (strncmp(mz->name, RTE_RING_MZ_PREFIX,
+				sizeof(RTE_RING_MZ_PREFIX) - 1) != 0)
+			continue;
+		r = (struct rte_ring*) (mz->addr_64);
+
+		te = rte_zmalloc("RING_TAILQ_ENTRY", sizeof(*te), 0);
+		if (te == NULL) {
+			RTE_LOG(ERR, EAL, "Cannot allocate ring tailq entry!\n");
+			/* do not leave the tailq write lock held on the error path */
+			rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+			return -1;
+		}
+
+		te->data = (void *) r;
+
+		TAILQ_INSERT_TAIL(ring_list, te, next);
+
+		RTE_LOG(DEBUG, EAL, "Found ring: '%s' at %p\n", r->name, mz->addr);
+	}
+	rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+
+#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
+	rte_memzone_dump(stdout);
+	rte_ring_list_dump(stdout);
+#endif
+
+	return 0;
+}
+
+/* initialize ivshmem structures: discover IVSHMEM devices (or, in a secondary
+ * process, open the shared config), then map all segments. Returns 0 or -1. */
+int rte_eal_ivshmem_init(void)
+{
+	struct rte_pci_device * dev;
+	struct rte_pci_resource * res;
+	int fd, ret;
+	char path[PATH_MAX];
+
+	/* initialize everything to 0 */
+	memset(path, 0, sizeof(path));
+	ivshmem_config = NULL;
+
+	pagesz = getpagesize();
+
+	RTE_LOG(DEBUG, EAL, "Searching for IVSHMEM devices...\n");
+
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		if (open_shared_config() < 0) {
+			RTE_LOG(ERR, EAL, "Could not open IVSHMEM config!\n");
+			return -1;
+		}
+	}
+	else {
+		TAILQ_FOREACH(dev, &pci_device_list, next) {
+			if (is_ivshmem_device(dev)) {
+				/* IVSHMEM memory is always on BAR2 */
+				res = &dev->mem_resource[2];
+
+				/* if we don't have a BAR2 */
+				if (res->len == 0)
+					continue;
+
+				/* construct pci device path */
+				snprintf(path, sizeof(path), IVSHMEM_RESOURCE_PATH,
+						dev->addr.domain, dev->addr.bus, dev->addr.devid,
+						dev->addr.function);
+
+				/* try to find memseg */
+				fd = open(path, O_RDWR);
+				if (fd < 0) {
+					RTE_LOG(ERR, EAL, "Could not open %s\n", path);
+					return -1;
+				}
+
+				/* check if it's a DPDK IVSHMEM device */
+				ret = has_ivshmem_metadata(fd, res->len);
+
+				/* is DPDK device */
+				if (ret == 1) {
+
+					/* config file creation is deferred until the first
+					 * DPDK device is found. then, it has to be created
+					 * only once. */
+					if (ivshmem_config == NULL &&
+							create_shared_config() < 0) {
+						RTE_LOG(ERR, EAL, "Could not create IVSHMEM config!\n");
+						close(fd);
+						return -1;
+					}
+
+					/* check the device table before reading any metadata, so
+					 * that no segment is recorded for an unregistered device */
+					if (ivshmem_config->pci_devs_idx == RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS) {
+						RTE_LOG(WARNING, EAL,
+								"IVSHMEM PCI device limit exceeded. Increase "
+								"CONFIG_RTE_LIBRTE_IVSHMEM_MAX_PCI_DEVS  in "
+								"your config file.\n");
+						close(fd);
+						break;
+					}
+
+					if (read_metadata(path, sizeof(path), fd, res->len) < 0) {
+						RTE_LOG(ERR, EAL, "Could not read metadata from"
+								" device %02x:%02x.%x!\n", dev->addr.bus,
+								dev->addr.devid, dev->addr.function);
+						close(fd);
+						return -1;
+					}
+
+					RTE_LOG(INFO, EAL, "Found IVSHMEM device %02x:%02x.%x\n",
+							dev->addr.bus, dev->addr.devid, dev->addr.function);
+
+					ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].ioremap_addr = res->phys_addr;
+					snprintf(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path,
+							sizeof(ivshmem_config->pci_devs[ivshmem_config->pci_devs_idx].path),
+							"%s", path);
+
+					ivshmem_config->pci_devs_idx++;
+				}
+				/* failed to read */
+				else if (ret < 0) {
+					RTE_LOG(ERR, EAL, "Could not read IVSHMEM device: %s\n",
+							strerror(errno));
+					close(fd);
+					return -1;
+				}
+				/* not a DPDK device */
+				else
+					RTE_LOG(DEBUG, EAL, "Skipping non-DPDK IVSHMEM device\n");
+
+				/* close the BAR fd */
+				close(fd);
+			}
+		}
+	}
+
+	/* ivshmem_config is not NULL only if config was created and/or mapped */
+	if (ivshmem_config) {
+		if (map_all_segments() < 0) {
+			RTE_LOG(ERR, EAL, "Mapping IVSHMEM segments failed!\n");
+			return -1;
+		}
+	}
+	else {
+		RTE_LOG(DEBUG, EAL, "No IVSHMEM configuration found! \n");
+	}
+
+	return 0;
+}
+
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_lcore.c b/lib/librte_eal/linuxapp/eal/eal_lcore.c
new file mode 100644
index 00000000..de5b4260
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_lcore.c
@@ -0,0 +1,110 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <limits.h>
+#include <string.h>
+#include <dirent.h>
+
+#include <rte_log.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include <rte_string_fns.h>
+#include <rte_debug.h>
+
+#include "eal_private.h"
+#include "eal_filesystem.h"
+#include "eal_thread.h"
+
+#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u"
+#define CORE_ID_FILE "topology/core_id"
+#define NUMA_NODE_PATH "/sys/devices/system/node"
+
+/* Report whether the sysfs core_id file of a cpu exists (1) or not (0). */
+int
+eal_cpu_detected(unsigned lcore_id)
+{
+	char sysfs_path[PATH_MAX];
+	int written;
+
+	written = snprintf(sysfs_path, sizeof(sysfs_path),
+			SYS_CPU_DIR "/" CORE_ID_FILE, lcore_id);
+	/* a failed or truncated format cannot name an existing cpu */
+	if (written <= 0 || (unsigned)written >= sizeof(sysfs_path))
+		return 0;
+	return access(sysfs_path, F_OK) == 0;
+}
+
+/*
+ * Get CPU socket id (NUMA node) for a logical core.
+ *
+ * Probes the nodeX/cpuY entry under NUMA_NODE_PATH for every X below
+ * RTE_MAX_NUMA_NODES and returns the first X for which the entry exists.
+ * If no node has the lcore, returns zero.
+ */
+unsigned
+eal_cpu_socket_id(unsigned lcore_id)
+{
+	char path[PATH_MAX];
+	unsigned node = 0;
+
+	while (node < RTE_MAX_NUMA_NODES) {
+		snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH,
+				node, lcore_id);
+		if (access(path, F_OK) == 0)
+			return node;
+		node++;
+	}
+	return 0;
+}
+
+/* Get the cpu core id value from the /sys/.../cpuX core_id value */
+unsigned
+eal_cpu_core_id(unsigned lcore_id)
+{
+	char path[PATH_MAX];
+	unsigned long id;
+	int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id,
+			CORE_ID_FILE);
+
+	/* parse only a path that was formatted completely */
+	if (len > 0 && (unsigned)len < sizeof(path) &&
+			eal_parse_sysfs_value(path, &id) == 0)
+		return (unsigned)id;
+
+	/* every failure is reported the same way and falls back to core 0 */
+	RTE_LOG(ERR, EAL, "Error reading core id value from %s "
+			"for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id);
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c
new file mode 100644
index 00000000..0b133c3e
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_log.c
@@ -0,0 +1,146 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <sys/queue.h>
+
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_spinlock.h>
+#include <rte_log.h>
+
+#include "eal_private.h"
+
+/*
+ * Default log sink, used once mempool (hence log history) is available:
+ * records the message in the history, echoes it to stdout and to syslog.
+ */
+static ssize_t
+console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
+{
+	char msg[BUFSIZ + 1];
+	ssize_t written;
+	uint32_t priority;
+	/* syslog gets at most BUFSIZ bytes of the message (should not happen) */
+	const size_t msg_len = (size > BUFSIZ) ? BUFSIZ : size;
+
+	/* history first, with the untruncated message */
+	rte_log_add_in_history(buf, size);
+
+	/* then stdout; its byte count is what the caller gets back */
+	written = fwrite(buf, 1, size, stdout);
+	fflush(stdout);
+
+	/* Syslog error levels are from 0 to 7, so subtract 1 to convert */
+	priority = rte_log_cur_msg_loglevel() - 1;
+
+	/* "%s" needs a NUL-terminated string, so work on a bounded copy */
+	memcpy(msg, buf, msg_len);
+	msg[msg_len] = '\0';
+
+	/* write on syslog too */
+	syslog(priority, "%s", msg);
+
+	return written;
+}
+
+static cookie_io_functions_t console_log_func = {
+	.write = console_log_write,	/* only a write hook is set */
+};
+
+/*
+ * Switch logging to the console_log_func cookie stream; called during the
+ * eal init process, once memzones are available.
+ * Returns 0 on success, -1 on failure.
+ */
+int
+rte_eal_log_init(const char *id, int facility)
+{
+	FILE *console_stream;
+
+	console_stream = fopencookie(NULL, "w+", console_log_func);
+	if (!console_stream)
+		return -1;
+
+	/* the syslog connection uses the caller's ident and facility */
+	openlog(id, LOG_NDELAY | LOG_PID, facility);
+
+	/* any failure of the common log setup is reported as -1 */
+	return (rte_eal_common_log_init(console_stream) < 0) ? -1 : 0;
+}
+
+/* early logs */
+
+/*
+ * early log function, used during boot when mempool (hence log
+ * history) is not available. Returns the number of bytes written to
+ * stdout, as a fopencookie() write hook must (0 meaning error).
+ */
+static ssize_t
+early_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
+{
+	/* item size 1 makes fwrite() count bytes rather than whole messages */
+	ssize_t ret = fwrite(buf, 1, size, stdout);
+
+	fflush(stdout);
+	return ret;
+}
+
+static cookie_io_functions_t early_log_func = {
+	.write = early_log_write,	/* only a write hook is set */
+};
+static FILE *early_log_stream;	/* assigned by rte_eal_log_early_init() */
+
+/*
+ * Create the early log stream and hand it to rte_openlog_stream(); called
+ * by rte_eal_init() to enable early logs. Returns 0, or -1 on failure.
+ */
+int
+rte_eal_log_early_init(void)
+{
+	early_log_stream = fopencookie(NULL, "w+", early_log_func);
+	if (early_log_stream != NULL) {
+		rte_openlog_stream(early_log_stream);
+		return 0;
+	}
+	printf("Cannot configure early_log_stream\n");
+	return -1;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
new file mode 100644
index 00000000..5b9132c6
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -0,0 +1,1559 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*   BSD LICENSE
+ *
+ *   Copyright(c) 2013 6WIND.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdarg.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include <rte_string_fns.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include "eal_hugepages.h"
+
+#ifdef RTE_LIBRTE_XEN_DOM0
+int rte_xen_dom0_supported(void)
+{
+	return internal_config.xen_dom0_support;	/* value of the internal_config field */
+}
+#endif
+
+/**
+ * @file
+ * Huge page mapping under linux
+ *
+ * To reserve a big contiguous amount of memory, we use the hugepage
+ * feature of linux. For that, we need to have hugetlbfs mounted. This
+ * code will create many files in this directory (one per page) and
+ * map them in virtual memory. For each page, we will retrieve its
+ * physical address and remap it in order to have a virtual contiguous
+ * zone as well as a physical contiguous zone.
+ */
+
+static uint64_t baseaddr_offset;
+
+static unsigned proc_pagemap_readable;
+
+#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
+
+static void
+test_proc_pagemap_readable(void)
+{
+	/* opening the file is the whole probe; nothing is read from it */
+	int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+
+	if (pagemap_fd >= 0) {
+		close(pagemap_fd);
+		proc_pagemap_readable = 1;
+		return;
+	}
+
+	RTE_LOG(ERR, EAL,
+		"Cannot open /proc/self/pagemap: %s. "
+		"virt2phys address translation will not work\n",
+		strerror(errno));
+}
+
+/* Lock the page containing virt in memory; returns what mlock() returns. */
+int
+rte_mem_lock_page(const void *virt)
+{
+	const int pgsz = getpagesize();
+	/* round the address down to the start of its page */
+	void *page_start = (void *)((unsigned long)virt & ~(pgsz - 1));
+	return mlock(page_start, pgsz);
+}
+
+/* Get physical address of any mapped virtual address in the current
+ * process, or RTE_BAD_PHYS_ADDR when /proc/self/pagemap cannot be read. */
+phys_addr_t
+rte_mem_virt2phy(const void *virtaddr)
+{
+	int fd, page_size;
+	uint64_t page, physaddr = RTE_BAD_PHYS_ADDR;
+	unsigned long virt_pfn;
+	off_t offset;
+	ssize_t nread;
+
+	/* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
+	if (!proc_pagemap_readable)
+		return RTE_BAD_PHYS_ADDR;
+
+	page_size = getpagesize();
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
+			__func__, strerror(errno));
+		return RTE_BAD_PHYS_ADDR;
+	}
+
+	virt_pfn = (unsigned long)virtaddr / page_size;
+	offset = sizeof(uint64_t) * virt_pfn;
+	if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
+		RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
+				__func__, strerror(errno));
+		goto out;
+	}
+	nread = read(fd, &page, sizeof(page));
+	if (nread < 0) {
+		RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
+				__func__, strerror(errno));
+		goto out;
+	}
+	/* a short read leaves part of 'page' unset, so it must not be decoded */
+	if ((size_t)nread != sizeof(page)) {
+		RTE_LOG(ERR, EAL, "%s(): read %zd bytes from /proc/self/pagemap "
+				"but expected %zu\n", __func__, nread, sizeof(page));
+		goto out;
+	}
+
+	/* the pfn are bits 0-54 (see pagemap.txt in linux Documentation) */
+	physaddr = ((page & 0x7fffffffffffffULL) * page_size)
+		+ ((unsigned long)virtaddr % page_size);
+out:
+	close(fd);
+	return physaddr;
+}
+
+/*
+ * Resolve and store the physical address behind the orig_va of each of the
+ * first hpi->num_pages[0] hugepage table entries; -1 on the first failure.
+ */
+static int
+find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+	unsigned idx = 0;
+
+	while (idx < hpi->num_pages[0]) {
+		struct hugepage_file *page = &hugepg_tbl[idx++];
+		phys_addr_t pa = rte_mem_virt2phy(page->orig_va);
+		if (pa == RTE_BAD_PHYS_ADDR)
+			return -1;
+		page->physaddr = pa;
+	}
+	return 0;
+}
+
+/*
+ * Check whether address-space layout randomization is enabled in
+ * the kernel. This is important for multi-process as it can prevent
+ * two processes mapping data to the same virtual address
+ * Returns:
+ *    0 - address space randomization disabled
+ *    1/2 - address space randomization enabled
+ *    negative error code on error
+ */
+static int
+aslr_enabled(void)
+{
+	char c;
+	int retval, saved_errno, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
+
+	if (fd < 0)
+		return -errno;
+	retval = read(fd, &c, 1);
+	saved_errno = errno;	/* close() is allowed to overwrite errno */
+	close(fd);
+	if (retval < 0)
+		return -saved_errno;
+	if (retval == 0)
+		return -EIO;
+	/* only '0', '1' and '2' are accepted values */
+	if (c < '0' || c > '2')
+		return -EINVAL;
+	return c - '0';
+}
+
+/*
+ * Try to mmap *size bytes in /dev/zero. If it is successful, return the
+ * pointer to the mmap'd area and keep *size unmodified. Else, retry
+ * with a smaller zone: decrease *size by hugepage_sz until it reaches
+ * 0. In this case, return NULL. Note: this function returns an address
+ * which is a multiple of hugepage size.
+ *
+ * The probe mapping is released again before returning; the caller only
+ * receives an address range that was free at the time of the call. When
+ * internal_config.base_virtaddr is set, base_virtaddr + baseaddr_offset is
+ * passed to mmap() as a hint on every attempt. On success baseaddr_offset
+ * is advanced by the final *size.
+ */
+static void *
+get_virtual_area(size_t *size, size_t hugepage_sz)
+{
+	void *hint;
+	void *addr;
+	int fd;
+	int saved_errno = 0;
+	uintptr_t aligned_addr;
+
+	if (internal_config.base_virtaddr != 0) {
+		hint = (void*) (uintptr_t) (internal_config.base_virtaddr +
+				baseaddr_offset);
+	}
+	else hint = NULL;
+
+	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
+
+	fd = open("/dev/zero", O_RDONLY);
+	if (fd < 0){
+		RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
+		return NULL;
+	}
+	do {
+		/* keep the hint in its own variable: a failed attempt must not
+		 * turn MAP_FAILED into the hint of the next attempt */
+		addr = mmap(hint,
+				(*size) + hugepage_sz, PROT_READ, MAP_PRIVATE, fd, 0);
+		if (addr == MAP_FAILED) {
+			/* remember why mmap failed; close() may change errno */
+			saved_errno = errno;
+			/* shrink the request without wrapping below zero */
+			*size = (*size > hugepage_sz) ? *size - hugepage_sz : 0;
+		}
+	} while (addr == MAP_FAILED && *size > 0);
+
+	if (addr == MAP_FAILED) {
+		close(fd);
+		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
+			strerror(saved_errno));
+		return NULL;
+	}
+
+	/* one extra hugepage was mapped so the aligned start still fits */
+	munmap(addr, (*size) + hugepage_sz);
+	close(fd);
+
+	/* align addr to a huge page size boundary */
+	aligned_addr = (uintptr_t)addr;
+	aligned_addr += (hugepage_sz - 1);
+	aligned_addr &= (~(hugepage_sz - 1));
+	addr = (void *)(aligned_addr);
+
+	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
+		addr, *size);
+
+	/* increment offset */
+	baseaddr_offset += *size;
+
+	return addr;
+}
+
+/*
+ * Map every hugepage described by hpi (hpi->num_pages[0] entries) and
+ * record the resulting virtual address in hugepg_tbl.
+ *
+ * The function is meant to be run twice, selected by orig:
+ *  - orig != 0: fill in file_id, size and filepath of each entry, create
+ *    the backing file in hugetlbfs, mmap() hugepage_sz bytes of it and
+ *    store the address in hugepg_tbl[i].orig_va.
+ *  - orig == 0: open the filepath already stored in each entry, mmap() it
+ *    again and store the address in hugepg_tbl[i].final_va. This second
+ *    mapping tries to map contiguous physical blocks in contiguous
+ *    virtual blocks.
+ *
+ * Returns 0 on success, -1 if open(), mmap() or flock() fails for a page.
+ */
+static int
+map_all_hugepages(struct hugepage_file *hugepg_tbl,
+		struct hugepage_info *hpi, int orig)
+{
+	int fd;
+	unsigned i;
+	void *virtaddr;
+	void *vma_addr = NULL;	/* mmap() hint for the current page */
+	size_t vma_len = 0;	/* bytes left in the reserved virtual area */
+
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+	RTE_SET_USED(vma_len);
+#endif
+
+	for (i = 0; i < hpi->num_pages[0]; i++) {
+		uint64_t hugepage_sz = hpi->hugepage_sz;
+
+		/* NOTE: the if / else-if chain below is stitched together
+		 * across preprocessor blocks; which branches exist depends
+		 * on RTE_ARCH_64 and RTE_EAL_SINGLE_FILE_SEGMENTS. */
+		if (orig) {
+			hugepg_tbl[i].file_id = i;
+			hugepg_tbl[i].size = hugepage_sz;
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+			eal_get_hugefile_temp_path(hugepg_tbl[i].filepath,
+					sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
+					hugepg_tbl[i].file_id);
+#else
+			eal_get_hugefile_path(hugepg_tbl[i].filepath,
+					sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
+					hugepg_tbl[i].file_id);
+#endif
+			/* force NUL termination of the stored path */
+			hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
+		}
+#ifndef RTE_ARCH_64
+		/* for 32-bit systems, don't remap 1G and 16G pages, just reuse
+		 * original map address as final map address.
+		 */
+		else if ((hugepage_sz == RTE_PGSIZE_1G)
+			|| (hugepage_sz == RTE_PGSIZE_16G)) {
+			hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
+			hugepg_tbl[i].orig_va = NULL;
+			continue;
+		}
+#endif
+
+#ifndef RTE_EAL_SINGLE_FILE_SEGMENTS
+		/* second pass only: the previous reservation is used up, so
+		 * reserve virtual space for the next physical run */
+		else if (vma_len == 0) {
+			unsigned j, num_pages;
+
+			/* reserve a virtual area for next contiguous
+			 * physical block: count the number of
+			 * contiguous physical pages. */
+			for (j = i+1; j < hpi->num_pages[0] ; j++) {
+#ifdef RTE_ARCH_PPC_64
+				/* The physical addresses are sorted in
+				 * descending order on PPC64 */
+				if (hugepg_tbl[j].physaddr !=
+				    hugepg_tbl[j-1].physaddr - hugepage_sz)
+					break;
+#else
+				if (hugepg_tbl[j].physaddr !=
+				    hugepg_tbl[j-1].physaddr + hugepage_sz)
+					break;
+#endif
+			}
+			num_pages = j - i;
+			vma_len = num_pages * hugepage_sz;
+
+			/* get the biggest virtual memory area up to
+			 * vma_len. If it fails, vma_addr is NULL, so
+			 * let the kernel provide the address. */
+			vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
+			if (vma_addr == NULL)
+				vma_len = hugepage_sz;
+		}
+#endif
+
+		/* try to create hugepage file */
+		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+					strerror(errno));
+			return -1;
+		}
+
+		/* map the segment, and populate page tables,
+		 * the kernel fills this segment with zeros */
+		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
+				MAP_SHARED | MAP_POPULATE, fd, 0);
+		if (virtaddr == MAP_FAILED) {
+			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+					strerror(errno));
+			close(fd);
+			return -1;
+		}
+
+		if (orig) {
+			hugepg_tbl[i].orig_va = virtaddr;
+		}
+		else {
+			hugepg_tbl[i].final_va = virtaddr;
+		}
+
+		/* set shared flock on the file. */
+		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+				__func__, strerror(errno));
+			close(fd);
+			return -1;
+		}
+
+		/* the descriptor is not needed once the page is mapped */
+		close(fd);
+
+		/* advance the hint to the slot of the next page.
+		 * NOTE(review): in the orig pass no area is reserved, so
+		 * vma_addr starts at NULL and is still advanced here, and
+		 * vma_len wraps; confirm this is intended. */
+		vma_addr = (char *)vma_addr + hugepage_sz;
+		vma_len -= hugepage_sz;
+	}
+	return 0;
+}
+
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+
+/*
+ * Remaps all hugepages into single file segments.
+ *
+ * hugepg_tbl must already be sorted by physical address. Each run of
+ * physically contiguous pages is remapped through one new hugetlbfs file
+ * that is grown one page at a time while the original per-page mappings
+ * and files are removed. One table entry per resulting segment is written
+ * at the front of hugepg_tbl (final_va, physaddr, repeated, socket_id,
+ * filepath); the unused tail of the table is zeroed.
+ *
+ * Returns the number of segment entries written, or -1 on error.
+ */
+static int
+remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+	int fd;
+	unsigned i = 0, j, num_pages, page_idx = 0;
+	void *vma_addr = NULL, *old_addr = NULL, *page_addr = NULL;
+	size_t vma_len = 0;
+	size_t hugepage_sz = hpi->hugepage_sz;
+	size_t total_size, offset;
+	char filepath[MAX_HUGEPAGE_PATH];
+	phys_addr_t physaddr;
+	int socket;
+
+	while (i < hpi->num_pages[0]) {
+
+#ifndef RTE_ARCH_64
+		/* for 32-bit systems, don't remap 1G pages and 16G pages,
+		 * just reuse original map address as final map address.
+		 */
+		if ((hugepage_sz == RTE_PGSIZE_1G)
+			|| (hugepage_sz == RTE_PGSIZE_16G)) {
+			hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
+			hugepg_tbl[i].orig_va = NULL;
+			i++;
+			continue;
+		}
+#endif
+
+		/* reserve a virtual area for next contiguous
+		 * physical block: count the number of
+		 * contiguous physical pages. */
+		for (j = i+1; j < hpi->num_pages[0] ; j++) {
+#ifdef RTE_ARCH_PPC_64
+			/* The physical addresses are sorted in descending
+			 * order on PPC64 */
+			if (hugepg_tbl[j].physaddr !=
+				hugepg_tbl[j-1].physaddr - hugepage_sz)
+				break;
+#else
+			if (hugepg_tbl[j].physaddr !=
+				hugepg_tbl[j-1].physaddr + hugepage_sz)
+				break;
+#endif
+		}
+		num_pages = j - i;
+		vma_len = num_pages * hugepage_sz;
+
+		/* remember the socket before entry i can be overwritten */
+		socket = hugepg_tbl[i].socket_id;
+
+		/* get the biggest virtual memory area up to
+		 * vma_len. If it fails, vma_addr is NULL, so
+		 * let the kernel provide the address. */
+		vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
+
+		/* If we can't find a big enough virtual area, work out how many pages
+		 * we are going to get.
+		 * NOTE(review): with vma_addr == NULL the "vma_addr != old_addr"
+		 * check in the mapping loop below looks like it always fails;
+		 * confirm whether this path can succeed. */
+		if (vma_addr == NULL)
+			j = i + 1;
+		else if (vma_len != num_pages * hugepage_sz) {
+			num_pages = vma_len / hugepage_sz;
+			j = i + num_pages;
+
+		}
+
+		hugepg_tbl[page_idx].file_id = page_idx;
+		eal_get_hugefile_path(filepath,
+				sizeof(filepath),
+				hpi->hugedir,
+				hugepg_tbl[page_idx].file_id);
+
+		/* try to create hugepage file */
+		fd = open(filepath, O_CREAT | O_RDWR, 0755);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__, strerror(errno));
+			return -1;
+		}
+
+		total_size = 0;
+		for (;i < j; i++) {
+
+			/* unmap current segment */
+			if (total_size > 0)
+				munmap(vma_addr, total_size);
+
+			/* unmap original page */
+			munmap(hugepg_tbl[i].orig_va, hugepage_sz);
+			unlink(hugepg_tbl[i].filepath);
+
+			total_size += hugepage_sz;
+
+			old_addr = vma_addr;
+
+			/* map new, bigger segment, and populate page tables,
+			 * the kernel fills this segment with zeros */
+			vma_addr = mmap(vma_addr, total_size,
+					PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0);
+
+			/* the segment must stay at the reserved address */
+			if (vma_addr == MAP_FAILED || vma_addr != old_addr) {
+				RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, strerror(errno));
+				close(fd);
+				return -1;
+			}
+		}
+
+		/* set shared flock on the file. */
+		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+				__func__, strerror(errno));
+			close(fd);
+			return -1;
+		}
+
+		snprintf(hugepg_tbl[page_idx].filepath, MAX_HUGEPAGE_PATH, "%s",
+				filepath);
+
+		physaddr = rte_mem_virt2phy(vma_addr);
+
+		if (physaddr == RTE_BAD_PHYS_ADDR) {
+			/* do not leak the segment file descriptor on error */
+			close(fd);
+			return -1;
+		}
+
+		hugepg_tbl[page_idx].final_va = vma_addr;
+
+		hugepg_tbl[page_idx].physaddr = physaddr;
+
+		hugepg_tbl[page_idx].repeated = num_pages;
+
+		hugepg_tbl[page_idx].socket_id = socket;
+
+		close(fd);
+
+		/* verify the memory segment - that is, check that every VA corresponds
+		 * to the physical address we expect to see
+		 */
+		for (offset = 0; offset < vma_len; offset += hugepage_sz) {
+			uint64_t expected_physaddr;
+
+			expected_physaddr = hugepg_tbl[page_idx].physaddr + offset;
+			page_addr = RTE_PTR_ADD(vma_addr, offset);
+			physaddr = rte_mem_virt2phy(page_addr);
+
+			if (physaddr != expected_physaddr) {
+				/* offset is a size_t: widen it explicitly so it
+				 * matches the PRIx64 conversion on every ABI */
+				RTE_LOG(ERR, EAL, "Segment sanity check failed: wrong physaddr "
+						"at %p (offset 0x%" PRIx64 ": 0x%" PRIx64
+						" (expected 0x%" PRIx64 ")\n",
+						page_addr, (uint64_t)offset, physaddr,
+						expected_physaddr);
+				return -1;
+			}
+		}
+
+		page_idx++;
+	}
+
+	/* zero out the rest */
+	memset(&hugepg_tbl[page_idx], 0, (hpi->num_pages[0] - page_idx) * sizeof(struct hugepage_file));
+	return page_idx;
+}
+#else/* RTE_EAL_SINGLE_FILE_SEGMENTS=n */
+
+/*
+ * Release the first-pass (orig_va) mapping of every hugepage that still
+ * has one and clear the pointer. Always returns 0.
+ */
+static int
+unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+	struct hugepage_file *entry;
+	unsigned idx;
+
+	for (idx = 0; idx < hpi->num_pages[0]; idx++) {
+		entry = &hugepg_tbl[idx];
+		if (entry->orig_va == NULL)
+			continue;
+		munmap(entry->orig_va, hpi->hugepage_sz);
+		entry->orig_va = NULL;
+	}
+	return 0;
+}
+#endif /* RTE_EAL_SINGLE_FILE_SEGMENTS */
+
+/*
+ * Read /proc/self/numa_maps and store, for every hugepage whose orig_va
+ * matches the start address of a listed mapping, the NUMA node reported
+ * for that mapping.
+ *
+ * Returns 0 when at least hpi->num_pages[0] matches were recorded, or when
+ * numa_maps cannot be opened (nothing is written in that case); returns -1
+ * on a malformed line or when fewer matches were found.
+ */
+static int
+find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+	char line[BUFSIZ];
+	char hugedir_str[PATH_MAX];
+	unsigned matched = 0;
+	int ret = -1;
+	FILE *maps;
+
+	maps = fopen("/proc/self/numa_maps", "r");
+	if (maps == NULL) {
+		RTE_LOG(NOTICE, EAL, "cannot open /proc/self/numa_maps,"
+				" consider that all memory is in socket_id 0\n");
+		return 0;
+	}
+
+	snprintf(hugedir_str, sizeof(hugedir_str),
+			"%s/%s", hpi->hugedir, internal_config.hugefile_prefix);
+
+	while (fgets(line, sizeof(line), maps) != NULL) {
+		char *cursor, *node_txt;
+		uint64_t map_va;
+		unsigned idx;
+		void *va;
+		int node;
+
+		/* only lines describing huge page mappings are of interest */
+		if (strstr(line, " huge ") == NULL &&
+				strstr(line, hugedir_str) == NULL)
+			continue;
+
+		/* the line starts with the mapping address in hex */
+		map_va = strtoull(line, &cursor, 16);
+		if (map_va == 0 || cursor == line)
+			goto parse_error;
+
+		/* the node id sits between " N" and the following '=' */
+		node_txt = strstr(line, " N");
+		if (node_txt == NULL)
+			goto parse_error;
+		node_txt += 2;
+		cursor = strstr(node_txt, "=");
+		if (cursor == NULL)
+			goto parse_error;
+		*cursor = '\0';
+		cursor = NULL;
+
+		node = strtoul(node_txt, &cursor, 0);
+		if ((node_txt[0] == '\0') || (cursor == NULL) || (*cursor != '\0'))
+			goto parse_error;
+
+		/* tag every table entry mapped at this address */
+		va = (void *)(unsigned long)map_va;
+		for (idx = 0; idx < hpi->num_pages[0]; idx++) {
+			if (hugepg_tbl[idx].orig_va != va)
+				continue;
+			hugepg_tbl[idx].socket_id = node;
+			matched++;
+		}
+	}
+
+	if (matched >= hpi->num_pages[0])
+		ret = 0;
+	goto out;
+
+parse_error:
+	RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+out:
+	fclose(maps);
+	return ret;
+}
+
+/*
+ * qsort() comparator ordering hugepage_file entries by physical address:
+ * ascending by default, descending when built for PPC64.
+ */
+static int
+cmp_physaddr(const void *a, const void *b)
+{
+#ifndef RTE_ARCH_PPC_64
+	const struct hugepage_file *lhs = a;
+	const struct hugepage_file *rhs = b;
+#else
+	/* PowerPC needs memory sorted in reverse order from x86 */
+	const struct hugepage_file *lhs = b;
+	const struct hugepage_file *rhs = a;
+#endif
+
+	/* yields -1, 0 or 1 without risking overflow from a subtraction */
+	return (lhs->physaddr > rhs->physaddr) - (lhs->physaddr < rhs->physaddr);
+}
+
+/*
+ * Uses mmap to create a shared memory area for storage of data
+ * Used in this file to store the hugepage file map on disk
+ *
+ * filename is created if missing and resized to mem_size bytes before
+ * being mapped read/write and shared. Returns the mapping address, or
+ * NULL if open(), ftruncate() or mmap() fails.
+ */
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+	void *retval;
+	int fd = open(filename, O_CREAT | O_RDWR, 0666);
+	if (fd < 0)
+		return NULL;
+	if (ftruncate(fd, mem_size) < 0) {
+		close(fd);
+		return NULL;
+	}
+	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	/* the mapping stays valid after the descriptor is closed */
+	close(fd);
+	/* mmap() reports failure with MAP_FAILED, not NULL: translate it so
+	 * that the NULL check done by callers actually catches the error */
+	if (retval == MAP_FAILED)
+		return NULL;
+	return retval;
+}
+
+/*
+ * Pack the *active* entries of src (those whose final_va is set) into the
+ * front of dst, which is typically the shared memory table.
+ * Returns 0 on success, -1 if dst (dest_size entries) is too small.
+ */
+static int
+copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
+		const struct hugepage_file * src, int src_size)
+{
+	int written = 0;
+	int idx;
+
+	for (idx = 0; idx < src_size; idx++) {
+		/* entries without a final mapping are not copied */
+		if (src[idx].final_va == NULL)
+			continue;
+		/* refuse to write past the end of dst */
+		if (written == dest_size)
+			return -1;
+		memcpy(&dst[written], &src[idx], sizeof(struct hugepage_file));
+		written++;
+	}
+	return 0;
+}
+
+/*
+ * Remove the backing file of every hugepage in hugepg_tbl that still has a
+ * final mapping. The number of table entries visited is the sum of
+ * num_pages over the first num_hp_info sizes and all sockets in
+ * internal_config. A failed unlink() is only logged; always returns 0.
+ */
+static int
+unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
+		unsigned num_hp_info)
+{
+	unsigned sz_idx, node;
+	int idx, total = 0;
+
+	/* get total number of hugepages */
+	for (sz_idx = 0; sz_idx < num_hp_info; sz_idx++) {
+		for (node = 0; node < RTE_MAX_NUMA_NODES; node++)
+			total += internal_config.hugepage_info[sz_idx].num_pages[node];
+	}
+
+	for (idx = 0; idx < total; idx++) {
+		struct hugepage_file *hp = &hugepg_tbl[idx];
+
+		/* nothing to remove for pages that were already dropped */
+		if (hp->final_va == NULL)
+			continue;
+
+		if (unlink(hp->filepath) != 0) {
+			RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
+				__func__, hp->filepath, strerror(errno));
+		}
+	}
+	return 0;
+}
+
+/*
+ * unmaps hugepages that are not going to be used. since we originally allocate
+ * ALL hugepages (not just those we need), additional unmapping needs to be done.
+ *
+ * For every (page size, socket) pair, the first hpi[size].num_pages[socket]
+ * matching pages of hugepg_tbl are kept; every further matching page is
+ * unmapped, its final_va set to NULL and its file removed. With
+ * RTE_EAL_SINGLE_FILE_SEGMENTS an entry stands for hp->repeated pages and
+ * may be truncated instead of dropped entirely.
+ *
+ * Returns 0 on success, -1 if a file cannot be removed, opened or truncated.
+ */
+static int
+unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
+		struct hugepage_info *hpi,
+		unsigned num_hp_info)
+{
+	unsigned socket, size;
+	int page, nrpages = 0;
+
+	/* get total number of hugepages */
+	for (size = 0; size < num_hp_info; size++)
+		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
+			nrpages += internal_config.hugepage_info[size].num_pages[socket];
+
+	for (size = 0; size < num_hp_info; size++) {
+		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
+			/* pages of this size/socket kept so far */
+			unsigned pages_found = 0;
+
+			/* traverse until we have unmapped all the unused pages */
+			for (page = 0; page < nrpages; page++) {
+				struct hugepage_file *hp = &hugepg_tbl[page];
+
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+				/* if this page was already cleared */
+				if (hp->final_va == NULL)
+					continue;
+#endif
+
+				/* find a page that matches the criteria */
+				if ((hp->size == hpi[size].hugepage_sz) &&
+						(hp->socket_id == (int) socket)) {
+
+					/* if we skipped enough pages, unmap the rest */
+					if (pages_found == hpi[size].num_pages[socket]) {
+						uint64_t unmap_len;
+
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+						unmap_len = hp->size * hp->repeated;
+#else
+						unmap_len = hp->size;
+#endif
+
+						/* get start addr and len of the remaining segment */
+						munmap(hp->final_va, (size_t) unmap_len);
+
+						hp->final_va = NULL;
+						if (unlink(hp->filepath) == -1) {
+							RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
+									__func__, hp->filepath, strerror(errno));
+							return -1;
+						}
+					}
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+					/* else, check how much do we need to map */
+					else {
+						int nr_pg_left =
+								hpi[size].num_pages[socket] - pages_found;
+
+						/* if we need enough memory to fit into the segment */
+						if (hp->repeated <= nr_pg_left) {
+							pages_found += hp->repeated;
+						}
+						/* truncate the segment */
+						else {
+							uint64_t final_size = nr_pg_left * hp->size;
+							uint64_t seg_size = hp->repeated * hp->size;
+
+							void * unmap_va = RTE_PTR_ADD(hp->final_va,
+									final_size);
+							int fd;
+
+							munmap(unmap_va, seg_size - final_size);
+
+							fd = open(hp->filepath, O_RDWR);
+							if (fd < 0) {
+								RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+										hp->filepath, strerror(errno));
+								return -1;
+							}
+							if (ftruncate(fd, final_size) < 0) {
+								RTE_LOG(ERR, EAL, "Cannot truncate %s: %s\n",
+										hp->filepath, strerror(errno));
+								/* do not leak the descriptor on error */
+								close(fd);
+								return -1;
+							}
+							close(fd);
+
+							pages_found += nr_pg_left;
+							hp->repeated = nr_pg_left;
+						}
+					}
+#else
+					/* else, lock the page and skip */
+					else
+						pages_found++;
+#endif
+
+				} /* match page */
+			} /* foreach page */
+		} /* foreach socket */
+	} /* foreach pagesize */
+
+	return 0;
+}
+
+/*
+ * Sum, over all configured hugepage sizes that have a hugetlbfs directory,
+ * the bytes of hugepage memory recorded for the given socket.
+ */
+static inline uint64_t
+get_socket_mem_size(int socket)
+{
+	const struct hugepage_info *info = internal_config.hugepage_info;
+	uint64_t total = 0;
+	unsigned n;
+
+	for (n = 0; n < internal_config.num_hugepage_sizes; n++) {
+		/* sizes without a mount point contribute nothing */
+		if (info[n].hugedir == NULL)
+			continue;
+		total += info[n].hugepage_sz * info[n].num_pages[socket];
+	}
+
+	return total;
+}
+
+/*
+ * This function is a NUMA-aware equivalent of calc_num_pages.
+ * It takes in the list of hugepage sizes and the
+ * number of pages thereof, and calculates the best number of
+ * pages of each size to fulfill the request for <memory> ram
+ *
+ * memory      - per-socket amount of memory still to be satisfied, in
+ *               bytes; updated in place (overwritten first when
+ *               internal_config.force_sockets is 0).
+ * hp_info     - available pages per size and socket (read only here).
+ * hp_used     - receives hugedir and the chosen num_pages per size and
+ *               socket; its hugepage_sz fields are read, so the caller
+ *               must have filled them in beforehand.
+ * num_hp_info - number of entries in hp_info / hp_used.
+ *
+ * Returns the total number of pages selected, or -1 if num_hp_info is 0 or
+ * the request cannot be satisfied.
+ */
+static int
+calc_num_pages_per_socket(uint64_t * memory,
+		struct hugepage_info *hp_info,
+		struct hugepage_info *hp_used,
+		unsigned num_hp_info)
+{
+	unsigned socket, j, i = 0;
+	unsigned requested, available;
+	int total_num_pages = 0;
+	uint64_t remaining_mem, cur_mem;
+	uint64_t total_mem = internal_config.memory;
+
+	if (num_hp_info == 0)
+		return -1;
+
+	/* if specific memory amounts per socket weren't requested */
+	if (internal_config.force_sockets == 0) {
+		int cpu_per_socket[RTE_MAX_NUMA_NODES];
+		size_t default_size, total_size;
+		unsigned lcore_id;
+
+		/* Compute number of cores per socket */
+		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
+		RTE_LCORE_FOREACH(lcore_id) {
+			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
+		}
+
+		/*
+		 * Automatically spread requested memory amongst detected sockets according
+		 * to number of cores from cpu mask present on each socket
+		 */
+		total_size = internal_config.memory;
+		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
+
+			/* Set memory amount per socket */
+			default_size = (internal_config.memory * cpu_per_socket[socket])
+			                / rte_lcore_count();
+
+			/* Limit to maximum available memory on socket */
+			default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
+
+			/* Update sizes */
+			memory[socket] = default_size;
+			total_size -= default_size;
+		}
+
+		/*
+		 * If some memory is remaining, try to allocate it by getting all
+		 * available memory from sockets, one after the other
+		 */
+		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
+			/* take whatever is available */
+			default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
+			                       total_size);
+
+			/* Update sizes */
+			memory[socket] += default_size;
+			total_size -= default_size;
+		}
+	}
+
+	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
+		/* skips if the memory on specific socket wasn't requested */
+		for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
+			hp_used[i].hugedir = hp_info[i].hugedir;
+			/* whole pages of this size that fit, capped by availability */
+			hp_used[i].num_pages[socket] = RTE_MIN(
+					memory[socket] / hp_info[i].hugepage_sz,
+					hp_info[i].num_pages[socket]);
+
+			/* bytes covered by the pages just selected */
+			cur_mem = hp_used[i].num_pages[socket] *
+					hp_used[i].hugepage_sz;
+
+			memory[socket] -= cur_mem;
+			total_mem -= cur_mem;
+
+			total_num_pages += hp_used[i].num_pages[socket];
+
+			/* check if we have met all memory requests */
+			if (memory[socket] == 0)
+				break;
+
+			/* check if we have any more pages left at this size, if so
+			 * move on to next size */
+			if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
+				continue;
+			/* At this point we know that there are more pages available that are
+			 * bigger than the memory we want, so lets see if we can get enough
+			 * from other page sizes.
+			 */
+			remaining_mem = 0;
+			for (j = i+1; j < num_hp_info; j++)
+				remaining_mem += hp_info[j].hugepage_sz *
+				hp_info[j].num_pages[socket];
+
+			/* is there enough other memory, if not allocate another page and quit */
+			if (remaining_mem < memory[socket]){
+				/* one extra page covers the remainder; account only for
+				 * the part of it that was actually requested */
+				cur_mem = RTE_MIN(memory[socket],
+						hp_info[i].hugepage_sz);
+				memory[socket] -= cur_mem;
+				total_mem -= cur_mem;
+				hp_used[i].num_pages[socket]++;
+				total_num_pages++;
+				break; /* we are done with this socket*/
+			}
+		}
+		/* if we didn't satisfy all memory requirements per socket */
+		if (memory[socket] > 0) {
+			/* to prevent icc errors */
+			/* both figures are reported in MB (0x100000 bytes) */
+			requested = (unsigned) (internal_config.socket_mem[socket] /
+					0x100000);
+			available = requested -
+					((unsigned) (memory[socket] / 0x100000));
+			RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
+					"Requested: %uMB, available: %uMB\n", socket,
+					requested, available);
+			return -1;
+		}
+	}
+
+	/* if we didn't satisfy total memory requirements */
+	if (total_mem > 0) {
+		requested = (unsigned) (internal_config.memory / 0x100000);
+		available = requested - (unsigned) (total_mem / 0x100000);
+		RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
+				" available: %uMB\n", requested, available);
+		return -1;
+	}
+	return total_num_pages;
+}
+
+/*
+ * Prepare physical memory mapping: fill configuration structure with
+ * these infos, return 0 on success.
+ *  1. map N huge pages in separate files in hugetlbfs
+ *  2. find associated physical addr
+ *  3. find associated NUMA socket ID
+ *  4. sort all huge pages by physical address
+ *  5. remap these N huge pages in the correct order
+ *  6. unmap the first mapping
+ *  7. fill memsegs in configuration with contiguous zones
+ *
+ * Returns -1 on failure, or -ENOMEM when the pages do not fit into the
+ * RTE_MAX_MEMSEG available memory segments.
+ */
+int
+rte_eal_hugepage_init(void)
+{
+	struct rte_mem_config *mcfg;
+	struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
+	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+
+	uint64_t memory[RTE_MAX_NUMA_NODES];
+
+	unsigned hp_offset;
+	int i, j, new_memseg;
+	int nr_hugefiles = 0, nr_hugepages = 0;
+	void *addr;
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+	int new_pages_count[MAX_HUGEPAGE_SIZES];
+#endif
+
+	test_proc_pagemap_readable();
+
+	memset(used_hp, 0, sizeof(used_hp));
+
+	/* get pointer to global configuration */
+	mcfg = rte_eal_get_configuration()->mem_config;
+
+	/* hugetlbfs can be disabled */
+	if (internal_config.no_hugetlbfs) {
+		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
+				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+		if (addr == MAP_FAILED) {
+			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
+					strerror(errno));
+			return -1;
+		}
+		/* no real physical address here: the virtual one is stored */
+		mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
+		mcfg->memseg[0].addr = addr;
+		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
+		mcfg->memseg[0].len = internal_config.memory;
+		mcfg->memseg[0].socket_id = 0;
+		return 0;
+	}
+
+/* check if app runs on Xen Dom0 */
+	if (internal_config.xen_dom0_support) {
+#ifdef RTE_LIBRTE_XEN_DOM0
+		/* use dom0_mm kernel driver to init memory */
+		if (rte_xen_dom0_memory_init() < 0)
+			return -1;
+		else
+			return 0;
+#endif
+	}
+
+	/* calculate total number of hugepages available. at this point we haven't
+	 * yet started sorting them so they all are on socket 0 */
+	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+		/* meanwhile, also initialize used_hp hugepage sizes in used_hp */
+		used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
+
+		nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
+	}
+
+	/*
+	 * allocate a memory area for hugepage table.
+	 * this isn't shared memory yet. due to the fact that we need some
+	 * processing done on these pages, shared memory will be created
+	 * at a later stage.
+	 */
+	tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
+	if (tmp_hp == NULL)
+		goto fail;
+
+	memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
+
+	hp_offset = 0; /* where we start the current page size entries */
+
+	/* map all hugepages and sort them */
+	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		struct hugepage_info *hpi;
+
+		/*
+		 * we don't yet mark hugepages as used at this stage, so
+		 * we just map all hugepages available to the system
+		 * all hugepages are still located on socket 0
+		 */
+		hpi = &internal_config.hugepage_info[i];
+
+		if (hpi->num_pages[0] == 0)
+			continue;
+
+		/* map all hugepages available */
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
+			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
+					(unsigned)(hpi->hugepage_sz / 0x100000));
+			goto fail;
+		}
+
+		/* find physical addresses and sockets for each hugepage */
+		if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0){
+			RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
+					(unsigned)(hpi->hugepage_sz / 0x100000));
+			goto fail;
+		}
+
+		if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
+			RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
+					(unsigned)(hpi->hugepage_sz / 0x100000));
+			goto fail;
+		}
+
+		qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
+		      sizeof(struct hugepage_file), cmp_physaddr);
+
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+		/* remap all hugepages into single file segments */
+		new_pages_count[i] = remap_all_hugepages(&tmp_hp[hp_offset], hpi);
+		if (new_pages_count[i] < 0){
+			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+					(unsigned)(hpi->hugepage_sz / 0x100000));
+			goto fail;
+		}
+
+		/* we have processed a num of hugepages of this size, so inc offset */
+		hp_offset += new_pages_count[i];
+#else
+		/* remap all hugepages */
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
+			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+					(unsigned)(hpi->hugepage_sz / 0x100000));
+			goto fail;
+		}
+
+		/* unmap original mappings */
+		if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
+			goto fail;
+
+		/* we have processed a num of hugepages of this size, so inc offset */
+		hp_offset += hpi->num_pages[0];
+#endif
+	}
+
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+	nr_hugefiles = 0;
+	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+		nr_hugefiles += new_pages_count[i];
+	}
+#else
+	nr_hugefiles = nr_hugepages;
+#endif
+
+
+	/* clean out the numbers of pages */
+	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
+		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
+			internal_config.hugepage_info[i].num_pages[j] = 0;
+
+	/* get hugepages for each socket */
+	for (i = 0; i < nr_hugefiles; i++) {
+		int socket = tmp_hp[i].socket_id;
+
+		/* find a hugepage info with right size and increment num_pages */
+		const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
+				(int)internal_config.num_hugepage_sizes);
+		for (j = 0; j < nb_hpsizes; j++) {
+			if (tmp_hp[i].size ==
+					internal_config.hugepage_info[j].hugepage_sz) {
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+				internal_config.hugepage_info[j].num_pages[socket] +=
+					tmp_hp[i].repeated;
+#else
+				internal_config.hugepage_info[j].num_pages[socket]++;
+#endif
+			}
+		}
+	}
+
+	/* make a copy of socket_mem, needed for number of pages calculation */
+	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+		memory[i] = internal_config.socket_mem[i];
+
+	/* calculate final number of pages */
+	nr_hugepages = calc_num_pages_per_socket(memory,
+			internal_config.hugepage_info, used_hp,
+			internal_config.num_hugepage_sizes);
+
+	/* error if not enough memory available */
+	if (nr_hugepages < 0)
+		goto fail;
+
+	/* reporting in! */
+	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+		for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+			if (used_hp[i].num_pages[j] > 0) {
+				RTE_LOG(DEBUG, EAL,
+					"Requesting %u pages of size %uMB"
+					" from socket %i\n",
+					used_hp[i].num_pages[j],
+					(unsigned)
+					(used_hp[i].hugepage_sz / 0x100000),
+					j);
+			}
+		}
+	}
+
+	/* create shared memory */
+	hugepage = create_shared_memory(eal_hugepage_info_path(),
+			nr_hugefiles * sizeof(struct hugepage_file));
+
+	/* a failed mmap() may be reported as MAP_FAILED rather than NULL */
+	if (hugepage == NULL || hugepage == MAP_FAILED) {
+		/* nothing to unmap on the fail path */
+		hugepage = NULL;
+		RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+		goto fail;
+	}
+	memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
+
+	/*
+	 * unmap pages that we won't need (looks at used_hp).
+	 * also, sets final_va to NULL on pages that were unmapped.
+	 */
+	if (unmap_unneeded_hugepages(tmp_hp, used_hp,
+			internal_config.num_hugepage_sizes) < 0) {
+		RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
+		goto fail;
+	}
+
+	/*
+	 * copy stuff from malloc'd hugepage* to the actual shared memory.
+	 * this procedure only copies those hugepages that have final_va
+	 * not NULL. has overflow protection.
+	 */
+	if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
+			tmp_hp, nr_hugefiles) < 0) {
+		RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
+		goto fail;
+	}
+
+	/* free the hugepage backing files */
+	if (internal_config.hugepage_unlink &&
+		unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
+		RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n");
+		goto fail;
+	}
+
+	/* free the temporary hugepage table */
+	free(tmp_hp);
+	tmp_hp = NULL;
+
+	/* find earliest free memseg - this is needed because in case of IVSHMEM,
+	 * segments might have already been initialized.
+	 * j ends up one below the first free slot, or at RTE_MAX_MEMSEG when
+	 * every slot is already taken. */
+	for (j = 0; j < RTE_MAX_MEMSEG; j++)
+		if (mcfg->memseg[j].addr == NULL) {
+			/* move to previous segment and exit loop */
+			j--;
+			break;
+		}
+
+	for (i = 0; i < nr_hugefiles; i++) {
+		new_memseg = 0;
+
+		/* if this is a new section, create a new memseg */
+		if (i == 0)
+			new_memseg = 1;
+		else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
+			new_memseg = 1;
+		else if (hugepage[i].size != hugepage[i-1].size)
+			new_memseg = 1;
+
+#ifdef RTE_ARCH_PPC_64
+		/* On PPC64 architecture, the mmap always start from higher
+		 * virtual address to lower address. Here, both the physical
+		 * address and virtual address are in descending order */
+		else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
+		    hugepage[i].size)
+			new_memseg = 1;
+		else if (((unsigned long)hugepage[i-1].final_va -
+		    (unsigned long)hugepage[i].final_va) != hugepage[i].size)
+			new_memseg = 1;
+#else
+		else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
+		    hugepage[i].size)
+			new_memseg = 1;
+		else if (((unsigned long)hugepage[i].final_va -
+		    (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
+			new_memseg = 1;
+#endif
+
+		if (new_memseg) {
+			j += 1;
+			/* ">=": when no slot was free j started at
+			 * RTE_MAX_MEMSEG and is now one past it; an equality
+			 * test would let the writes below run out of bounds */
+			if (j >= RTE_MAX_MEMSEG)
+				break;
+
+			mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
+			mcfg->memseg[j].addr = hugepage[i].final_va;
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+			mcfg->memseg[j].len = hugepage[i].size * hugepage[i].repeated;
+#else
+			mcfg->memseg[j].len = hugepage[i].size;
+#endif
+			mcfg->memseg[j].socket_id = hugepage[i].socket_id;
+			mcfg->memseg[j].hugepage_sz = hugepage[i].size;
+		}
+		/* continuation of previous memseg */
+		else {
+#ifdef RTE_ARCH_PPC_64
+		/* Use the phy and virt address of the last page as segment
+		 * address for IBM Power architecture */
+			mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
+			mcfg->memseg[j].addr = hugepage[i].final_va;
+#endif
+			mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
+		}
+		hugepage[i].memseg_id = j;
+	}
+
+	/* the loop above stopped early: we ran out of memseg slots */
+	if (i < nr_hugefiles) {
+		RTE_LOG(ERR, EAL, "Can only reserve %d pages "
+			"from %d requested\n"
+			"Current %s=%d is not enough\n"
+			"Please either increase it or request less amount "
+			"of memory.\n",
+			i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
+			RTE_MAX_MEMSEG);
+		return -ENOMEM;
+	}
+
+	return 0;
+
+fail:
+	free(tmp_hp);
+	/* drop the shared table mapping if it was already created */
+	if (hugepage != NULL)
+		munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
+	return -1;
+}
+
+/*
+ * Report the on-disk size, in bytes, of the file referred to by fd.
+ * A failing fstat() is reported as a size of 0.
+ */
+static off_t
+getFileSize(int fd)
+{
+	struct stat info;
+
+	return (fstat(fd, &info) < 0) ? 0 : info.st_size;
+}
+
+/*
+ * This creates the memory mappings in the secondary process to match that of
+ * the server process. It goes through each memory segment in the DPDK runtime
+ * configuration and finds the hugepages which form that segment, mapping them
+ * in order to form a contiguous block in the virtual memory space.
+ *
+ * The work is done in two passes: first every segment's address range is
+ * reserved by mapping /dev/zero at the primary's address, then each
+ * reservation is released and replaced by the hugepage files listed in the
+ * hugepage info file.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+rte_eal_hugepage_attach(void)
+{
+	const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	const struct hugepage_file *hp = NULL;
+	unsigned num_hp = 0;
+	unsigned i, s = 0; /* s used to track the segment number */
+	off_t size = 0;
+	int fd, fd_zero = -1, fd_hugepage = -1;
+
+	if (aslr_enabled() > 0) {
+		RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
+				"(ASLR) is enabled in the kernel.\n");
+		RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
+				"into secondary processes\n");
+	}
+
+	test_proc_pagemap_readable();
+
+	if (internal_config.xen_dom0_support) {
+#ifdef RTE_LIBRTE_XEN_DOM0
+		if (rte_xen_dom0_memory_attach() < 0) {
+			RTE_LOG(ERR, EAL,"Failed to attach memory setments of primay "
+					"process\n");
+			return -1;
+		}
+		return 0;
+#endif
+	}
+
+	fd_zero = open("/dev/zero", O_RDONLY);
+	if (fd_zero < 0) {
+		RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
+		goto error;
+	}
+	fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
+	if (fd_hugepage < 0) {
+		RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
+		goto error;
+	}
+
+	/* map all segments into memory to make sure we get the addrs */
+	for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
+		void *base_addr;
+
+		/*
+		 * the first memory segment with len==0 is the one that
+		 * follows the last valid segment.
+		 */
+		if (mcfg->memseg[s].len == 0)
+			break;
+
+#ifdef RTE_LIBRTE_IVSHMEM
+		/*
+		 * if segment has ioremap address set, it's an IVSHMEM segment and
+		 * doesn't need mapping as it was already mapped earlier
+		 */
+		if (mcfg->memseg[s].ioremap_addr != 0)
+			continue;
+#endif
+
+		/*
+		 * fdzero is mmapped to get a contiguous block of virtual
+		 * addresses of the appropriate memseg size.
+		 * use mmap to get identical addresses as the primary process.
+		 */
+		base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
+				 PROT_READ, MAP_PRIVATE, fd_zero, 0);
+		if (base_addr == MAP_FAILED ||
+		    base_addr != mcfg->memseg[s].addr) {
+			RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
+				"in /dev/zero to requested address [%p]: '%s'\n",
+				(unsigned long long)mcfg->memseg[s].len,
+				mcfg->memseg[s].addr, strerror(errno));
+			if (aslr_enabled() > 0) {
+				RTE_LOG(ERR, EAL, "It is recommended to "
+					"disable ASLR in the kernel "
+					"and retry running both primary "
+					"and secondary processes\n");
+			}
+			goto error;
+		}
+	}
+
+	size = getFileSize(fd_hugepage);
+	hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
+	/* mmap() signals failure with MAP_FAILED, never with NULL */
+	if (hp == MAP_FAILED) {
+		hp = NULL; /* nothing for the error path to unmap */
+		RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
+		goto error;
+	}
+
+	num_hp = size / sizeof(struct hugepage_file);
+	RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
+
+	s = 0;
+	while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
+		void *addr, *base_addr;
+		uintptr_t offset = 0;
+		size_t mapping_size;
+#ifdef RTE_LIBRTE_IVSHMEM
+		/*
+		 * if segment has ioremap address set, it's an IVSHMEM segment and
+		 * doesn't need mapping as it was already mapped earlier
+		 */
+		if (mcfg->memseg[s].ioremap_addr != 0) {
+			s++;
+			continue;
+		}
+#endif
+		/*
+		 * free previously mapped memory so we can map the
+		 * hugepages into the space
+		 */
+		base_addr = mcfg->memseg[s].addr;
+		munmap(base_addr, mcfg->memseg[s].len);
+
+		/* find the hugepages for this segment and map them
+		 * we don't need to worry about order, as the server sorted the
+		 * entries before it did the second mmap of them */
+		for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
+			if (hp[i].memseg_id == (int)s){
+				fd = open(hp[i].filepath, O_RDWR);
+				if (fd < 0) {
+					RTE_LOG(ERR, EAL, "Could not open %s\n",
+						hp[i].filepath);
+					goto error;
+				}
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+				mapping_size = hp[i].size * hp[i].repeated;
+#else
+				mapping_size = hp[i].size;
+#endif
+				addr = mmap(RTE_PTR_ADD(base_addr, offset),
+						mapping_size, PROT_READ | PROT_WRITE,
+						MAP_SHARED, fd, 0);
+				close(fd); /* close file both on success and on failure */
+				if (addr == MAP_FAILED ||
+						addr != RTE_PTR_ADD(base_addr, offset)) {
+					RTE_LOG(ERR, EAL, "Could not mmap %s\n",
+						hp[i].filepath);
+					goto error;
+				}
+				offset+=mapping_size;
+			}
+		}
+		RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
+				(unsigned long long)mcfg->memseg[s].len);
+		s++;
+	}
+	/* unmap the hugepage config file, since we are done using it */
+	munmap((void *)(uintptr_t)hp, size);
+	close(fd_zero);
+	close(fd_hugepage);
+	return 0;
+
+error:
+	/* release the hugepage info mapping if it was established */
+	if (hp != NULL)
+		munmap((void *)(uintptr_t)hp, size);
+	if (fd_zero >= 0)
+		close(fd_zero);
+	if (fd_hugepage >= 0)
+		close(fd_hugepage);
+	return -1;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
new file mode 100644
index 00000000..dbf12a84
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -0,0 +1,762 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <dirent.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+#include <rte_devargs.h>
+#include <rte_memcpy.h>
+
+#include "eal_filesystem.h"
+#include "eal_private.h"
+#include "eal_pci_init.h"
+
+/**
+ * @file
+ * PCI probing under linux
+ *
+ * This code is used to simulate a PCI probe by parsing information in sysfs.
+ * When a registered device matches a driver, it is then initialized with
+ * IGB_UIO driver (or doesn't initialize, if the device wasn't bound to it).
+ */
+
+/*
+ * Unbind the kernel driver for this device by writing the device's PCI
+ * address to its sysfs "driver/unbind" file.
+ *
+ * Returns 0 when the unbind file cannot be opened (treated as "device was
+ * not bound") or when the write succeeded, -1 on any formatting or write
+ * error.
+ */
+int
+pci_unbind_kernel_driver(struct rte_pci_device *dev)
+{
+	int n;
+	FILE *f;
+	char filename[PATH_MAX];
+	char buf[BUFSIZ];
+	struct rte_pci_addr *loc = &dev->addr;
+
+	/* open /sys/bus/pci/devices/AAAA:BB:CC.D/driver */
+	snprintf(filename, sizeof(filename),
+		 SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/driver/unbind",
+		 loc->domain, loc->bus, loc->devid, loc->function);
+
+	f = fopen(filename, "w");
+	if (f == NULL) /* device was not bound */
+		return 0;
+
+	n = snprintf(buf, sizeof(buf), PCI_PRI_FMT "\n",
+		     loc->domain, loc->bus, loc->devid, loc->function);
+	if ((n < 0) || (n >= (int)sizeof(buf))) {
+		RTE_LOG(ERR, EAL, "%s(): snprintf failed\n", __func__);
+		goto error;
+	}
+	if (fwrite(buf, n, 1, f) == 0) {
+		RTE_LOG(ERR, EAL, "%s(): could not write to %s\n", __func__,
+				filename);
+		goto error;
+	}
+
+	/*
+	 * stdio buffers the short string above, so the data only reaches
+	 * sysfs when the stream is flushed by fclose(); a write rejected by
+	 * the kernel is therefore reported here, not by fwrite().
+	 */
+	if (fclose(f) != 0) {
+		RTE_LOG(ERR, EAL, "%s(): could not write to %s\n", __func__,
+				filename);
+		return -1;
+	}
+	return 0;
+
+error:
+	fclose(f);
+	return -1;
+}
+
+/*
+ * Read the symbolic link `filename` and copy the last component of its
+ * target (the text after the final '/') into dri_name.
+ *
+ * Returns 0 on success, 1 when the link cannot be read (device has no
+ * driver), and -1 on NULL arguments, a target that fills the whole buffer,
+ * or a target containing no '/'.
+ */
+static int
+pci_get_kernel_driver_by_path(const char *filename, char *dri_name)
+{
+	char target[PATH_MAX];
+	const char *base;
+	int len;
+
+	if (filename == NULL || dri_name == NULL)
+		return -1;
+
+	len = readlink(filename, target, PATH_MAX);
+	if (len >= PATH_MAX)
+		return -1;
+	if (len < 0)
+		return 1; /* device does not have a driver */
+
+	/* readlink() does not terminate the string */
+	target[len] = '\0';
+
+	base = strrchr(target, '/');
+	if (base == NULL)
+		return -1;
+
+	base++;
+	strncpy(dri_name, base, strlen(base) + 1);
+	return 0;
+}
+
+/*
+ * Map the resources of a PCI device according to the kernel driver it is
+ * bound to.
+ *
+ * Returns the result of the VFIO or UIO mapping routine, -1 when the device
+ * is bound to VFIO but VFIO support is not compiled in or not enabled, and
+ * 1 when the device is not managed by a supported kernel driver.
+ */
+int
+rte_eal_pci_map_device(struct rte_pci_device *dev)
+{
+	switch (dev->kdrv) {
+	case RTE_KDRV_VFIO:
+#ifdef VFIO_PRESENT
+		/* try mapping the NIC resources using VFIO if it exists */
+		if (pci_vfio_is_enabled())
+			return pci_vfio_map_resource(dev);
+#endif
+		return -1;
+
+	case RTE_KDRV_IGB_UIO:
+	case RTE_KDRV_UIO_GENERIC:
+		/* map resources for devices that use uio */
+		return pci_uio_map_resource(dev);
+
+	default:
+		RTE_LOG(DEBUG, EAL,
+			"  Not managed by a supported kernel driver, skipped\n");
+		return 1;
+	}
+}
+
+/*
+ * Unmap the resources of a PCI device. Only UIO-bound devices are actually
+ * unmapped; VFIO and unsupported drivers just produce a log message.
+ */
+void
+rte_eal_pci_unmap_device(struct rte_pci_device *dev)
+{
+	if (dev->kdrv == RTE_KDRV_VFIO) {
+		RTE_LOG(ERR, EAL, "Hotplug doesn't support vfio yet\n");
+	} else if (dev->kdrv == RTE_KDRV_IGB_UIO ||
+			dev->kdrv == RTE_KDRV_UIO_GENERIC) {
+		/* unmap resources for devices that use uio */
+		pci_uio_unmap_resource(dev);
+	} else {
+		RTE_LOG(DEBUG, EAL,
+			"  Not managed by a supported kernel driver, skipped\n");
+	}
+}
+
+/*
+ * Walk the physical memory layout up to the first segment with a NULL
+ * address and return the end address (addr + len) of the segment that has
+ * the highest virtual start address.
+ */
+void *
+pci_find_max_end_va(void)
+{
+	const struct rte_memseg *segs = rte_eal_get_physmem_layout();
+	const struct rte_memseg *highest = segs;
+	unsigned idx;
+
+	for (idx = 0; idx < RTE_MAX_MEMSEG && segs[idx].addr != NULL; idx++) {
+		if (segs[idx].addr > highest->addr)
+			highest = &segs[idx];
+	}
+
+	return RTE_PTR_ADD(highest->addr, highest->len);
+}
+
+/*
+ * Parse the "resource" sysfs file of a device.
+ *
+ * PCI_MAX_RESOURCE lines are read; each must hold three space-separated
+ * hexadecimal fields (start address, end address, flags). For entries that
+ * have IORESOURCE_MEM set, the start address and length (end - start + 1)
+ * are stored in dev->mem_resource[] and the mapped address is reset to NULL.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened, read or parsed.
+ */
+static int
+pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev)
+{
+	FILE *f;
+	char buf[BUFSIZ];
+	union pci_resource_info {
+		struct {
+			char *phys_addr;
+			char *end_addr;
+			char *flags;
+		};
+		char *ptrs[PCI_RESOURCE_FMT_NVAL];
+	} res_info;
+	uint64_t phys_addr, end_addr, flags;
+	int status = -1;
+	int i;
+
+	f = fopen(filename, "r");
+	if (f == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot open sysfs resource\n");
+		return -1;
+	}
+
+	for (i = 0; i < PCI_MAX_RESOURCE; i++) {
+		if (fgets(buf, sizeof(buf), f) == NULL) {
+			RTE_LOG(ERR, EAL,
+				"%s(): cannot read resource\n", __func__);
+			goto out;
+		}
+
+		if (rte_strsplit(buf, sizeof(buf), res_info.ptrs, 3, ' ') != 3) {
+			RTE_LOG(ERR, EAL,
+				"%s(): bad resource format\n", __func__);
+			goto out;
+		}
+
+		/* errno is the only way strtoull() reports a range error */
+		errno = 0;
+		phys_addr = strtoull(res_info.phys_addr, NULL, 16);
+		end_addr = strtoull(res_info.end_addr, NULL, 16);
+		flags = strtoull(res_info.flags, NULL, 16);
+		if (errno != 0) {
+			RTE_LOG(ERR, EAL,
+				"%s(): bad resource format\n", __func__);
+			goto out;
+		}
+
+		if (!(flags & IORESOURCE_MEM))
+			continue;
+
+		dev->mem_resource[i].phys_addr = phys_addr;
+		dev->mem_resource[i].len = end_addr - phys_addr + 1;
+		/* not mapped for now */
+		dev->mem_resource[i].addr = NULL;
+	}
+	status = 0;
+
+out:
+	fclose(f);
+	return status;
+}
+
+/*
+ * Scan one pci sysfs entry, and fill the devices list from it.
+ *
+ * A new rte_pci_device is allocated and populated from the sysfs files found
+ * under dirname (ids, max_vfs, numa_node, resource, driver). It is then
+ * inserted into pci_device_list, which is kept sorted by PCI address; if a
+ * device with the same address is already listed, that entry's kdrv, max_vfs
+ * and mem_resource are refreshed and the new allocation is released.
+ *
+ * Returns 0 on success, -1 on allocation or sysfs parsing failure.
+ */
+static int
+pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
+	     uint8_t devid, uint8_t function)
+{
+	char filename[PATH_MAX];
+	char driver[PATH_MAX];
+	unsigned long tmp;
+	struct rte_pci_device *dev;
+	struct rte_pci_device *cur;
+	int ret;
+
+	dev = malloc(sizeof(*dev));
+	if (dev == NULL)
+		return -1;
+
+	memset(dev, 0, sizeof(*dev));
+	dev->addr.domain = domain;
+	dev->addr.bus = bus;
+	dev->addr.devid = devid;
+	dev->addr.function = function;
+
+	/* get vendor id */
+	snprintf(filename, sizeof(filename), "%s/vendor", dirname);
+	if (eal_parse_sysfs_value(filename, &tmp) < 0)
+		goto fail;
+	dev->id.vendor_id = (uint16_t)tmp;
+
+	/* get device id */
+	snprintf(filename, sizeof(filename), "%s/device", dirname);
+	if (eal_parse_sysfs_value(filename, &tmp) < 0)
+		goto fail;
+	dev->id.device_id = (uint16_t)tmp;
+
+	/* get subsystem_vendor id */
+	snprintf(filename, sizeof(filename), "%s/subsystem_vendor",
+		 dirname);
+	if (eal_parse_sysfs_value(filename, &tmp) < 0)
+		goto fail;
+	dev->id.subsystem_vendor_id = (uint16_t)tmp;
+
+	/* get subsystem_device id */
+	snprintf(filename, sizeof(filename), "%s/subsystem_device",
+		 dirname);
+	if (eal_parse_sysfs_value(filename, &tmp) < 0)
+		goto fail;
+	dev->id.subsystem_device_id = (uint16_t)tmp;
+
+	/* get max_vfs, falling back to sriov_numvfs when it is unavailable */
+	dev->max_vfs = 0;
+	snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
+	if (!access(filename, F_OK) &&
+	    eal_parse_sysfs_value(filename, &tmp) == 0) {
+		dev->max_vfs = (uint16_t)tmp;
+	} else {
+		/* for non igb_uio driver, need kernel version >= 3.8 */
+		snprintf(filename, sizeof(filename),
+			 "%s/sriov_numvfs", dirname);
+		if (!access(filename, F_OK) &&
+		    eal_parse_sysfs_value(filename, &tmp) == 0)
+			dev->max_vfs = (uint16_t)tmp;
+	}
+
+	/* get numa node */
+	snprintf(filename, sizeof(filename), "%s/numa_node",
+		 dirname);
+	if (access(filename, R_OK) == 0) {
+		if (eal_parse_sysfs_value(filename, &tmp) < 0)
+			goto fail;
+		dev->numa_node = tmp;
+	} else {
+		/* if no NUMA support, set default to 0 */
+		dev->numa_node = 0;
+	}
+
+	/* parse resources */
+	snprintf(filename, sizeof(filename), "%s/resource", dirname);
+	if (pci_parse_sysfs_resource(filename, dev) < 0) {
+		RTE_LOG(ERR, EAL, "%s(): cannot parse resource\n", __func__);
+		goto fail;
+	}
+
+	/* parse driver */
+	snprintf(filename, sizeof(filename), "%s/driver", dirname);
+	ret = pci_get_kernel_driver_by_path(filename, driver);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Fail to get kernel driver\n");
+		goto fail;
+	}
+
+	if (ret != 0)
+		dev->kdrv = RTE_KDRV_NONE;
+	else if (strcmp(driver, "vfio-pci") == 0)
+		dev->kdrv = RTE_KDRV_VFIO;
+	else if (strcmp(driver, "igb_uio") == 0)
+		dev->kdrv = RTE_KDRV_IGB_UIO;
+	else if (strcmp(driver, "uio_pci_generic") == 0)
+		dev->kdrv = RTE_KDRV_UIO_GENERIC;
+	else
+		dev->kdrv = RTE_KDRV_UNKNOWN;
+
+	/*
+	 * device is valid, add in list (sorted): stop at the first entry
+	 * whose address is not lower than the new one. An empty list simply
+	 * skips the loop and the device is appended.
+	 */
+	TAILQ_FOREACH(cur, &pci_device_list, next) {
+		ret = rte_eal_compare_pci_addr(&dev->addr, &cur->addr);
+		if (ret > 0)
+			continue;
+
+		if (ret == 0) {
+			/* already registered */
+			cur->kdrv = dev->kdrv;
+			cur->max_vfs = dev->max_vfs;
+			memmove(cur->mem_resource, dev->mem_resource,
+				sizeof(dev->mem_resource));
+			free(dev);
+		} else {
+			TAILQ_INSERT_BEFORE(cur, dev, next);
+		}
+		return 0;
+	}
+
+	TAILQ_INSERT_TAIL(&pci_device_list, dev, next);
+	return 0;
+
+fail:
+	free(dev);
+	return -1;
+}
+
+/*
+ * Split a PCI address string ("domain:bus:devid.function") into its
+ * constituent parts. Domain, bus and devid are parsed as hexadecimal, the
+ * function number as decimal. The input is copied before being tokenised,
+ * so buf itself is left untouched.
+ *
+ * Returns 0 on success, -1 if the copy cannot be allocated or the string
+ * does not have the expected format.
+ */
+static int
+parse_pci_addr_format(const char *buf, int bufsize, uint16_t *domain,
+		uint8_t *bus, uint8_t *devid, uint8_t *function)
+{
+	/* first split on ':' */
+	union splitaddr {
+		struct {
+			char *domain;
+			char *bus;
+			char *devid;
+			char *function;
+		};
+		char *str[PCI_FMT_NVAL]; /* last element-separator is "." not ":" */
+	} splitaddr;
+	int status = -1;
+	char *work = strndup(buf, bufsize);
+
+	if (work == NULL)
+		return -1;
+
+	if (rte_strsplit(work, bufsize, splitaddr.str, PCI_FMT_NVAL, ':')
+			!= PCI_FMT_NVAL - 1)
+		goto out;
+
+	/* final split is on '.' between devid and function */
+	splitaddr.function = strchr(splitaddr.devid,'.');
+	if (splitaddr.function == NULL)
+		goto out;
+	*splitaddr.function++ = '\0';
+
+	/* now convert to int values */
+	errno = 0;
+	*domain = (uint16_t)strtoul(splitaddr.domain, NULL, 16);
+	*bus = (uint8_t)strtoul(splitaddr.bus, NULL, 16);
+	*devid = (uint8_t)strtoul(splitaddr.devid, NULL, 16);
+	*function = (uint8_t)strtoul(splitaddr.function, NULL, 10);
+	if (errno == 0)
+		status = 0;
+
+out:
+	free(work); /* release the copy made with strndup */
+	return status;
+}
+
+/*
+ * Scan the content of the PCI bus and register every device found under
+ * SYSFS_PCI_DEVICES in the devices list. Directory entries starting with
+ * '.' or whose name is not a valid PCI address are ignored.
+ *
+ * Returns 0 on success, -1 if the directory cannot be opened or a device
+ * cannot be scanned.
+ */
+int
+rte_eal_pci_scan(void)
+{
+	struct dirent *entry;
+	DIR *dir;
+	char dirname[PATH_MAX];
+	uint16_t domain;
+	uint8_t bus, devid, function;
+	int status = 0;
+
+	dir = opendir(SYSFS_PCI_DEVICES);
+	if (dir == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	while ((entry = readdir(dir)) != NULL) {
+		if (entry->d_name[0] == '.' ||
+		    parse_pci_addr_format(entry->d_name,
+				sizeof(entry->d_name), &domain,
+				&bus, &devid, &function) != 0)
+			continue;
+
+		snprintf(dirname, sizeof(dirname), "%s/%s", SYSFS_PCI_DEVICES,
+			 entry->d_name);
+		if (pci_scan_one(dirname, domain, bus, devid, function) < 0) {
+			status = -1;
+			break;
+		}
+	}
+
+	closedir(dir);
+	return status;
+}
+
+#ifdef RTE_PCI_CONFIG
+/*
+ * It is deprecated, all its configurations have been moved into
+ * each PMD respectively.
+ *
+ * Only compiled when RTE_PCI_CONFIG is defined. The device argument is
+ * ignored; the function does nothing except emit a debug log message.
+ */
+void
+pci_config_space_set(__rte_unused struct rte_pci_device *dev)
+{
+	RTE_LOG(DEBUG, EAL, "Nothing here, as it is deprecated\n");
+}
+#endif
+
+/*
+ * Read PCI config space.
+ *
+ * The request is forwarded to the UIO or VFIO backend selected by the
+ * device's interrupt handle type. Returns the backend's result, or -1
+ * (after logging) when the handle type is not one of the supported ones.
+ */
+int rte_eal_pci_read_config(const struct rte_pci_device *device,
+			    void *buf, size_t len, off_t offset)
+{
+	const struct rte_intr_handle *handle = &device->intr_handle;
+	int ret = -1;
+
+	switch (handle->type) {
+	case RTE_INTR_HANDLE_UIO:
+	case RTE_INTR_HANDLE_UIO_INTX:
+		ret = pci_uio_read_config(handle, buf, len, offset);
+		break;
+
+#ifdef VFIO_PRESENT
+	case RTE_INTR_HANDLE_VFIO_MSIX:
+	case RTE_INTR_HANDLE_VFIO_MSI:
+	case RTE_INTR_HANDLE_VFIO_LEGACY:
+		ret = pci_vfio_read_config(handle, buf, len, offset);
+		break;
+#endif
+	default:
+		RTE_LOG(ERR, EAL,
+			"Unknown handle type of fd %d\n",
+					handle->fd);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Write PCI config space.
+ *
+ * The request is forwarded to the UIO or VFIO backend selected by the
+ * device's interrupt handle type. Returns the backend's result, or -1
+ * (after logging) when the handle type is not one of the supported ones.
+ */
+int rte_eal_pci_write_config(const struct rte_pci_device *device,
+			     const void *buf, size_t len, off_t offset)
+{
+	const struct rte_intr_handle *handle = &device->intr_handle;
+	int ret = -1;
+
+	switch (handle->type) {
+	case RTE_INTR_HANDLE_UIO:
+	case RTE_INTR_HANDLE_UIO_INTX:
+		ret = pci_uio_write_config(handle, buf, len, offset);
+		break;
+
+#ifdef VFIO_PRESENT
+	case RTE_INTR_HANDLE_VFIO_MSIX:
+	case RTE_INTR_HANDLE_VFIO_MSI:
+	case RTE_INTR_HANDLE_VFIO_LEGACY:
+		ret = pci_vfio_write_config(handle, buf, len, offset);
+		break;
+#endif
+	default:
+		RTE_LOG(ERR, EAL,
+			"Unknown handle type of fd %d\n",
+					handle->fd);
+		break;
+	}
+
+	return ret;
+}
+
+#if defined(RTE_ARCH_X86)
+/*
+ * Find the I/O port range owned by this device by scanning /proc/ioports
+ * for a line whose owner field starts with the device's PCI address, and
+ * store the start of that range in p->base.
+ *
+ * Each line is expected to look like "<start>-<end> : <owner>". Lines with
+ * no ':' separator are skipped. The search stops at the first line whose
+ * owner matches; if the range on that line cannot be parsed the device is
+ * treated as not found.
+ *
+ * On success the device's interrupt handle type is set to
+ * RTE_INTR_HANDLE_UNKNOWN. The bar argument is unused.
+ *
+ * Returns 0 on success, -1 if /proc/ioports cannot be opened or no usable
+ * entry exists for the device.
+ */
+static int
+pci_ioport_map(struct rte_pci_device *dev, int bar __rte_unused,
+	       struct rte_pci_ioport *p)
+{
+	uint16_t start = 0, end = 0;
+	FILE *fp;
+	char *line = NULL;
+	char pci_id[16];
+	int found = 0;
+	size_t linesz = 0;
+
+	snprintf(pci_id, sizeof(pci_id), PCI_PRI_FMT,
+		 dev->addr.domain, dev->addr.bus,
+		 dev->addr.devid, dev->addr.function);
+
+	fp = fopen("/proc/ioports", "r");
+	if (fp == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): can't open ioports\n", __func__);
+		return -1;
+	}
+
+	while (getdelim(&line, &linesz, '\n', fp) > 0) {
+		char *ptr = line;
+		char *left;
+		int n;
+
+		n = strcspn(ptr, ":");
+		/* without a ':' there is no owner field past the terminator */
+		if (ptr[n] == '\0')
+			continue;
+		ptr[n] = 0;
+		left = &ptr[n + 1];
+
+		/* isspace() requires a value representable as unsigned char */
+		while (*left && isspace((unsigned char)*left))
+			left++;
+
+		if (!strncmp(left, pci_id, strlen(pci_id))) {
+			while (*ptr && isspace((unsigned char)*ptr))
+				ptr++;
+
+			/* only trust the range when both bounds were parsed */
+			if (sscanf(ptr, "%04hx-%04hx", &start, &end) == 2)
+				found = 1;
+
+			break;
+		}
+	}
+
+	free(line);
+	fclose(fp);
+
+	if (!found)
+		return -1;
+
+	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+	p->base = start;
+	RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%x\n", start);
+
+	return 0;
+}
+#endif
+
+/*
+ * Map the I/O port region of a device BAR, choosing the backend from the
+ * kernel driver the device is bound to. On x86, devices bound to
+ * uio_pci_generic or to no driver are resolved through pci_ioport_map();
+ * on other architectures uio_pci_generic uses the UIO backend and unbound
+ * devices are not supported.
+ *
+ * On success (return value 0) p->dev is set to dev. Returns -1 when no
+ * backend handles the device.
+ */
+int
+rte_eal_pci_ioport_map(struct rte_pci_device *dev, int bar,
+		       struct rte_pci_ioport *p)
+{
+	int status = -1;
+
+	switch (dev->kdrv) {
+#ifdef VFIO_PRESENT
+	case RTE_KDRV_VFIO:
+		if (pci_vfio_is_enabled())
+			status = pci_vfio_ioport_map(dev, bar, p);
+		break;
+#endif
+#if defined(RTE_ARCH_X86)
+	case RTE_KDRV_UIO_GENERIC:
+	case RTE_KDRV_NONE:
+		status = pci_ioport_map(dev, bar, p);
+		break;
+	case RTE_KDRV_IGB_UIO:
+		status = pci_uio_ioport_map(dev, bar, p);
+		break;
+#else
+	case RTE_KDRV_IGB_UIO:
+	case RTE_KDRV_UIO_GENERIC:
+		status = pci_uio_ioport_map(dev, bar, p);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	if (status == 0)
+		p->dev = dev;
+
+	return status;
+}
+
+/*
+ * Read len bytes at offset from a mapped I/O port region into data, using
+ * the backend that matches the kernel driver of the owning device. Devices
+ * bound to no driver are only handled on x86; any other driver type is
+ * silently ignored.
+ */
+void
+rte_eal_pci_ioport_read(struct rte_pci_ioport *p,
+			void *data, size_t len, off_t offset)
+{
+	switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+	case RTE_KDRV_VFIO:
+		pci_vfio_ioport_read(p, data, len, offset);
+		break;
+#endif
+#if defined(RTE_ARCH_X86)
+	case RTE_KDRV_NONE:
+#endif
+	case RTE_KDRV_IGB_UIO:
+	case RTE_KDRV_UIO_GENERIC:
+		pci_uio_ioport_read(p, data, len, offset);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Write len bytes from data at offset into a mapped I/O port region, using
+ * the backend that matches the kernel driver of the owning device. Devices
+ * bound to no driver are only handled on x86; any other driver type is
+ * silently ignored.
+ */
+void
+rte_eal_pci_ioport_write(struct rte_pci_ioport *p,
+			 const void *data, size_t len, off_t offset)
+{
+	switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+	case RTE_KDRV_VFIO:
+		pci_vfio_ioport_write(p, data, len, offset);
+		break;
+#endif
+#if defined(RTE_ARCH_X86)
+	case RTE_KDRV_NONE:
+#endif
+	case RTE_KDRV_IGB_UIO:
+	case RTE_KDRV_UIO_GENERIC:
+		pci_uio_ioport_write(p, data, len, offset);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Release an I/O port mapping, choosing the backend from the kernel driver
+ * of the owning device. On x86 there is nothing to undo for devices bound
+ * to uio_pci_generic or to no driver, so 0 is returned directly.
+ *
+ * Returns the backend's result, or -1 when no backend handles the device.
+ */
+int
+rte_eal_pci_ioport_unmap(struct rte_pci_ioport *p)
+{
+	int status = -1;
+
+	switch (p->dev->kdrv) {
+#ifdef VFIO_PRESENT
+	case RTE_KDRV_VFIO:
+		if (pci_vfio_is_enabled())
+			status = pci_vfio_ioport_unmap(p);
+		break;
+#endif
+#if defined(RTE_ARCH_X86)
+	case RTE_KDRV_UIO_GENERIC:
+	case RTE_KDRV_NONE:
+		status = 0;
+		break;
+	case RTE_KDRV_IGB_UIO:
+		status = pci_uio_ioport_unmap(p);
+		break;
+#else
+	case RTE_KDRV_IGB_UIO:
+	case RTE_KDRV_UIO_GENERIC:
+		status = pci_uio_ioport_unmap(p);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	return status;
+}
+
+/*
+ * Init the PCI EAL subsystem: reset the driver and device lists, scan the
+ * PCI bus unless PCI has been disabled, and (when VFIO is compiled in)
+ * enable VFIO.
+ *
+ * Returns 0 on success, -1 if the bus scan or the VFIO multi-process
+ * synchronisation setup fails.
+ */
+int
+rte_eal_pci_init(void)
+{
+	TAILQ_INIT(&pci_driver_list);
+	TAILQ_INIT(&pci_device_list);
+
+	/* for debug purposes, PCI can be disabled */
+	if (internal_config.no_pci)
+		return 0;
+
+	if (rte_eal_pci_scan() < 0) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot scan PCI bus\n", __func__);
+		return -1;
+	}
+
+#ifdef VFIO_PRESENT
+	pci_vfio_enable();
+
+	/*
+	 * if we are primary process, create a thread to communicate with
+	 * secondary processes. the thread will use a socket to wait for
+	 * requests from secondary process to send open file descriptors,
+	 * because VFIO does not allow multiple open descriptors on a group or
+	 * VFIO container.
+	 */
+	if (pci_vfio_is_enabled() &&
+			internal_config.process_type == RTE_PROC_PRIMARY &&
+			pci_vfio_mp_sync_setup() < 0)
+		return -1;
+#endif
+
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
new file mode 100644
index 00000000..7011753d
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
@@ -0,0 +1,127 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef EAL_PCI_INIT_H_
+#define EAL_PCI_INIT_H_
+
+#include "eal_vfio.h"
+
+/*
+ * Helper function to map PCI resources right after hugepages in virtual memory
+ */
+extern void *pci_map_addr;
+/*
+ * Returns the end address (addr + len) of the memory segment that has the
+ * highest virtual start address.
+ */
+void *pci_find_max_end_va(void);
+
+/*
+ * UIO resource bookkeeping and per-BAR mapping helpers.
+ * NOTE(review): semantics inferred from the names only; the implementations
+ * are not part of this header - confirm against eal_pci_uio.c.
+ */
+int pci_uio_alloc_resource(struct rte_pci_device *dev,
+		struct mapped_pci_resource **uio_res);
+void pci_uio_free_resource(struct rte_pci_device *dev,
+		struct mapped_pci_resource *uio_res);
+int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
+		struct mapped_pci_resource *uio_res, int map_idx);
+
+/*
+ * Config space access through UIO. Called by rte_eal_pci_read_config() and
+ * rte_eal_pci_write_config() for the UIO interrupt handle types.
+ */
+int pci_uio_read_config(const struct rte_intr_handle *intr_handle,
+			void *buf, size_t len, off_t offs);
+int pci_uio_write_config(const struct rte_intr_handle *intr_handle,
+			 const void *buf, size_t len, off_t offs);
+
+/*
+ * I/O port access through UIO. These are the UIO backends behind the
+ * rte_eal_pci_ioport_map/read/write/unmap() dispatchers.
+ */
+int pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
+		       struct rte_pci_ioport *p);
+void pci_uio_ioport_read(struct rte_pci_ioport *p,
+			 void *data, size_t len, off_t offset);
+void pci_uio_ioport_write(struct rte_pci_ioport *p,
+			  const void *data, size_t len, off_t offset);
+int pci_uio_ioport_unmap(struct rte_pci_ioport *p);
+
+#ifdef VFIO_PRESENT
+
+/* number of entries in vfio_config.vfio_groups[] */
+#define VFIO_MAX_GROUPS 64
+
+/*
+ * VFIO setup. rte_eal_pci_init() calls pci_vfio_enable(), then checks
+ * pci_vfio_is_enabled() and, in the primary process, calls
+ * pci_vfio_mp_sync_setup().
+ */
+int pci_vfio_enable(void);
+int pci_vfio_is_enabled(void);
+int pci_vfio_mp_sync_setup(void);
+
+/* access config space */
+int pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
+			 void *buf, size_t len, off_t offs);
+int pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
+			  const void *buf, size_t len, off_t offs);
+
+/*
+ * I/O port access through VFIO. These are the VFIO backends behind the
+ * rte_eal_pci_ioport_map/read/write/unmap() dispatchers.
+ */
+int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
+		        struct rte_pci_ioport *p);
+void pci_vfio_ioport_read(struct rte_pci_ioport *p,
+			  void *data, size_t len, off_t offset);
+void pci_vfio_ioport_write(struct rte_pci_ioport *p,
+			   const void *data, size_t len, off_t offset);
+int pci_vfio_ioport_unmap(struct rte_pci_ioport *p);
+
+/* map VFIO resource prototype */
+int pci_vfio_map_resource(struct rte_pci_device *dev);
+int pci_vfio_get_group_fd(int iommu_group_fd);
+int pci_vfio_get_container_fd(void);
+
+/*
+ * Function prototypes for VFIO multiprocess sync functions
+ */
+int vfio_mp_sync_send_request(int socket, int req);
+int vfio_mp_sync_receive_request(int socket);
+int vfio_mp_sync_send_fd(int socket, int fd);
+int vfio_mp_sync_receive_fd(int socket);
+int vfio_mp_sync_connect_to_primary(void);
+
+/*
+ * socket comm protocol definitions
+ * NOTE(review): judging by the names, the SOCKET_REQ_* values are request
+ * codes and SOCKET_OK / SOCKET_NO_FD / SOCKET_ERR are reply codes - confirm
+ * against the mp sync implementation.
+ */
+#define SOCKET_REQ_CONTAINER 0x100
+#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_OK 0x0
+#define SOCKET_NO_FD 0x1
+#define SOCKET_ERR 0xFF
+
+/*
+ * we don't need to store device fd's anywhere since they can be obtained from
+ * the group fd via an ioctl() call.
+ */
+struct vfio_group {
+	int group_no; /* presumably the IOMMU group number - TODO confirm */
+	int fd;       /* group fd (see the comment above) */
+};
+
+/*
+ * VFIO state.
+ * NOTE(review): the field meanings below are inferred from their names;
+ * verify against the VFIO implementation before relying on them.
+ */
+struct vfio_config {
+	int vfio_enabled;           /* presumably non-zero once VFIO is usable */
+	int vfio_container_fd;      /* presumably the VFIO container fd */
+	int vfio_container_has_dma; /* presumably set once DMA mapping is done */
+	int vfio_group_idx;         /* presumably next free slot in vfio_groups */
+	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
+};
+
+#endif
+
+#endif /* EAL_PCI_INIT_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
new file mode 100644
index 00000000..068694dc
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
@@ -0,0 +1,491 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <linux/pci_regs.h>
+
+#if defined(RTE_ARCH_X86)
+#include <sys/io.h>
+#endif
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_common.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+
+void *pci_map_addr = NULL;
+
+#define OFF_MAX              ((uint64_t)(off_t)-1)
+
+/*
+ * Read len bytes of PCI config space, starting at offset, into buf using
+ * the UIO config descriptor kept in the interrupt handle.
+ * The pread() result is returned as an int.
+ */
+int
+pci_uio_read_config(const struct rte_intr_handle *intr_handle,
+		    void *buf, size_t len, off_t offset)
+{
+	const int cfg_fd = intr_handle->uio_cfg_fd;
+	int nread;
+
+	nread = pread(cfg_fd, buf, len, offset);
+	return nread;
+}
+
+/*
+ * Write len bytes from buf into PCI config space at offset using the UIO
+ * config descriptor kept in the interrupt handle.
+ * The pwrite() result is returned as an int.
+ */
+int
+pci_uio_write_config(const struct rte_intr_handle *intr_handle,
+		     const void *buf, size_t len, off_t offset)
+{
+	const int cfg_fd = intr_handle->uio_cfg_fd;
+	int nwritten;
+
+	nwritten = pwrite(cfg_fd, buf, len, offset);
+	return nwritten;
+}
+
+/*
+ * Make sure the bus-master bit is set in the PCI command register reachable
+ * through dev_fd. Returns 0 on success (also when the bit was already set)
+ * and -1 when the register cannot be read or written in full.
+ */
+static int
+pci_uio_set_bus_master(int dev_fd)
+{
+	uint16_t cmd;
+	int nbytes;
+
+	nbytes = pread(dev_fd, &cmd, sizeof(cmd), PCI_COMMAND);
+	if (nbytes != sizeof(cmd)) {
+		RTE_LOG(ERR, EAL,
+			"Cannot read command from PCI config space!\n");
+		return -1;
+	}
+
+	/* only touch the register when bus mastering is still off */
+	if ((cmd & PCI_COMMAND_MASTER) == 0) {
+		cmd |= PCI_COMMAND_MASTER;
+
+		nbytes = pwrite(dev_fd, &cmd, sizeof(cmd), PCI_COMMAND);
+		if (nbytes != sizeof(cmd)) {
+			RTE_LOG(ERR, EAL,
+				"Cannot write command to PCI config space!\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Create the /dev/uioX character device for a UIO instance.
+ *
+ * sysfs_uio_path: sysfs directory of the uio instance; its "dev" file holds
+ *                 the "major:minor" pair of the device node
+ * uio_num:        number X used to build the /dev/uioX name
+ *
+ * Returns 0 on success, -1 on failure (an error is logged).
+ */
+static int
+pci_mknod_uio_dev(const char *sysfs_uio_path, unsigned uio_num)
+{
+	FILE *f;
+	char filename[PATH_MAX];
+	int ret;
+	unsigned major, minor;
+	dev_t dev;
+
+	/* get the name of the sysfs file that contains the major and minor
+	 * of the uio device and read its content */
+	snprintf(filename, sizeof(filename), "%s/dev", sysfs_uio_path);
+
+	f = fopen(filename, "r");
+	if (f == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n",
+			__func__);
+		return -1;
+	}
+
+	ret = fscanf(f, "%u:%u", &major, &minor);
+	if (ret != 2) {
+		RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n",
+			__func__);
+		fclose(f);
+		return -1;
+	}
+	fclose(f);
+
+	/* create the char device "mknod /dev/uioX c major minor" */
+	snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num);
+	dev = makedev(major, minor);
+	ret = mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, dev);
+	/* test mknod()'s own result: 'f' is already closed here and says
+	 * nothing about whether the node was created */
+	if (ret != 0) {
+		RTE_LOG(ERR, EAL, "%s(): mknod() failed %s\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	return ret;
+}
+
+/*
+ * Return the uioX char device used for a pci device. On success, return
+ * the UIO number and fill dstbuf string with the path of the device in
+ * sysfs. On error, return a negative value. In this case dstbuf is
+ * invalid.
+ */
+static int
+pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
+			   unsigned int buflen, int create)
+{
+	/* create: when non-zero (and internal_config.create_uio_dev is set)
+	 * the /dev/uioX node is also created for the device that was found */
+	struct rte_pci_addr *loc = &dev->addr;
+	unsigned int uio_num;
+	struct dirent *e;
+	DIR *dir;
+	char dirname[PATH_MAX];
+
+	/* depending on kernel version, uio can be located in uio/uioX
+	 * or uio:uioX */
+
+	snprintf(dirname, sizeof(dirname),
+			SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/uio",
+			loc->domain, loc->bus, loc->devid, loc->function);
+
+	dir = opendir(dirname);
+	if (dir == NULL) {
+		/* retry with the parent directory */
+		snprintf(dirname, sizeof(dirname),
+				SYSFS_PCI_DEVICES "/" PCI_PRI_FMT,
+				loc->domain, loc->bus, loc->devid, loc->function);
+		dir = opendir(dirname);
+
+		if (dir == NULL) {
+			RTE_LOG(ERR, EAL, "Cannot opendir %s\n", dirname);
+			return -1;
+		}
+	}
+
+	/* take the first file starting with "uio" */
+	while ((e = readdir(dir)) != NULL) {
+		/* format could be uio%d ...*/
+		int shortprefix_len = sizeof("uio") - 1;
+		/* ... or uio:uio%d */
+		int longprefix_len = sizeof("uio:uio") - 1;
+		char *endptr;
+
+		if (strncmp(e->d_name, "uio", 3) != 0)
+			continue;
+
+		/* first try uio%d */
+		/* a conversion counts only if errno stayed 0 and at least one
+		 * character was consumed (endptr moved past the prefix) */
+		errno = 0;
+		uio_num = strtoull(e->d_name + shortprefix_len, &endptr, 10);
+		if (errno == 0 && endptr != (e->d_name + shortprefix_len)) {
+			snprintf(dstbuf, buflen, "%s/uio%u", dirname, uio_num);
+			break;
+		}
+
+		/* then try uio:uio%d */
+		/* NOTE(review): only the 3-character "uio" prefix was verified
+		 * above; this offset is used without checking that the name
+		 * really starts with "uio:uio" or is that long */
+		errno = 0;
+		uio_num = strtoull(e->d_name + longprefix_len, &endptr, 10);
+		if (errno == 0 && endptr != (e->d_name + longprefix_len)) {
+			snprintf(dstbuf, buflen, "%s/uio:uio%u", dirname, uio_num);
+			break;
+		}
+	}
+	closedir(dir);
+
+	/* No uio resource found */
+	/* (e is non-NULL only when the loop above was left through a break) */
+	if (e == NULL)
+		return -1;
+
+	/* create uio device if we've been asked to */
+	/* a failure here only produces a warning; the number is still returned */
+	if (internal_config.create_uio_dev && create &&
+			pci_mknod_uio_dev(dstbuf, uio_num) < 0)
+		RTE_LOG(WARNING, EAL, "Cannot create /dev/uio%u\n", uio_num);
+
+	return uio_num;
+}
+
+/*
+ * Free a UIO mapping record and close the descriptors held in the device's
+ * interrupt handle.
+ *
+ * dev:     device whose interrupt handle owns the UIO config and interrupt
+ *          descriptors; closed descriptors are reset to -1
+ * uio_res: mapping record, handed straight to rte_free()
+ */
+void
+pci_uio_free_resource(struct rte_pci_device *dev,
+		struct mapped_pci_resource *uio_res)
+{
+	rte_free(uio_res);
+
+	if (dev->intr_handle.uio_cfg_fd >= 0) {
+		close(dev->intr_handle.uio_cfg_fd);
+		dev->intr_handle.uio_cfg_fd = -1;
+	}
+	/* a descriptor is open when it is >= 0; a plain truth test would
+	 * call close(-1) on an unopened handle and would never close fd 0 */
+	if (dev->intr_handle.fd >= 0) {
+		close(dev->intr_handle.fd);
+		dev->intr_handle.fd = -1;
+		dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+	}
+}
+
+/*
+ * Open the UIO device and its config-space file for a PCI device and
+ * allocate the record that describes its mappings.
+ *
+ * dev:     device to set up; its interrupt handle receives the descriptors
+ *          and the handle type
+ * uio_res: on success *uio_res points to a zeroed, newly allocated record
+ *          holding the /dev/uioX path and the PCI address
+ *
+ * Returns 0 on success, 1 when the device is not managed by a UIO driver,
+ * -1 on error (descriptors opened so far are released again).
+ */
+int
+pci_uio_alloc_resource(struct rte_pci_device *dev,
+		struct mapped_pci_resource **uio_res)
+{
+	char dirname[PATH_MAX];
+	char cfgname[PATH_MAX];
+	char devname[PATH_MAX]; /* contains the /dev/uioX */
+	int uio_num;
+	struct rte_pci_addr *loc;
+
+	loc = &dev->addr;
+
+	/* find uio resource */
+	uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1);
+	if (uio_num < 0) {
+		RTE_LOG(WARNING, EAL, "  "PCI_PRI_FMT" not managed by UIO driver, "
+				"skipping\n", loc->domain, loc->bus, loc->devid, loc->function);
+		return 1;
+	}
+	snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);
+
+	/* the config descriptor is not open yet: mark it so, otherwise the
+	 * error path below would close whatever value the field happens to
+	 * hold (fd 0 on a zero-initialised handle) */
+	dev->intr_handle.uio_cfg_fd = -1;
+
+	/* save fd if in primary process */
+	dev->intr_handle.fd = open(devname, O_RDWR);
+	if (dev->intr_handle.fd < 0) {
+		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+			devname, strerror(errno));
+		goto error;
+	}
+
+	snprintf(cfgname, sizeof(cfgname),
+			"/sys/class/uio/uio%u/device/config", uio_num);
+	dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
+	if (dev->intr_handle.uio_cfg_fd < 0) {
+		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+			cfgname, strerror(errno));
+		goto error;
+	}
+
+	if (dev->kdrv == RTE_KDRV_IGB_UIO)
+		dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
+	else {
+		dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX;
+
+		/* set bus master that is not done by uio_pci_generic */
+		if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
+			RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
+			goto error;
+		}
+	}
+
+	/* allocate the mapping details for secondary processes*/
+	*uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0);
+	if (*uio_res == NULL) {
+		RTE_LOG(ERR, EAL,
+			"%s(): cannot store uio mmap details\n", __func__);
+		goto error;
+	}
+
+	snprintf((*uio_res)->path, sizeof((*uio_res)->path), "%s", devname);
+	memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr));
+
+	return 0;
+
+error:
+	/* NOTE: *uio_res is passed as the caller left it when the failure
+	 * happened before the allocation above */
+	pci_uio_free_resource(dev, *uio_res);
+	return -1;
+}
+
+/*
+ * Map one memory resource (BAR) of a PCI device through its sysfs
+ * "resourceN" file and record the mapping.
+ *
+ * dev:     device owning the resource
+ * res_idx: index into dev->mem_resource[] and N of the sysfs resource file
+ * uio_res: record whose maps[] array receives the mapping details
+ * map_idx: slot of uio_res->maps[] to fill
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
+		struct mapped_pci_resource *uio_res, int map_idx)
+{
+	int fd;
+	char devname[PATH_MAX]; /* sysfs path of the resourceN file */
+	void *mapaddr;
+	struct rte_pci_addr *loc;
+	struct pci_map *maps;
+
+	loc = &dev->addr;
+	maps = uio_res->maps;
+
+	/* update devname for mmap  */
+	snprintf(devname, sizeof(devname),
+			SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource%d",
+			loc->domain, loc->bus, loc->devid,
+			loc->function, res_idx);
+
+	/* allocate memory to keep path */
+	maps[map_idx].path = rte_malloc(NULL, strlen(devname) + 1, 0);
+	if (maps[map_idx].path == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot allocate memory for path: %s\n",
+				strerror(errno));
+		return -1;
+	}
+
+	/*
+	 * open resource file, to mmap it
+	 */
+	fd = open(devname, O_RDWR);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+				devname, strerror(errno));
+		goto error;
+	}
+
+	/* try mapping somewhere close to the end of hugepages */
+	if (pci_map_addr == NULL)
+		pci_map_addr = pci_find_max_end_va();
+
+	mapaddr = pci_map_resource(pci_map_addr, fd, 0,
+			(size_t)dev->mem_resource[res_idx].len, 0);
+	/* the descriptor is closed whether or not the mapping succeeded */
+	close(fd);
+	if (mapaddr == MAP_FAILED)
+		goto error;
+
+	/* the next mapping request starts right after this one */
+	pci_map_addr = RTE_PTR_ADD(mapaddr,
+			(size_t)dev->mem_resource[res_idx].len);
+
+	maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
+	maps[map_idx].size = dev->mem_resource[res_idx].len;
+	maps[map_idx].addr = mapaddr;
+	maps[map_idx].offset = 0;
+	/* path was allocated above with room for devname and its terminator */
+	strcpy(maps[map_idx].path, devname);
+	dev->mem_resource[res_idx].addr = mapaddr;
+
+	return 0;
+
+error:
+	/* NOTE(review): maps[map_idx].path keeps the freed pointer value */
+	rte_free(maps[map_idx].path);
+	return -1;
+}
+
+/*
+ * Look up the I/O port base of a BAR through the UIO sysfs "portio" entry
+ * and store it in p->base (x86 only).
+ *
+ * dev: device whose UIO instance is queried
+ * bar: number used to build the "portio/port<bar>/start" sysfs name
+ * p:   receives the port base on success
+ *
+ * Returns 0 on success, -1 on error or on non-x86 builds.
+ */
+int
+pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
+		   struct rte_pci_ioport *p)
+{
+#if defined(RTE_ARCH_X86)
+	char dirname[PATH_MAX];
+	char filename[PATH_MAX];
+	int uio_num;
+	unsigned long start;
+
+	/* last argument 0: do not create the /dev/uioX node here */
+	uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 0);
+	if (uio_num < 0)
+		return -1;
+
+	/* get portio start */
+	snprintf(filename, sizeof(filename),
+		 "%s/portio/port%d/start", dirname, bar);
+	if (eal_parse_sysfs_value(filename, &start) < 0) {
+		RTE_LOG(ERR, EAL, "%s(): cannot parse portio start\n",
+			__func__);
+		return -1;
+	}
+	/* ensure we don't get anything funny here, read/write will cast to
+	 * uint16_t */
+	if (start > UINT16_MAX)
+		return -1;
+
+	/* FIXME only for primary process ? */
+	/* open the UIO device only when the interrupt handle has no type yet */
+	if (dev->intr_handle.type == RTE_INTR_HANDLE_UNKNOWN) {
+
+		snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num);
+		dev->intr_handle.fd = open(filename, O_RDWR);
+		if (dev->intr_handle.fd < 0) {
+			RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+				filename, strerror(errno));
+			return -1;
+		}
+		dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
+	}
+
+	RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%lx\n", start);
+
+	p->base = start;
+	return 0;
+#else
+	RTE_SET_USED(dev);
+	RTE_SET_USED(bar);
+	RTE_SET_USED(p);
+	return -1;
+#endif
+}
+
+/*
+ * Read len bytes from the I/O port range starting at p->base + offset into
+ * data (x86 only; does nothing elsewhere). The transfer uses the widest
+ * access that still fits the remaining length: 4, then 2, then 1 byte.
+ */
+void
+pci_uio_ioport_read(struct rte_pci_ioport *p,
+		    void *data, size_t len, off_t offset)
+{
+#if defined(RTE_ARCH_X86)
+	uint8_t *dst = data;
+	unsigned short port = p->base + offset;
+	int chunk;
+
+	while (len > 0) {
+		if (len >= 4) {
+			*(uint32_t *)dst = inl(port);
+			chunk = 4;
+		} else if (len >= 2) {
+			*(uint16_t *)dst = inw(port);
+			chunk = 2;
+		} else {
+			*dst = inb(port);
+			chunk = 1;
+		}
+
+		/* advance buffer, port and remaining length together */
+		dst += chunk;
+		port += chunk;
+		len -= chunk;
+	}
+#else
+	RTE_SET_USED(p);
+	RTE_SET_USED(data);
+	RTE_SET_USED(len);
+	RTE_SET_USED(offset);
+#endif
+}
+
+/*
+ * Write len bytes from data to the I/O port range starting at
+ * p->base + offset (x86 only; does nothing elsewhere). The transfer uses
+ * the widest access that still fits the remaining length: 4, 2, then 1.
+ */
+void
+pci_uio_ioport_write(struct rte_pci_ioport *p,
+		     const void *data, size_t len, off_t offset)
+{
+#if defined(RTE_ARCH_X86)
+	const uint8_t *src = data;
+	unsigned short port = p->base + offset;
+	int chunk;
+
+	while (len > 0) {
+		if (len >= 4) {
+			outl_p(*(const uint32_t *)src, port);
+			chunk = 4;
+		} else if (len >= 2) {
+			outw_p(*(const uint16_t *)src, port);
+			chunk = 2;
+		} else {
+			outb_p(*src, port);
+			chunk = 1;
+		}
+
+		/* advance buffer, port and remaining length together */
+		src += chunk;
+		port += chunk;
+		len -= chunk;
+	}
+#else
+	RTE_SET_USED(p);
+	RTE_SET_USED(data);
+	RTE_SET_USED(len);
+	RTE_SET_USED(offset);
+#endif
+}
+
+/*
+ * Counterpart of pci_uio_ioport_map(); nothing is released here.
+ * Returns 0 on x86 builds and -1 on every other architecture.
+ */
+int
+pci_uio_ioport_unmap(struct rte_pci_ioport *p)
+{
+#if defined(RTE_ARCH_X86)
+	/* FIXME close intr fd ? */
+	const int status = 0;
+#else
+	const int status = -1;
+#endif
+
+	RTE_SET_USED(p);
+	return status;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
new file mode 100644
index 00000000..10266f8f
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -0,0 +1,1097 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <linux/pci_regs.h>
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+#include <eal_private.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+#include "eal_vfio.h"
+
+/**
+ * @file
+ * PCI probing under linux (VFIO version)
+ *
+ * This code tries to determine if the PCI device is bound to VFIO driver,
+ * and initialize it (map BARs, set up interrupts) if that's the case.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+
+#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
+#define PAGE_MASK   (~(PAGE_SIZE - 1))
+
+static struct rte_tailq_elem rte_vfio_tailq = {
+	.name = "VFIO_RESOURCE_LIST",
+};
+EAL_REGISTER_TAILQ(rte_vfio_tailq)
+
+#define VFIO_DIR "/dev/vfio"
+#define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
+#define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
+/* convert between a VFIO region index and the device-fd file offset of that
+ * region: the index sits in the bits from bit 40 upwards. The argument is
+ * parenthesized so that expressions such as (a | b) expand correctly. */
+#define VFIO_GET_REGION_ADDR(x) ((uint64_t)(x) << 40ULL)
+#define VFIO_GET_REGION_IDX(x) ((x) >> 40)
+
+/* per-process VFIO config */
+static struct vfio_config vfio_cfg;
+
+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ * */
+typedef int (*vfio_dma_func_t)(int);
+
+/* description of one IOMMU type this file knows how to use */
+struct vfio_iommu_type {
+	int type_id; /* value passed to the VFIO_SET_IOMMU / VFIO_CHECK_EXTENSION ioctls */
+	const char *name; /* human-readable name, used in log messages */
+	vfio_dma_func_t dma_map_func; /* DMA mapping routine for this type */
+};
+
+static int vfio_type1_dma_map(int);
+static int vfio_noiommu_dma_map(int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+	/* x86 IOMMU, otherwise known as type 1 */
+	{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+	/* IOMMU-less mode */
+	{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+};
+
+/*
+ * DMA mapping routine for the type 1 IOMMU: registers every memory segment
+ * returned by rte_eal_get_physmem_layout() with the container, using the
+ * segment's physical address as IOVA. The walk stops at the first segment
+ * whose addr is NULL.
+ * Returns 0 on success, -1 as soon as one mapping request fails.
+ */
+int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+	const struct rte_memseg *segs = rte_eal_get_physmem_layout();
+	int idx;
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	for (idx = 0; idx < RTE_MAX_MEMSEG && segs[idx].addr != NULL; idx++) {
+		struct vfio_iommu_type1_dma_map req;
+
+		memset(&req, 0, sizeof(req));
+		req.argsz = sizeof(req);
+		req.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+		req.vaddr = segs[idx].addr_64;
+		req.iova = segs[idx].phys_addr;
+		req.size = segs[idx].len;
+
+		if (ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &req) == 0)
+			continue;
+
+		RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+				"error %i (%s)\n", errno, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * DMA mapping routine for the No-IOMMU entry of iommu_types[]: the
+ * container descriptor is ignored and 0 (success) is always returned.
+ */
+int
+vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
+/*
+ * Read len bytes of PCI config space, starting at offs, into buf through
+ * the VFIO device descriptor kept in the interrupt handle. The access goes
+ * to the config region of the device fd. Returns the pread64() result.
+ */
+int
+pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
+		    void *buf, size_t len, off_t offs)
+{
+	const uint64_t cfg_base =
+		VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX);
+
+	return pread64(intr_handle->vfio_dev_fd, buf, len, cfg_base + offs);
+}
+
+/*
+ * Write len bytes from buf into PCI config space at offs through the VFIO
+ * device descriptor kept in the interrupt handle. The access goes to the
+ * config region of the device fd. Returns the pwrite64() result.
+ */
+int
+pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
+		    const void *buf, size_t len, off_t offs)
+{
+	const uint64_t cfg_base =
+		VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX);
+
+	return pwrite64(intr_handle->vfio_dev_fd, buf, len, cfg_base + offs);
+}
+
+/* get PCI BAR number where MSI-X interrupts are
+ *
+ * Walks the PCI capability list through the config region of fd. When the
+ * MSI-X capability is found, *msix_bar, *msix_table_offset and
+ * *msix_table_size are filled in. Returns -1 when a config read fails and 0
+ * otherwise - including when the list ends without an MSI-X capability, in
+ * which case the three outputs are left untouched.
+ */
+static int
+pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
+		      uint32_t *msix_table_size)
+{
+	int ret;
+	uint32_t reg;
+	uint16_t flags;
+	uint8_t cap_id, cap_offset;
+
+	/* read PCI capability pointer from config space */
+	ret = pread64(fd, &reg, sizeof(reg),
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+			PCI_CAPABILITY_LIST);
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+				"config space!\n");
+		return -1;
+	}
+
+	/* we need first byte */
+	cap_offset = reg & 0xFF;
+
+	/* a zero offset ends the walk */
+	while (cap_offset) {
+
+		/* read PCI capability ID */
+		ret = pread64(fd, &reg, sizeof(reg),
+				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+				cap_offset);
+		if (ret != sizeof(reg)) {
+			RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
+					"config space!\n");
+			return -1;
+		}
+
+		/* we need first byte */
+		cap_id = reg & 0xFF;
+
+		/* if we haven't reached MSI-X, check next capability */
+		if (cap_id != PCI_CAP_ID_MSIX) {
+			/* same dword as above is read again; its second byte
+			 * is the offset of the next capability */
+			ret = pread64(fd, &reg, sizeof(reg),
+					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+					cap_offset);
+			if (ret != sizeof(reg)) {
+				RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+						"config space!\n");
+				return -1;
+			}
+
+			/* we need second byte */
+			cap_offset = (reg & 0xFF00) >> 8;
+
+			continue;
+		}
+		/* else, read table offset */
+		else {
+			/* table offset resides in the next 4 bytes */
+			ret = pread64(fd, &reg, sizeof(reg),
+					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+					cap_offset + 4);
+			if (ret != sizeof(reg)) {
+				RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
+						"space!\n");
+				return -1;
+			}
+
+			/* the 16-bit flags word sits 2 bytes into the capability */
+			ret = pread64(fd, &flags, sizeof(flags),
+					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+					cap_offset + 2);
+			if (ret != sizeof(flags)) {
+				RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
+						"space!\n");
+				return -1;
+			}
+
+			*msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
+			*msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
+			/* 16 bytes for each of (masked flags value + 1) entries */
+			*msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
+
+			return 0;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Set the bus-master bit in the PCI command register of a VFIO device.
+ * The register is accessed through the config region of dev_fd and is
+ * written back unconditionally after the bit has been set.
+ * Returns 0 on success, -1 when the register cannot be read or written.
+ */
+static int
+pci_vfio_set_bus_master(int dev_fd)
+{
+	const uint64_t cmd_off =
+		VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + PCI_COMMAND;
+	uint16_t cmd;
+	int nbytes;
+
+	nbytes = pread64(dev_fd, &cmd, sizeof(cmd), cmd_off);
+	if (nbytes != sizeof(cmd)) {
+		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
+		return -1;
+	}
+
+	/* turn bus mastering on */
+	cmd |= PCI_COMMAND_MASTER;
+
+	nbytes = pwrite64(dev_fd, &cmd, sizeof(cmd), cmd_off);
+	if (nbytes != sizeof(cmd)) {
+		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Try each entry of iommu_types[] in order and select the first one the
+ * container accepts through VFIO_SET_IOMMU.
+ * Returns a pointer to the accepted entry, or NULL when none was accepted.
+ */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd)
+{
+	const struct vfio_iommu_type *t;
+
+	for (t = iommu_types; t < iommu_types + RTE_DIM(iommu_types); t++) {
+		if (ioctl(vfio_container_fd, VFIO_SET_IOMMU, t->type_id) != 0) {
+			/* not an error, there may be more supported IOMMU types */
+			RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
+					"error %i (%s)\n", t->type_id, t->name, errno,
+					strerror(errno));
+			continue;
+		}
+
+		RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+				t->type_id, t->name);
+		return t;
+	}
+
+	/* no entry was accepted */
+	return NULL;
+}
+
+/*
+ * Ask the container, through VFIO_CHECK_EXTENSION, about every entry of
+ * iommu_types[].
+ * Returns 0 when at least one type is reported as supported. Returns -1
+ * when a query fails or no type is supported; in both failure cases the
+ * container descriptor is closed here.
+ */
+static int
+pci_vfio_has_supported_extensions(int vfio_container_fd)
+{
+	const struct vfio_iommu_type *t;
+	int any_supported = 0;
+
+	for (t = iommu_types; t < iommu_types + RTE_DIM(iommu_types); t++) {
+		const int rc = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+				t->type_id);
+
+		if (rc < 0) {
+			RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
+				"error %i (%s)\n", errno,
+				strerror(errno));
+			close(vfio_container_fd);
+			return -1;
+		}
+
+		/* only an answer of exactly 1 counts as a supported extension */
+		if (rc == 1)
+			any_supported = 1;
+
+		RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
+				t->type_id, t->name,
+				rc ? "supported" : "not supported");
+	}
+
+	if (any_supported)
+		return 0;
+
+	/* nothing usable was reported */
+	close(vfio_container_fd);
+	return -1;
+}
+
+/* set up interrupt support (but not enable interrupts)
+ *
+ * Picks an interrupt type for the device, creates an eventfd for it and
+ * stores the eventfd, vfio_dev_fd and the matching handle type in
+ * dev->intr_handle. The type that was picked is also written back to
+ * internal_config.vfio_intr_mode.
+ * Returns 0 on success, -1 on error or when no usable type was found.
+ */
+static int
+pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
+{
+	int i, ret, intr_idx;
+
+	/* default to invalid index */
+	intr_idx = VFIO_PCI_NUM_IRQS;
+
+	/* get interrupt type from internal config (MSI-X by default, can be
+	 * overridden from the command line)
+	 */
+	switch (internal_config.vfio_intr_mode) {
+	case RTE_INTR_MODE_MSIX:
+		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
+		break;
+	case RTE_INTR_MODE_MSI:
+		intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
+		break;
+	case RTE_INTR_MODE_LEGACY:
+		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
+		break;
+	/* don't do anything if we want to automatically determine interrupt type */
+	case RTE_INTR_MODE_NONE:
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "  unknown default interrupt type!\n");
+		return -1;
+	}
+
+	/* start from MSI-X interrupt type */
+	/* (the index counts down to 0; the first usable one wins) */
+	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
+		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+		int fd = -1;
+
+		/* skip interrupt modes we don't want */
+		if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE &&
+				i != intr_idx)
+			continue;
+
+		irq.index = i;
+
+		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "  cannot get IRQ info, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+
+		/* if this vector cannot be used with eventfd, fail if we explicitly
+		 * specified interrupt type, otherwise continue */
+		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
+			if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE) {
+				RTE_LOG(ERR, EAL,
+						"  interrupt vector does not support eventfd!\n");
+				return -1;
+			} else
+				continue;
+		}
+
+		/* set up an eventfd for interrupts */
+		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+
+		dev->intr_handle.fd = fd;
+		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+
+		/* record the chosen type in the handle and in the global config */
+		switch (i) {
+		case VFIO_PCI_MSIX_IRQ_INDEX:
+			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
+			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+			break;
+		case VFIO_PCI_MSI_IRQ_INDEX:
+			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI;
+			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
+			break;
+		case VFIO_PCI_INTX_IRQ_INDEX:
+			internal_config.vfio_intr_mode = RTE_INTR_MODE_LEGACY;
+			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
+			break;
+		default:
+			/* NOTE(review): the eventfd created above is not closed
+			 * on this path */
+			RTE_LOG(ERR, EAL, "  unknown interrupt type!\n");
+			return -1;
+		}
+
+		return 0;
+	}
+
+	/* if we're here, we haven't found a suitable interrupt vector */
+	return -1;
+}
+
+/* open container fd or get an existing one
+ *
+ * A primary process opens the VFIO container itself and checks the API
+ * version and the supported IOMMU extensions; any other process asks the
+ * primary for the descriptor over the sync socket.
+ * Returns the container descriptor, or -1 on error.
+ */
+int
+pci_vfio_get_container_fd(void)
+{
+	int ret, vfio_container_fd;
+
+	/* if we're in a primary process, try to open the container */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
+		if (vfio_container_fd < 0) {
+			RTE_LOG(ERR, EAL, "  cannot open VFIO container, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+
+		/* check VFIO API version */
+		ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
+		if (ret != VFIO_API_VERSION) {
+			if (ret < 0)
+				RTE_LOG(ERR, EAL, "  could not get VFIO API version, "
+						"error %i (%s)\n", errno, strerror(errno));
+			else
+				RTE_LOG(ERR, EAL, "  unsupported VFIO API version!\n");
+			close(vfio_container_fd);
+			return -1;
+		}
+
+		/* no close() on failure here: the helper closes the container
+		 * descriptor itself before it returns an error */
+		ret = pci_vfio_has_supported_extensions(vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  no supported IOMMU "
+					"extensions found!\n");
+			return -1;
+		}
+
+		return vfio_container_fd;
+	} else {
+		/*
+		 * if we're in a secondary process, request container fd from the
+		 * primary process via our socket
+		 */
+		int socket_fd;
+
+		socket_fd = vfio_mp_sync_connect_to_primary();
+		if (socket_fd < 0) {
+			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+			return -1;
+		}
+		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
+		if (vfio_container_fd < 0) {
+			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+		/* the socket is only needed for this one exchange */
+		close(socket_fd);
+		return vfio_container_fd;
+	}
+
+	/* not reached: both branches above return */
+	return -1;
+}
+
+/* open group fd or get an existing one
+ *
+ * iommu_group_no: IOMMU group number to look up
+ *
+ * A group already recorded in vfio_cfg is answered from there. Otherwise a
+ * primary process opens the group file (regular name first, then the
+ * no-IOMMU name) and any other process asks the primary over the sync
+ * socket.
+ * Returns the group descriptor, 0 when no group file exists for this
+ * number, or -1 on error.
+ */
+int
+pci_vfio_get_group_fd(int iommu_group_no)
+{
+	int i;
+	int vfio_group_fd;
+	char filename[PATH_MAX];
+
+	/* check if we already have the group descriptor open */
+	for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
+		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
+			return vfio_cfg.vfio_groups[i].fd;
+
+	/* if primary, try to open the group */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		/* try regular group format */
+		snprintf(filename, sizeof(filename),
+				 VFIO_GROUP_FMT, iommu_group_no);
+		vfio_group_fd = open(filename, O_RDWR);
+		if (vfio_group_fd < 0) {
+			/* if file not found, it's not an error */
+			if (errno != ENOENT) {
+				RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+						strerror(errno));
+				return -1;
+			}
+
+			/* special case: try no-IOMMU path as well */
+			snprintf(filename, sizeof(filename),
+					VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+			vfio_group_fd = open(filename, O_RDWR);
+			if (vfio_group_fd < 0) {
+				if (errno != ENOENT) {
+					RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+							strerror(errno));
+					return -1;
+				}
+				/* neither file exists: report "no group" */
+				return 0;
+			}
+			/* noiommu group found */
+		}
+
+		/* if the fd is valid, create a new group for it */
+		if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
+			RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+			close(vfio_group_fd);
+			return -1;
+		}
+		/* the slot is filled here; vfio_group_idx is not advanced by
+		 * this function */
+		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
+		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+		return vfio_group_fd;
+	}
+	/* if we're in a secondary process, request group fd from the primary
+	 * process via our socket
+	 */
+	else {
+		int socket_fd, ret;
+
+		socket_fd = vfio_mp_sync_connect_to_primary();
+
+		if (socket_fd < 0) {
+			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+			return -1;
+		}
+		/* the messages on this path name the group fd: this is the
+		 * group request, not the container request */
+		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot request group fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
+			close(socket_fd);
+			return -1;
+		}
+		ret = vfio_mp_sync_receive_request(socket_fd);
+		switch (ret) {
+		case SOCKET_NO_FD:
+			close(socket_fd);
+			return 0;
+		case SOCKET_OK:
+			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
+			/* if we got the fd, return it */
+			if (vfio_group_fd > 0) {
+				close(socket_fd);
+				return vfio_group_fd;
+			}
+			/* fall-through on error */
+		default:
+			RTE_LOG(ERR, EAL, "  cannot get group fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+	}
+	return -1;
+}
+
+/* parse IOMMU group number for a PCI device
+ * returns 1 on success, -1 for errors, 0 for non-existent group
+ *
+ * pci_addr:       PCI address string used to build the sysfs path
+ * iommu_group_no: receives the group number, taken from the last path
+ *                 component of the device's "iommu_group" symlink target
+ */
+static int
+pci_vfio_get_group_no(const char *pci_addr, int *iommu_group_no)
+{
+	char linkname[PATH_MAX];
+	char filename[PATH_MAX];
+	char *tok[16], *group_tok, *end;
+	int ret;
+
+	memset(linkname, 0, sizeof(linkname));
+	memset(filename, 0, sizeof(filename));
+
+	/* try to find out IOMMU group for this device */
+	snprintf(linkname, sizeof(linkname),
+			 SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr);
+
+	/* readlink() does not NUL-terminate its result; keeping the last
+	 * byte of the zeroed buffer out of its reach guarantees that the
+	 * string handed to strtol() below is terminated */
+	ret = readlink(linkname, filename, sizeof(filename) - 1);
+
+	/* if the link doesn't exist, no VFIO for us */
+	if (ret < 0)
+		return 0;
+
+	ret = rte_strsplit(filename, sizeof(filename),
+			tok, RTE_DIM(tok), '/');
+
+	if (ret <= 0) {
+		RTE_LOG(ERR, EAL, "  %s cannot get IOMMU group\n", pci_addr);
+		return -1;
+	}
+
+	/* IOMMU group is always the last token */
+	errno = 0;
+	group_tok = tok[ret - 1];
+	end = group_tok;
+	*iommu_group_no = strtol(group_tok, &end, 10);
+	/* reject a token without any digit (end did not move) as well as
+	 * trailing characters after the number and range errors */
+	if (end == group_tok || *end != '\0' || errno != 0) {
+		RTE_LOG(ERR, EAL, "  %s error parsing IOMMU number!\n", pci_addr);
+		return -1;
+	}
+
+	return 1;
+}
+
+/*
+ * Reset the group slot currently selected by vfio_cfg.vfio_group_idx:
+ * group number back to 0 and descriptor to -1.
+ */
+static void
+clear_current_group(void)
+{
+	struct vfio_group *cur = &vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx];
+
+	cur->fd = -1;
+	cur->group_no = 0;
+}
+
+
+/*
+ * map the PCI resources of a PCI device in virtual memory (VFIO version).
+ * primary and secondary processes follow almost exactly the same path
+ *
+ * Steps, in order: resolve the IOMMU group of the device, obtain the group
+ * fd, check group viability, attach the group to the container if needed,
+ * set up the IOMMU type and DMA mappings (primary process, once), obtain
+ * the device fd, then mmap every mappable memory BAR (mapping around the
+ * MSI-X table when a BAR contains it). The primary process additionally
+ * sets up interrupts and bus mastering, resets the device and records the
+ * mappings in the VFIO resource tailq; the secondary process re-uses the
+ * addresses found in its tailq entry.
+ *
+ * returns 0 on success, 1 if the device is not managed by VFIO (caller is
+ * expected to skip it), -1 on error
+ */
+int
+pci_vfio_map_resource(struct rte_pci_device *dev)
+{
+	struct vfio_group_status group_status = {
+			.argsz = sizeof(group_status)
+	};
+	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+	int vfio_group_fd, vfio_dev_fd;
+	int iommu_group_no;
+	char pci_addr[PATH_MAX] = {0};
+	struct rte_pci_addr *loc = &dev->addr;
+	int i, ret, msix_bar;
+	struct mapped_pci_resource *vfio_res = NULL;
+	struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+
+	struct pci_map *maps;
+	uint32_t msix_table_offset = 0;
+	uint32_t msix_table_size = 0;
+	uint32_t ioport_bar;
+
+	dev->intr_handle.fd = -1;
+	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+
+	/* store PCI address string */
+	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+			loc->domain, loc->bus, loc->devid, loc->function);
+
+	/* get group number */
+	ret = pci_vfio_get_group_no(pci_addr, &iommu_group_no);
+	if (ret == 0) {
+		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+			pci_addr);
+		return 1;
+	}
+
+	/* if negative, something failed */
+	if (ret < 0)
+		return -1;
+
+	/* get the actual group fd */
+	vfio_group_fd = pci_vfio_get_group_fd(iommu_group_no);
+	if (vfio_group_fd < 0)
+		return -1;
+
+	/* store group fd */
+	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
+	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+
+	/* if group_fd == 0, that means the device isn't managed by VFIO */
+	if (vfio_group_fd == 0) {
+		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+				pci_addr);
+		/* we store 0 as group fd to distinguish between existing but
+		 * unbound VFIO groups, and groups that don't exist at all.
+		 */
+		vfio_cfg.vfio_group_idx++;
+		return 1;
+	}
+
+	/*
+	 * at this point, we know at least one port on this device is bound to VFIO,
+	 * so we can proceed to try and set this particular port up
+	 */
+
+	/* check if the group is viable */
+	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  %s cannot get group status, "
+				"error %i (%s)\n", pci_addr, errno, strerror(errno));
+		close(vfio_group_fd);
+		clear_current_group();
+		return -1;
+	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", pci_addr);
+		close(vfio_group_fd);
+		clear_current_group();
+		return -1;
+	}
+
+	/*
+	 * at this point, we know that this group is viable (meaning, all devices
+	 * are either bound to VFIO or not bound to anything)
+	 */
+
+	/* check if group does not have a container yet */
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+
+		/* add group to a container */
+		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+				&vfio_cfg.vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container, "
+					"error %i (%s)\n", pci_addr, errno, strerror(errno));
+			close(vfio_group_fd);
+			clear_current_group();
+			return -1;
+		}
+		/*
+		 * at this point we know that this group has been successfully
+		 * initialized, so we increment vfio_group_idx to indicate that we can
+		 * add new groups.
+		 */
+		vfio_cfg.vfio_group_idx++;
+	}
+
+	/*
+	 * pick an IOMMU type and set up DMA mappings for container
+	 *
+	 * needs to be done only once, only when at least one group is assigned to
+	 * a container and only in primary process
+	 */
+	if (internal_config.process_type == RTE_PROC_PRIMARY &&
+			vfio_cfg.vfio_container_has_dma == 0) {
+		/* select an IOMMU type which we will be using */
+		const struct vfio_iommu_type *t =
+				pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+		if (!t) {
+			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", pci_addr);
+			return -1;
+		}
+		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
+					"error %i (%s)\n", pci_addr, errno, strerror(errno));
+			return -1;
+		}
+		vfio_cfg.vfio_container_has_dma = 1;
+	}
+
+	/* get a file descriptor for the device */
+	vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr);
+	if (vfio_dev_fd < 0) {
+		/* if we cannot get a device fd, this simply means that this
+		 * particular port is not bound to VFIO
+		 */
+		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+				pci_addr);
+		return 1;
+	}
+
+	/* test and setup the device */
+	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &device_info);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  %s cannot get device info, "
+				"error %i (%s)\n", pci_addr, errno, strerror(errno));
+		close(vfio_dev_fd);
+		return -1;
+	}
+
+	/* get MSI-X BAR, if any (we have to know where it is because we can't
+	 * easily mmap it when using VFIO) */
+	msix_bar = -1;
+	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar,
+				    &msix_table_offset, &msix_table_size);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n", pci_addr);
+		close(vfio_dev_fd);
+		return -1;
+	}
+
+	/* if we're in a primary process, allocate vfio_res and get region info */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
+		if (vfio_res == NULL) {
+			RTE_LOG(ERR, EAL,
+				"%s(): cannot store uio mmap details\n", __func__);
+			close(vfio_dev_fd);
+			return -1;
+		}
+		memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
+
+		/* get number of registers (up to BAR5) */
+		vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
+				VFIO_PCI_BAR5_REGION_INDEX + 1);
+	} else {
+		/* if we're in a secondary process, just find our tailq entry */
+		TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+			if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+				continue;
+			break;
+		}
+		/* if we haven't found our tailq entry, something's wrong */
+		if (vfio_res == NULL) {
+			RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
+					pci_addr);
+			close(vfio_dev_fd);
+			return -1;
+		}
+	}
+
+	/* map BARs */
+	maps = vfio_res->maps;
+
+	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+		struct vfio_region_info reg = { .argsz = sizeof(reg) };
+		void *bar_addr;
+		/* up to two file ranges to mmap for this BAR: the whole BAR, or
+		 * the parts before and after the MSI-X table
+		 */
+		struct memreg {
+			unsigned long offset, size;
+		} memreg[2] = {};
+
+		reg.index = i;
+
+		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  %s cannot get device region info "
+					"error %i (%s)\n", pci_addr, errno, strerror(errno));
+			close(vfio_dev_fd);
+			if (internal_config.process_type == RTE_PROC_PRIMARY)
+				rte_free(vfio_res);
+			return -1;
+		}
+
+		/* chk for io port region */
+		ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
+			      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
+			      + PCI_BASE_ADDRESS_0 + i*4);
+
+		if (ret != sizeof(ioport_bar)) {
+			RTE_LOG(ERR, EAL,
+				"Cannot read command (%x) from config space!\n",
+				PCI_BASE_ADDRESS_0 + i*4);
+			/* release the device fd and the entry allocated above,
+			 * exactly like the other failure paths of this loop
+			 */
+			close(vfio_dev_fd);
+			if (internal_config.process_type == RTE_PROC_PRIMARY)
+				rte_free(vfio_res);
+			return -1;
+		}
+
+		if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) {
+			RTE_LOG(INFO, EAL,
+				"Ignore mapping IO port bar(%d) addr: %x\n",
+				 i, ioport_bar);
+			continue;
+		}
+
+		/* skip non-mmapable BARs */
+		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+			continue;
+
+		if (i == msix_bar) {
+			/*
+			 * VFIO will not let us map the MSI-X table,
+			 * but we can map around it.
+			 */
+			uint32_t table_start = msix_table_offset;
+			uint32_t table_end = table_start + msix_table_size;
+			/* widen the hole to whole pages */
+			table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
+			table_start &= PAGE_MASK;
+
+			if (table_start == 0 && table_end >= reg.size) {
+				/* Cannot map this BAR */
+				RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i);
+				continue;
+			} else {
+				memreg[0].offset = reg.offset;
+				memreg[0].size = table_start;
+				/* file offsets are relative to the device fd, so
+				 * the part after the table starts at the region
+				 * offset plus the end of the table
+				 */
+				memreg[1].offset = reg.offset + table_end;
+				memreg[1].size = reg.size - table_end;
+
+				RTE_LOG(DEBUG, EAL,
+					"Trying to map BAR %d that contains the MSI-X "
+					"table. Trying offsets: "
+					"0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", i,
+					memreg[0].offset, memreg[0].size,
+					memreg[1].offset, memreg[1].size);
+			}
+		} else {
+			memreg[0].offset = reg.offset;
+			memreg[0].size = reg.size;
+		}
+
+		/* try to figure out an address */
+		if (internal_config.process_type == RTE_PROC_PRIMARY) {
+			/* try mapping somewhere close to the end of hugepages */
+			if (pci_map_addr == NULL)
+				pci_map_addr = pci_find_max_end_va();
+
+			bar_addr = pci_map_addr;
+			pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
+		} else {
+			bar_addr = maps[i].addr;
+		}
+
+		/* reserve the address using an inaccessible mapping */
+		bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE |
+				MAP_ANONYMOUS, -1, 0);
+		if (bar_addr != MAP_FAILED) {
+			void *map_addr = NULL;
+			if (memreg[0].size) {
+				/* actual map of first part */
+				map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
+							    memreg[0].offset,
+							    memreg[0].size,
+							    MAP_FIXED);
+			}
+
+			/* if there's a second part, try to map it */
+			if (map_addr != MAP_FAILED
+			    && memreg[1].offset && memreg[1].size) {
+				/* virtual address = start of the reservation plus
+				 * the distance of the second part from the start
+				 * of the BAR
+				 */
+				void *second_addr = RTE_PTR_ADD(bar_addr,
+						(uintptr_t)(memreg[1].offset -
+						memreg[0].offset));
+				map_addr = pci_map_resource(second_addr,
+							    vfio_dev_fd, memreg[1].offset,
+							    memreg[1].size,
+							    MAP_FIXED);
+			}
+
+			/* neither part mapped, or a mapping failed: drop the
+			 * reservation
+			 */
+			if (map_addr == MAP_FAILED || !map_addr) {
+				munmap(bar_addr, reg.size);
+				bar_addr = MAP_FAILED;
+			}
+		}
+
+		/* a secondary process must end up at the address stored in its
+		 * tailq entry
+		 */
+		if (bar_addr == MAP_FAILED ||
+				(internal_config.process_type == RTE_PROC_SECONDARY &&
+						bar_addr != maps[i].addr)) {
+			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n", pci_addr, i,
+					strerror(errno));
+			close(vfio_dev_fd);
+			if (internal_config.process_type == RTE_PROC_PRIMARY)
+				rte_free(vfio_res);
+			return -1;
+		}
+
+		maps[i].addr = bar_addr;
+		maps[i].offset = reg.offset;
+		maps[i].size = reg.size;
+		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
+		dev->mem_resource[i].addr = bar_addr;
+	}
+
+	/* if secondary process, do not set up interrupts */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
+			RTE_LOG(ERR, EAL, "  %s error setting up interrupts!\n", pci_addr);
+			close(vfio_dev_fd);
+			rte_free(vfio_res);
+			return -1;
+		}
+
+		/* set bus mastering for the device */
+		if (pci_vfio_set_bus_master(vfio_dev_fd)) {
+			RTE_LOG(ERR, EAL, "  %s cannot set up bus mastering!\n", pci_addr);
+			close(vfio_dev_fd);
+			rte_free(vfio_res);
+			return -1;
+		}
+
+		/* Reset the device */
+		ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
+	}
+
+	/* only the primary process owns (and publishes) the tailq entry */
+	if (internal_config.process_type == RTE_PROC_PRIMARY)
+		TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);
+
+	return 0;
+}
+
+/*
+ * fill in an I/O port handle for one BAR of a device.
+ *
+ * bar must lie between VFIO_PCI_BAR0_REGION_INDEX and
+ * VFIO_PCI_BAR5_REGION_INDEX (inclusive). On success the handle records the
+ * device and the region address derived from the BAR index.
+ *
+ * returns 0 on success, -1 if the BAR index is out of range
+ */
+int
+pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
+		    struct rte_pci_ioport *p)
+{
+	const int bar_is_valid = bar >= VFIO_PCI_BAR0_REGION_INDEX &&
+				 bar <= VFIO_PCI_BAR5_REGION_INDEX;
+
+	if (bar_is_valid) {
+		p->dev = dev;
+		p->base = VFIO_GET_REGION_ADDR(bar);
+		return 0;
+	}
+
+	RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
+	return -1;
+}
+
+/*
+ * read len bytes into data from the region described by p, starting at
+ * offset within that region. The access goes through the VFIO device fd
+ * kept in the device's interrupt handle; a failed or empty read is only
+ * logged.
+ */
+void
+pci_vfio_ioport_read(struct rte_pci_ioport *p,
+		     void *data, size_t len, off_t offset)
+{
+	const int dev_fd = p->dev->intr_handle.vfio_dev_fd;
+	ssize_t nread;
+
+	nread = pread64(dev_fd, data, len, p->base + offset);
+	if (nread > 0)
+		return;
+
+	RTE_LOG(ERR, EAL,
+		"Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
+		VFIO_GET_REGION_IDX(p->base), (int)offset);
+}
+
+/*
+ * write len bytes from data to the region described by p, starting at
+ * offset within that region. The access goes through the VFIO device fd
+ * kept in the device's interrupt handle; a failed or empty write is only
+ * logged.
+ */
+void
+pci_vfio_ioport_write(struct rte_pci_ioport *p,
+		      const void *data, size_t len, off_t offset)
+{
+	const int dev_fd = p->dev->intr_handle.vfio_dev_fd;
+	ssize_t nwritten;
+
+	nwritten = pwrite64(dev_fd, data, len, p->base + offset);
+	if (nwritten > 0)
+		return;
+
+	RTE_LOG(ERR, EAL,
+		"Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
+		VFIO_GET_REGION_IDX(p->base), (int)offset);
+}
+
+/*
+ * counterpart of pci_vfio_ioport_map(); performs no work on the handle
+ * and always returns -1
+ */
+int
+pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
+{
+	/* the handle is intentionally left untouched */
+	RTE_SET_USED(p);
+
+	return -1;
+}
+
+/*
+ * probe for VFIO support and initialize the global VFIO configuration.
+ *
+ * Resets every group slot, checks whether the vfio_pci kernel module is
+ * loaded and, if so, obtains the container fd. vfio_cfg.vfio_enabled is set
+ * only when a container fd was obtained.
+ *
+ * returns -1 if the module state could not be determined, 0 otherwise
+ * (including when VFIO is simply unavailable)
+ */
+int
+pci_vfio_enable(void)
+{
+	int idx;
+	int module_state;
+
+	/* initialize group list */
+	for (idx = 0; idx < VFIO_MAX_GROUPS; idx++) {
+		vfio_cfg.vfio_groups[idx].fd = -1;
+		vfio_cfg.vfio_groups[idx].group_no = -1;
+	}
+
+	/* inform the user that we are probing for VFIO */
+	RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+	/* check if vfio-pci module is loaded */
+	module_state = rte_eal_check_module("vfio_pci");
+
+	switch (module_state) {
+	case -1:
+		/* return error directly */
+		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
+		return -1;
+	case 0:
+		/* VFIO modules not loaded: not an error */
+		RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, "
+			"skipping VFIO support...\n");
+		return 0;
+	default:
+		break;
+	}
+
+	vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
+
+	/* without a container fd VFIO stays disabled */
+	if (vfio_cfg.vfio_container_fd == -1) {
+		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+		return 0;
+	}
+
+	RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
+	vfio_cfg.vfio_enabled = 1;
+
+	return 0;
+}
+
+/*
+ * report the vfio_enabled flag of the global VFIO configuration
+ * (set to 1 by pci_vfio_enable() once a container fd was obtained)
+ */
+int
+pci_vfio_is_enabled(void)
+{
+	const int enabled = vfio_cfg.vfio_enabled;
+
+	return enabled;
+}
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c
new file mode 100644
index 00000000..d9188fde
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_mp_sync.c
@@ -0,0 +1,405 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <pthread.h>
+
+/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
+#ifdef __USE_MISC
+#define REMOVED_USE_MISC
+#undef __USE_MISC
+#endif
+#include <sys/un.h>
+/* make sure we redefine __USE_MISC only if it was previously defined */
+#ifdef REMOVED_USE_MISC
+#define __USE_MISC
+#undef REMOVED_USE_MISC
+#endif
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+#include "eal_thread.h"
+
+/**
+ * @file
+ * VFIO socket for communication between primary and secondary processes.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+
+/* socket path layout: <directory>/.<prefix>_mp_socket */
+#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
+/* length of a control message that carries exactly one int (one fd) */
+#define CMSGLEN (CMSG_LEN(sizeof(int)))
+/* fill control message header chdr so that it passes fd as SCM_RIGHTS */
+#define FD_TO_CMSGHDR(fd, chdr) \
+		do {\
+			(chdr).cmsg_len = CMSGLEN;\
+			(chdr).cmsg_level = SOL_SOCKET;\
+			(chdr).cmsg_type = SCM_RIGHTS;\
+			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
+		} while (0)
+/* copy the fd stored in the data area of control message header chdr */
+#define CMSGHDR_TO_FD(chdr, fd) \
+			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
+
+/* thread running pci_vfio_mp_sync_thread() */
+static pthread_t socket_thread;
+/* listening socket the thread accepts connections on */
+static int mp_socket_fd;
+
+
+/*
+ * build the path of the multi-process socket into buffer (at most bufsz
+ * bytes): the directory is /var/run for root or when $HOME is not set,
+ * $HOME otherwise; the file name is derived from the hugefile prefix
+ */
+static void
+get_socket_path(char *buffer, int bufsz)
+{
+	const char *home = getenv("HOME");
+	const char *base_dir;
+
+	if (getuid() == 0 || home == NULL)
+		base_dir = "/var/run";
+	else
+		base_dir = home;
+
+	/* use current prefix as file path */
+	snprintf(buffer, bufsz, SOCKET_PATH_FMT, base_dir,
+			internal_config.hugefile_prefix);
+}
+
+
+
+/*
+ * data flow for socket comm protocol:
+ * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
+ * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
+ * 2. server receives message
+ * 2a. in case of invalid group, SOCKET_ERR is sent back to client
+ * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
+ * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
+ *
+ * in case of any error, socket is closed.
+ */
+
+/*
+ * send a single int (request or status code) over the socket as one
+ * message; returns 0 on success, -1 on error
+ */
+int
+vfio_mp_sync_send_request(int socket, int req)
+{
+	int payload = req;
+	struct iovec vec;
+	struct msghdr msg;
+
+	vec.iov_base = (char *) &payload;
+	vec.iov_len = sizeof(payload);
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+
+	return sendmsg(socket, &msg, 0) < 0 ? -1 : 0;
+}
+
+/*
+ * receive a single int (request or status code) from the socket and
+ * return it; returns -1 if recvmsg() fails. The receive buffer starts out
+ * as SOCKET_ERR, so that is what comes back when nothing overwrites it.
+ */
+int
+vfio_mp_sync_receive_request(int socket)
+{
+	int payload = SOCKET_ERR;
+	struct iovec vec;
+	struct msghdr msg;
+
+	vec.iov_base = (char *) &payload;
+	vec.iov_len = sizeof(payload);
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+
+	if (recvmsg(socket, &msg, 0) < 0)
+		return -1;
+
+	return payload;
+}
+
+/*
+ * send SOCKET_OK as the message payload and fd as an SCM_RIGHTS control
+ * message; returns 0 on success, -1 on error
+ */
+int
+vfio_mp_sync_send_fd(int socket, int fd)
+{
+	int status = SOCKET_OK;
+	char ctrl_buf[CMSGLEN];
+	struct cmsghdr *ctrl = (struct cmsghdr *) ctrl_buf;
+	struct iovec vec;
+	struct msghdr msg;
+
+	memset(ctrl, 0, sizeof(ctrl_buf));
+	memset(&msg, 0, sizeof(msg));
+
+	/* regular payload: the status code */
+	vec.iov_base = (char *) &status;
+	vec.iov_len = sizeof(status);
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+
+	/* ancillary payload: the descriptor itself */
+	msg.msg_control = ctrl;
+	msg.msg_controllen = CMSGLEN;
+	FD_TO_CMSGHDR(fd, *ctrl);
+
+	if (sendmsg(socket, &msg, 0) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * receive OK in message, fd in control message
+ *
+ * returns the received fd, or -1 if recvmsg() fails, if the payload is not
+ * SOCKET_OK, or if the message does not carry an SCM_RIGHTS control
+ * message large enough to hold one descriptor
+ */
+int
+vfio_mp_sync_receive_fd(int socket)
+{
+	int buf;
+	struct msghdr hdr;
+	struct cmsghdr *chdr;
+	char chdr_buf[CMSGLEN];
+	struct iovec iov;
+	int ret, req, fd;
+
+	/* stays SOCKET_ERR if the peer sends no payload */
+	buf = SOCKET_ERR;
+
+	chdr = (struct cmsghdr *) chdr_buf;
+	memset(chdr, 0, sizeof(chdr_buf));
+	memset(&hdr, 0, sizeof(hdr));
+
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+	iov.iov_base = (char *) &buf;
+	iov.iov_len = sizeof(buf);
+	hdr.msg_control = chdr;
+	hdr.msg_controllen = CMSGLEN;
+
+	ret = recvmsg(socket, &hdr, 0);
+	if (ret < 0)
+		return -1;
+
+	req = buf;
+
+	if (req != SOCKET_OK)
+		return -1;
+
+	/* do not trust the zeroed buffer: without this check a status-only
+	 * message would make us hand back descriptor 0
+	 */
+	chdr = CMSG_FIRSTHDR(&hdr);
+	if (chdr == NULL || chdr->cmsg_len < CMSGLEN ||
+			chdr->cmsg_level != SOL_SOCKET ||
+			chdr->cmsg_type != SCM_RIGHTS)
+		return -1;
+
+	CMSGHDR_TO_FD(*chdr, fd);
+
+	return fd;
+}
+
+/*
+ * create a SOCK_SEQPACKET unix socket and connect it to the socket path
+ * produced by get_socket_path(); returns the connected socket, or -1 if the
+ * socket cannot be created or the connection fails
+ */
+int
+vfio_mp_sync_connect_to_primary(void)
+{
+	struct sockaddr_un primary_addr;
+	int sock;
+
+	/* set up a socket */
+	sock = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+	if (sock < 0) {
+		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+		return -1;
+	}
+
+	get_socket_path(primary_addr.sun_path, sizeof(primary_addr.sun_path));
+	primary_addr.sun_family = AF_UNIX;
+
+	if (connect(sock, (struct sockaddr *) &primary_addr,
+			sizeof(struct sockaddr_un)) != 0) {
+		/* if connect failed */
+		close(sock);
+		return -1;
+	}
+
+	return sock;
+}
+
+
+
+/*
+ * socket listening thread for primary process
+ *
+ * Never returns. Each accepted connection is served for exactly one
+ * request (container fd or group fd) and then closed.
+ */
+static __attribute__((noreturn)) void *
+pci_vfio_mp_sync_thread(void __rte_unused * arg)
+{
+	int request, fd, group_no;
+
+	/* wait for requests on the socket */
+	for (;;) {
+		int client;
+		struct sockaddr_un peer;
+		socklen_t peer_len = sizeof(peer);
+		struct linger linger_opt;
+
+		/* this is a blocking call */
+		client = accept(mp_socket_fd, (struct sockaddr *) &peer,
+				&peer_len);
+
+		/* just restart on error */
+		if (client == -1)
+			continue;
+
+		/* set socket to linger after close */
+		linger_opt.l_onoff = 1;
+		linger_opt.l_linger = 60;
+		setsockopt(client, SOL_SOCKET, SO_LINGER, &linger_opt,
+				sizeof(linger_opt));
+
+		request = vfio_mp_sync_receive_request(client);
+
+		if (request == SOCKET_REQ_CONTAINER) {
+			fd = pci_vfio_get_container_fd();
+			if (fd < 0)
+				vfio_mp_sync_send_request(client, SOCKET_ERR);
+			else
+				vfio_mp_sync_send_fd(client, fd);
+		} else if (request == SOCKET_REQ_GROUP) {
+			/* wait for group number */
+			group_no = vfio_mp_sync_receive_request(client);
+
+			/* a negative group number gets no reply at all, the
+			 * connection is simply closed below
+			 */
+			if (group_no >= 0) {
+				fd = pci_vfio_get_group_fd(group_no);
+
+				if (fd < 0) {
+					vfio_mp_sync_send_request(client, SOCKET_ERR);
+				} else if (fd == 0) {
+					/* VFIO group exists but isn't bound to
+					 * VFIO driver
+					 */
+					vfio_mp_sync_send_request(client, SOCKET_NO_FD);
+				} else {
+					/* group exists and is bound to VFIO driver */
+					vfio_mp_sync_send_request(client, SOCKET_OK);
+					vfio_mp_sync_send_fd(client, fd);
+				}
+			}
+		} else {
+			/* unknown request, or receiving it failed */
+			vfio_mp_sync_send_request(client, SOCKET_ERR);
+		}
+
+		close(client);
+	}
+}
+
+/*
+ * create the listening SOCK_SEQPACKET unix socket of the primary process:
+ * remove any stale socket file, bind to the path from get_socket_path(),
+ * start listening and store the socket in mp_socket_fd.
+ *
+ * returns 0 on success, -1 on error (the socket is closed in that case)
+ */
+static int
+vfio_mp_sync_socket_setup(void)
+{
+	struct sockaddr_un local_addr;
+	int listen_fd;
+
+	/* set up a socket */
+	listen_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+	if (listen_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+		return -1;
+	}
+
+	get_socket_path(local_addr.sun_path, sizeof(local_addr.sun_path));
+	local_addr.sun_family = AF_UNIX;
+
+	/* get rid of a socket file left behind by an earlier run */
+	unlink(local_addr.sun_path);
+
+	if (bind(listen_fd, (struct sockaddr *) &local_addr,
+			sizeof(struct sockaddr_un)) != 0) {
+		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
+		goto fail;
+	}
+
+	if (listen(listen_fd, 50) != 0) {
+		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
+		goto fail;
+	}
+
+	/* save the socket in local configuration */
+	mp_socket_fd = listen_fd;
+
+	return 0;
+
+fail:
+	close(listen_fd);
+	return -1;
+}
+
+/*
+ * set up a local socket and tell it to listen for incoming connections
+ *
+ * Also starts the thread that serves those connections and names it
+ * "pci-vfio-sync". Failing to name the thread is logged but is not an
+ * error.
+ *
+ * returns 0 on success, -1 if the socket or the thread cannot be created
+ */
+int
+pci_vfio_mp_sync_setup(void)
+{
+	char name[RTE_MAX_THREAD_NAME_LEN];
+
+	if (vfio_mp_sync_socket_setup() < 0) {
+		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
+		return -1;
+	}
+
+	if (pthread_create(&socket_thread, NULL,
+			pci_vfio_mp_sync_thread, NULL) != 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to create thread for communication with secondary processes!\n");
+		/* nobody will ever accept on the socket, so drop it */
+		close(mp_socket_fd);
+		return -1;
+	}
+
+	/* Set thread_name for aid in debugging. */
+	snprintf(name, sizeof(name), "pci-vfio-sync");
+	if (rte_thread_setname(socket_thread, name) != 0)
+		RTE_LOG(ERR, EAL,
+			"Failed to set thread name for secondary processes!\n");
+
+	return 0;
+}
+
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
new file mode 100644
index 00000000..18bd8e04
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -0,0 +1,199 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sched.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+
+#include <rte_debug.h>
+#include <rte_atomic.h>
+#include <rte_launch.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_per_lcore.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+
+/* per-lcore variables; the ids start out as LCORE_ID_ANY / SOCKET_ID_ANY */
+RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY;
+RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY;
+RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
+
+/*
+ * Ask the slave lcore identified by slave_id to call function f with
+ * argument arg: store both in the lcore configuration, write one byte on
+ * the master-to-slave pipe and wait for the one-byte acknowledgement on
+ * the slave-to-master pipe.
+ *
+ * Returns 0 once the slave has acknowledged, -EBUSY if the slave is not in
+ * WAIT state. Pipe errors are fatal (rte_panic).
+ */
+int
+rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id)
+{
+	const int cmd_fd = lcore_config[slave_id].pipe_master2slave[1];
+	const int ack_fd = lcore_config[slave_id].pipe_slave2master[0];
+	char token = 0;
+	int nbytes;
+
+	if (lcore_config[slave_id].state != WAIT)
+		return -EBUSY;
+
+	lcore_config[slave_id].f = f;
+	lcore_config[slave_id].arg = arg;
+
+	/* send message; retry on short (zero-byte) writes and on EINTR */
+	do {
+		nbytes = write(cmd_fd, &token, 1);
+	} while (nbytes == 0 || (nbytes < 0 && errno == EINTR));
+	if (nbytes < 0)
+		rte_panic("cannot write on configuration pipe\n");
+
+	/* wait ack; only EINTR is retried */
+	for (;;) {
+		nbytes = read(ack_fd, &token, 1);
+		if (nbytes >= 0 || errno != EINTR)
+			break;
+	}
+
+	if (nbytes <= 0)
+		rte_panic("cannot read on configuration pipe\n");
+
+	return 0;
+}
+
+/*
+ * apply the cpuset configured for the calling lcore to the current EAL
+ * thread; returns the result of rte_thread_set_affinity()
+ */
+static int
+eal_thread_set_affinity(void)
+{
+	unsigned self = rte_lcore_id();
+
+	/* acquire system unique id  */
+	rte_gettid();
+
+	/* update EAL thread core affinity */
+	return rte_thread_set_affinity(&lcore_config[self].cpuset);
+}
+
+/*
+ * initialize the calling thread as lcore lcore_id: record the id in the
+ * per-lcore area and apply the CPU affinity; panics if the affinity cannot
+ * be set
+ */
+void eal_thread_init_master(unsigned lcore_id)
+{
+	int rc;
+
+	/* set the lcore ID in per-lcore memory area */
+	RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+	/* set CPU affinity */
+	rc = eal_thread_set_affinity();
+	if (rc < 0)
+		rte_panic("cannot set affinity\n");
+}
+
+/*
+ * main loop of threads
+ *
+ * Finds the lcore this thread belongs to, sets its affinity, then serves
+ * commands forever: wait for one byte on the master-to-slave pipe, mark the
+ * lcore RUNNING, acknowledge on the slave-to-master pipe, run the
+ * configured function and publish its return value before switching to
+ * FINISHED. Never returns.
+ */
+__attribute__((noreturn)) void *
+eal_thread_loop(__attribute__((unused)) void *arg)
+{
+	char token;
+	int nbytes, ret;
+	unsigned lcore_id;
+	pthread_t self;
+	int cmd_fd, ack_fd;
+	char cpuset[RTE_CPU_AFFINITY_STR_LEN];
+
+	self = pthread_self();
+
+	/* retrieve our lcore_id from the configuration structure */
+	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+		if (self == lcore_config[lcore_id].thread_id)
+			break;
+	}
+	if (lcore_id == RTE_MAX_LCORE)
+		rte_panic("cannot retrieve lcore id\n");
+
+	cmd_fd = lcore_config[lcore_id].pipe_master2slave[0];
+	ack_fd = lcore_config[lcore_id].pipe_slave2master[1];
+
+	/* set the lcore ID in per-lcore memory area */
+	RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+	/* set CPU affinity */
+	if (eal_thread_set_affinity() < 0)
+		rte_panic("cannot set affinity\n");
+
+	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
+
+	RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
+		lcore_id, (int)self, cpuset, ret == 0 ? "" : "...");
+
+	/* read on our pipe to get commands */
+	for (;;) {
+		void *fct_arg;
+
+		/* wait command; only EINTR is retried */
+		for (;;) {
+			nbytes = read(cmd_fd, &token, 1);
+			if (nbytes >= 0 || errno != EINTR)
+				break;
+		}
+
+		if (nbytes <= 0)
+			rte_panic("cannot read on configuration pipe\n");
+
+		lcore_config[lcore_id].state = RUNNING;
+
+		/* send ack; retry on zero-byte writes and on EINTR */
+		do {
+			nbytes = write(ack_fd, &token, 1);
+		} while (nbytes == 0 || (nbytes < 0 && errno == EINTR));
+		if (nbytes < 0)
+			rte_panic("cannot write on configuration pipe\n");
+
+		if (lcore_config[lcore_id].f == NULL)
+			rte_panic("NULL function pointer\n");
+
+		/* call the function and store the return value */
+		fct_arg = lcore_config[lcore_id].arg;
+		ret = lcore_config[lcore_id].f(fct_arg);
+		lcore_config[lcore_id].ret = ret;
+		/* barrier between storing the result and flagging completion */
+		rte_wmb();
+		lcore_config[lcore_id].state = FINISHED;
+	}
+
+	/* never reached */
+	/* pthread_exit(NULL); */
+	/* return NULL; */
+}
+
+/* return the tid of the calling thread, obtained via the gettid syscall */
+int rte_sys_gettid(void)
+{
+	long tid = syscall(SYS_gettid);
+
+	return (int)tid;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
new file mode 100644
index 00000000..f2abb7b6
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -0,0 +1,305 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2012-2013 6WIND S.A.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_cycles.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_eal.h>
+#include <rte_debug.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+enum timer_source eal_timer_source = EAL_TIMER_HPET;
+
+#ifdef RTE_LIBEAL_USE_HPET
+
+#define DEV_HPET "/dev/hpet"
+
+/* Maximum number of counters. */
+#define HPET_TIMER_NUM 3
+
+/* General capabilities register */
+#define CLK_PERIOD_SHIFT     32 /* Clock period shift. */
+#define CLK_PERIOD_MASK      0xffffffff00000000ULL /* Clock period mask. */
+
+/**
+ * Layout of the memory-mapped HPET register block, from the Intel IA-PC HPET
+ * (High Precision Event Timers) Specification.
+ */
+struct eal_hpet_regs {
+	/* Memory-mapped, software visible registers */
+	uint64_t capabilities;      /**< RO General Capabilities Register. */
+	uint64_t reserved0;         /**< Reserved for future use. */
+	uint64_t config;            /**< RW General Configuration Register. */
+	uint64_t reserved1;         /**< Reserved for future use. */
+	uint64_t isr;               /**< RW Clear General Interrupt Status. */
+	uint64_t reserved2[25];     /**< Reserved; pads counter to offset 0xF0. */
+	union {
+		uint64_t counter;   /**< RW Main Counter Value Register. */
+		struct {
+			uint32_t counter_l; /**< RW Main Counter Low 32 bits. */
+			uint32_t counter_h; /**< RW Main Counter High 32 bits. */
+		};
+	};
+	uint64_t reserved3;         /**< Reserved for future use. */
+	struct {
+		uint64_t config;    /**< RW Timer Config and Capability Reg. */
+		uint64_t comp;      /**< RW Timer Comparator Value Register. */
+		uint64_t fsb;       /**< RW FSB Interrupt Route Register. */
+		uint64_t reserved4; /**< Reserved for future use. */
+	} timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */
+};
+
+/* Mmap'd hpet registers */
+static volatile struct eal_hpet_regs *eal_hpet = NULL;
+
+/* Period at which the HPET counter increments in
+ * femtoseconds (10^-15 seconds). */
+static uint32_t eal_hpet_resolution_fs = 0;
+
+/* Frequency of the HPET counter in Hz */
+static uint64_t eal_hpet_resolution_hz = 0;
+
+/* Incremented 4 times during one 32bits hpet full count */
+static uint32_t eal_hpet_msb;
+
+static pthread_t msb_inc_thread_id;
+
+/*
+ * Thread body (pthread start-routine signature; never returns): tracks the
+ * top two bits of the 32-bit HPET counter in eal_hpet_msb, polling every 10s.
+ */
+static void *
+hpet_msb_inc(__attribute__((unused)) void *arg)
+{
+	uint32_t t;
+
+	while (1) {
+		t = (eal_hpet->counter_l >> 30);
+		if (t != (eal_hpet_msb & 3))
+			eal_hpet_msb ++;
+		sleep(10);
+	}
+	return NULL; /* not reached */
+}
+
+uint64_t
+rte_get_hpet_hz(void)
+{
+	if(internal_config.no_hpet) /* HPET disabled, or its init failed */
+		rte_panic("Error, HPET called, but no HPET present\n");
+	/* HPET counter frequency in Hz, as computed by rte_eal_hpet_init() */
+	return eal_hpet_resolution_hz;
+}
+
+uint64_t
+rte_get_hpet_cycles(void)
+{
+	uint32_t t, msb;
+	uint64_t ret;
+
+	if(internal_config.no_hpet)
+		rte_panic("Error, HPET called, but no HPET present\n");
+	/* result = (number of 32-bit wraps << 32) + current low 32 bits */
+	t = eal_hpet->counter_l;
+	msb = eal_hpet_msb;
+	ret = (msb + 2 - (t >> 30)) / 4; /* msb counts quarter wraps */
+	ret <<= 32;
+	ret += t;
+	return ret;
+}
+
+#endif
+
+#ifdef RTE_LIBEAL_USE_HPET
+/*
+ * Open and mmap /dev/hpet (high precision event timer) that will provide
+ * our time reference. If make_default is non-zero, HPET becomes the EAL
+ * timer source. Returns 0 on success, -1 (no_hpet set) on any failure.
+ */
+int
+rte_eal_hpet_init(int make_default)
+{
+	int fd, ret;
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+	if (internal_config.no_hpet) {
+		RTE_LOG(NOTICE, EAL, "HPET is disabled\n");
+		return -1;
+	}
+
+	fd = open(DEV_HPET, O_RDONLY);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n",
+			strerror(errno));
+		internal_config.no_hpet = 1;
+		return -1;
+	}
+	eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0);
+	if (eal_hpet == MAP_FAILED) {
+		RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n"
+				"Please enable CONFIG_HPET_MMAP in your kernel configuration "
+				"to allow HPET support.\n"
+				"To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n "
+				"in your build configuration or use '--no-hpet' EAL flag.\n");
+		close(fd);
+		internal_config.no_hpet = 1;
+		return -1;
+	}
+	close(fd);
+
+	eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities &
+					CLK_PERIOD_MASK) >> CLK_PERIOD_SHIFT);
+	if (eal_hpet_resolution_fs == 0) { /* guard the division below */
+		RTE_LOG(ERR, EAL, "ERROR: HPET clock period is zero!\n");
+		internal_config.no_hpet = 1;
+		return -1;
+	}
+
+	eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) /
+		(uint64_t)eal_hpet_resolution_fs;
+	RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n",
+			eal_hpet_resolution_hz/1000);
+
+	eal_hpet_msb = (eal_hpet->counter_l >> 30);
+	/* start the msb thread (hpet is 32 bits by default under linux) */
+	ret = pthread_create(&msb_inc_thread_id, NULL,
+			(void *(*)(void *))hpet_msb_inc, NULL);
+	if (ret != 0) {
+		RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
+		internal_config.no_hpet = 1;
+		return -1;
+	}
+
+	/* Set thread_name for aid in debugging. */
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "hpet-msb-inc");
+	ret = rte_thread_setname(msb_inc_thread_id, thread_name);
+	if (ret != 0)
+		RTE_LOG(ERR, EAL,
+			"ERROR: Cannot set HPET timer thread name!\n");
+
+	if (make_default)
+		eal_timer_source = EAL_TIMER_HPET;
+	return 0;
+}
+#endif
+
+static void
+check_tsc_flags(void)
+{
+	char line[512];
+	FILE *stream;
+	/* warn if the first "flags" line lacks constant_tsc or nonstop_tsc */
+	stream = fopen("/proc/cpuinfo", "r");
+	if (!stream) {
+		RTE_LOG(WARNING, EAL, "WARNING: Unable to open /proc/cpuinfo\n");
+		return;
+	}
+
+	while (fgets(line, sizeof line, stream)) {
+		char *constant_tsc;
+		char *nonstop_tsc;
+		/* only a line starting with "flags" is examined */
+		if (strncmp(line, "flags", 5) != 0)
+			continue;
+
+		constant_tsc = strstr(line, "constant_tsc");
+		nonstop_tsc = strstr(line, "nonstop_tsc");
+		if (!constant_tsc || !nonstop_tsc)
+			RTE_LOG(WARNING, EAL,
+				"WARNING: cpu flags "
+				"constant_tsc=%s "
+				"nonstop_tsc=%s "
+				"-> using unreliable clock cycles !\n",
+				constant_tsc ? "yes":"no",
+				nonstop_tsc ? "yes":"no");
+		break; /* stop after the first "flags" line */
+	}
+
+	fclose(stream);
+}
+
+uint64_t
+get_tsc_freq(void)
+{
+#ifdef CLOCK_MONOTONIC_RAW
+#define NS_PER_SEC 1E9
+	/* estimate the TSC rate over a ~0.1 s sleep; returns 0 if unavailable */
+	struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */
+
+	struct timespec t_start, t_end;
+	uint64_t tsc_hz;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) {
+		uint64_t ns, end, start = rte_rdtsc();
+		nanosleep(&sleeptime,NULL);
+		clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
+		end = rte_rdtsc();
+		ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC);
+		ns += (t_end.tv_nsec - t_start.tv_nsec);
+		/* TSC ticks elapsed divided by measured seconds elapsed */
+		double secs = (double)ns/NS_PER_SEC;
+		tsc_hz = (uint64_t)((end - start)/secs);
+		return tsc_hz;
+	}
+#endif
+	return 0;
+}
+
+int
+rte_eal_timer_init(void)
+{
+	/* select TSC as the timer source; always returns 0 */
+	eal_timer_source = EAL_TIMER_TSC;
+
+	set_tsc_freq();
+	check_tsc_flags();
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
new file mode 100644
index 00000000..f483bf40
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -0,0 +1,67 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef EAL_VFIO_H_
+#define EAL_VFIO_H_
+
+/*
+ * Define VFIO_PRESENT when RTE_EAL_VFIO is set and kernel headers are >= 3.6
+ */
+#ifdef RTE_EAL_VFIO
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
+#include <linux/vfio.h>
+/* MSI-X constants are hard-coded for kernel headers older than 3.10 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
+#define RTE_PCI_MSIX_TABLE_BIR    0x7
+#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8
+#define RTE_PCI_MSIX_FLAGS_QSIZE  0x07ff
+#else
+#define RTE_PCI_MSIX_TABLE_BIR    PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET
+#define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
+#endif
+
+#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
+/* no-IOMMU type is hard-coded for kernel headers older than 4.5 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
+#define RTE_VFIO_NOIOMMU 8
+#else
+#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
+#endif
+
+#define VFIO_PRESENT
+#endif /* kernel version */
+#endif /* RTE_EAL_VFIO */
+
+#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_xen_memory.c b/lib/librte_eal/linuxapp/eal/eal_xen_memory.c
new file mode 100644
index 00000000..495eef9e
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_xen_memory.c
@@ -0,0 +1,369 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdarg.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include <rte_string_fns.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include <exec-env/rte_dom0_common.h>
+
+#define PAGE_SIZE RTE_PGSIZE_4K
+#define DEFAUL_DOM0_NAME "dom0-mem"
+
+static int xen_fd = -1;
+static const char sys_dir_path[] = "/sys/kernel/mm/dom0-mm/memsize-mB";
+
+/*
+ * Probe for a free virtual range: map (*size + mem_size) bytes of /dev/zero,
+ * shrinking *size by mem_size (never below 0) after each failure. The probe
+ * mapping is released again before returning; the result is its start
+ * address rounded up to a multiple of mem_size, and *size is the size that
+ * could be mapped. Returns NULL if /dev/zero or every mapping attempt fails.
+ */
+static void *
+xen_get_virtual_area(size_t *size, size_t mem_size)
+{
+	void *addr;
+	int fd;
+	uintptr_t aligned_addr;
+
+	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
+
+	fd = open("/dev/zero", O_RDONLY);
+	if (fd < 0){
+		RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
+		return NULL;
+	}
+	do {
+		addr = mmap(NULL, (*size) + mem_size, PROT_READ,
+			MAP_PRIVATE, fd, 0);
+		if (addr == MAP_FAILED) /* clamp at 0: size_t must not wrap */
+			*size = (*size > mem_size) ? *size - mem_size : 0;
+	} while (addr == MAP_FAILED && *size > 0);
+
+	if (addr == MAP_FAILED) {
+		close(fd);
+		RTE_LOG(ERR, EAL, "Cannot get a virtual area\n");
+		return NULL;
+	}
+
+	munmap(addr, (*size) + mem_size);
+	close(fd);
+
+	/* align addr to a mem_size boundary */
+	aligned_addr = (uintptr_t)addr;
+	aligned_addr = RTE_ALIGN_CEIL(aligned_addr, mem_size);
+	addr = (void *)(aligned_addr);
+
+	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
+		addr, *size);
+
+	return addr;
+}
+
+/**
+ * Get memory size configuration from the sysfs file
+ * /sys/kernel/mm/dom0-mm/memsize-mB/memsize; the size unit is MB.
+ */
+static int
+get_xen_memory_size(void)
+{
+	char path[PATH_MAX];
+	unsigned long mem_size = 0;
+	static const char *file_name;
+	/* returns the size in MB, or -1 if eal_parse_sysfs_value() fails */
+	file_name = "memsize";
+	snprintf(path, sizeof(path), "%s/%s",
+			sys_dir_path, file_name);
+
+	if (eal_parse_sysfs_value(path, &mem_size) < 0)
+		return -1;
+	/* zero, odd or oversized values are reported through rte_exit() */
+	if (mem_size == 0)
+		rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s was not"
+			" configured.\n",sys_dir_path, file_name);
+	if (mem_size % 2)
+		rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s must be"
+			" even number.\n",sys_dir_path, file_name);
+
+	if (mem_size > DOM0_CONFIG_MEMSIZE)
+		rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s should not be larger"
+			" than %d mB\n",sys_dir_path, file_name, DOM0_CONFIG_MEMSIZE);
+
+	return mem_size;
+}
+
+/**
+ * Compute the Xen Dom0 machine address for a physical address in a memseg.
+ */
+phys_addr_t
+rte_xen_mem_phy2mch(uint32_t memseg_id, const phys_addr_t phy_addr)
+{
+	int mfn_id;
+	uint64_t mfn, mfn_offset;
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg *memseg = mcfg->memseg;
+	/* index of the 2M block that contains phy_addr within the segment */
+	mfn_id = (phy_addr - memseg[memseg_id].phys_addr) / RTE_PGSIZE_2M;
+
+	/* MFNs are contiguous within each 2M block */
+	mfn_offset = (phy_addr - memseg[memseg_id].phys_addr) %
+					RTE_PGSIZE_2M / PAGE_SIZE;
+	mfn = mfn_offset + memseg[memseg_id].mfn[mfn_id];
+
+	/* return machine address */
+	return mfn * PAGE_SIZE + phy_addr % PAGE_SIZE;
+}
+
+/*
+ * Get memory from the dom0_mm driver and mmap every segment it reports into
+ * mcfg->memseg[]. Returns 0 on success, -1 or -EIO on failure.
+ */
+int
+rte_xen_dom0_memory_init(void)
+{
+	void *vir_addr, *vma_addr = NULL;
+	int err, ret = 0;
+	uint32_t i, requested, mem_size, memseg_idx, num_memseg = 0;
+	size_t vma_len = 0;
+	struct memory_info meminfo;
+	struct memseg_info seginfo[RTE_MAX_MEMSEG];
+	int flags, page_size = getpagesize();
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg *memseg = mcfg->memseg;
+	uint64_t total_mem = internal_config.memory;
+
+	memset(seginfo, 0, sizeof(seginfo));
+	memset(&meminfo, 0, sizeof(struct memory_info));
+
+	ret = get_xen_memory_size();
+	if (ret < 0) /* size unavailable: -1 must not be used as a size */
+		return -1;
+	mem_size = ret;
+	requested = (unsigned) (total_mem / 0x100000);
+	if (requested > mem_size) /* cannot satisfy the requested amount */
+		rte_exit(EXIT_FAILURE,"Not enough memory available! Requested: %uMB,"
+				" available: %uMB\n", requested, mem_size);
+	else if (total_mem != 0)
+		mem_size = requested;
+
+	/* Check FD and open once */
+	if (xen_fd < 0) {
+		xen_fd = open(DOM0_MM_DEV, O_RDWR);
+		if (xen_fd < 0) {
+			RTE_LOG(ERR, EAL, "Can not open %s\n",DOM0_MM_DEV);
+			return -1;
+		}
+	}
+
+	meminfo.size = mem_size;
+	/* construct memory management name for Dom0 */
+	snprintf(meminfo.name, DOM0_NAME_MAX, "%s-%s",
+		internal_config.hugefile_prefix, DEFAUL_DOM0_NAME);
+
+	/* Notify kernel driver to allocate memory */
+	ret = ioctl(xen_fd, RTE_DOM0_IOCTL_PREPARE_MEMSEG, &meminfo);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memory\n");
+		err = -EIO;
+		goto fail;
+	}
+
+	/* Get number of memory segment from driver */
+	ret = ioctl(xen_fd, RTE_DOM0_IOCTL_GET_NUM_MEMSEG, &num_memseg);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memseg count.\n");
+		err = -EIO;
+		goto fail;
+	}
+
+	if (num_memseg > RTE_MAX_MEMSEG) {
+		RTE_LOG(ERR, EAL, "XEN DOM0: the memseg count %u is greater"
+			" than max memseg %d.\n",num_memseg, RTE_MAX_MEMSEG);
+		err = -EIO;
+		goto fail;
+	}
+
+	/* get all memory segments information */
+	ret = ioctl(xen_fd, RTE_DOM0_IOCTL_GET_MEMSEG_INFO, seginfo);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memseg info.\n");
+		err = -EIO;
+		goto fail;
+	}
+
+	/* map all memory segments to contiguous user space */
+	for (memseg_idx = 0; memseg_idx < num_memseg; memseg_idx++) {
+		vma_len = seginfo[memseg_idx].size;
+		/* get the biggest virtual memory area up to vma_len; on failure
+		 * (vma_addr == NULL) let the kernel choose the address */
+		vma_addr = xen_get_virtual_area(&vma_len, RTE_PGSIZE_2M);
+		if (vma_addr == NULL) {
+			flags = MAP_SHARED;
+			vma_len = RTE_PGSIZE_2M;
+		} else
+			flags = MAP_SHARED | MAP_FIXED;
+
+		seginfo[memseg_idx].size = vma_len;
+		vir_addr = mmap(vma_addr, seginfo[memseg_idx].size,
+			PROT_READ|PROT_WRITE, flags, xen_fd,
+			memseg_idx * page_size);
+		if (vir_addr == MAP_FAILED) {
+			RTE_LOG(ERR, EAL, "XEN DOM0:Could not mmap %s\n",
+				DOM0_MM_DEV);
+			err = -EIO;
+			goto fail;
+		}
+
+		memseg[memseg_idx].addr = vir_addr;
+		memseg[memseg_idx].phys_addr = page_size *
+			seginfo[memseg_idx].pfn ;
+		memseg[memseg_idx].len = seginfo[memseg_idx].size;
+		for ( i = 0; i < seginfo[memseg_idx].size / RTE_PGSIZE_2M; i++)
+			memseg[memseg_idx].mfn[i] = seginfo[memseg_idx].mfn[i];
+
+		/* MFNs are continuous in 2M, so assume that page size is 2M */
+		memseg[memseg_idx].hugepage_sz = RTE_PGSIZE_2M;
+		memseg[memseg_idx].nchannel = mcfg->nchannel;
+		memseg[memseg_idx].nrank = mcfg->nrank;
+
+		/* NUMA is not supported in Xen Dom0, so only set socket 0 */
+		memseg[memseg_idx].socket_id = 0;
+	}
+
+	return 0;
+fail:
+	if (xen_fd >= 0) {
+		close(xen_fd);
+		xen_fd = -1;
+	}
+	return err;
+}
+
+/*
+ * This creates the memory mappings in the secondary process to match that of
+ * the server process. It goes through each memory segment in the DPDK runtime
+ * configuration, mapping them in order to form a contiguous block in the
+ * virtual memory space. The descriptor is kept in the file-scope xen_fd.
+ * Returns 0 on success, -1 on failure.
+ */
+int
+rte_xen_dom0_memory_attach(void)
+{
+	const struct rte_mem_config *mcfg;
+	unsigned s = 0; /* s used to track the segment number */
+	int ret = -1;
+	void *vir_addr;
+	char name[DOM0_NAME_MAX] = {0};
+	int page_size = getpagesize();
+
+	mcfg = rte_eal_get_configuration()->mem_config;
+
+	/* Check FD and open once */
+	if (xen_fd < 0) {
+		xen_fd = open(DOM0_MM_DEV, O_RDWR);
+		if (xen_fd < 0) {
+			RTE_LOG(ERR, EAL, "Can not open %s\n",DOM0_MM_DEV);
+			goto error;
+		}
+	}
+
+	/* construct memory management name for Dom0 */
+	snprintf(name, DOM0_NAME_MAX, "%s-%s",
+		internal_config.hugefile_prefix, DEFAUL_DOM0_NAME);
+	/* attach to memory segments of primary process */
+	ret = ioctl(xen_fd, RTE_DOM0_IOCTL_ATTACH_TO_MEMSEG, name);
+	if (ret) {
+		RTE_LOG(ERR, EAL,"attach memory segments fail.\n");
+		goto error;
+	}
+
+	/* map all segments into memory to make sure we get the addrs */
+	for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
+
+		/*
+		 * the first memory segment with len==0 is the one that
+		 * follows the last valid segment.
+		 */
+		if (mcfg->memseg[s].len == 0)
+			break;
+
+		vir_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
+				PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, xen_fd,
+				s * page_size);
+		if (vir_addr == MAP_FAILED) {
+			RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
+				"in %s to requested address [%p]\n",
+				(unsigned long long)mcfg->memseg[s].len, DOM0_MM_DEV,
+				mcfg->memseg[s].addr);
+			goto error;
+		}
+	}
+	return 0;
+
+error:
+	if (xen_fd >= 0) {
+		close(xen_fd);
+		xen_fd = -1;
+	}
+	return -1;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h
new file mode 100644
index 00000000..d9707780
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h
@@ -0,0 +1,108 @@
+/*-
+ *   This file is provided under a dual BSD/LGPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GNU LESSER GENERAL PUBLIC LICENSE
+ *
+ *   Copyright(c) 2007-2014 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2.1 of the GNU Lesser General Public License
+ *   as published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ *   Contact Information:
+ *   Intel Corporation
+ *
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _RTE_DOM0_COMMON_H_
+#define _RTE_DOM0_COMMON_H_
+
+#ifdef __KERNEL__
+#include <linux/if.h>
+#endif
+
+#define DOM0_NAME_MAX   256
+#define DOM0_MM_DEV   "/dev/dom0_mm"
+
+#define DOM0_CONTIG_NUM_ORDER       9       /**< order of 2M */
+#define DOM0_NUM_MEMSEG             512     /**< Maximum nb. of memory segment. */
+#define DOM0_MEMBLOCK_SIZE          0x200000 /**< size of memory block(2M). */
+#define DOM0_CONFIG_MEMSIZE         4096     /**< Maximum config memory size(4G). */
+#define DOM0_NUM_MEMBLOCK (DOM0_CONFIG_MEMSIZE / 2) /**< Maximum nb. of 2M memory block. */
+
+#define RTE_DOM0_IOCTL_PREPARE_MEMSEG    _IOWR(0, 1 , struct memory_info)
+#define RTE_DOM0_IOCTL_ATTACH_TO_MEMSEG  _IOWR(0, 2 , char *)
+#define RTE_DOM0_IOCTL_GET_NUM_MEMSEG    _IOWR(0, 3, int)
+#define RTE_DOM0_IOCTL_GET_MEMSEG_INFO   _IOWR(0, 4, void *)
+
+/**
+ * Argument of RTE_DOM0_IOCTL_PREPARE_MEMSEG: names and sizes the request.
+ */
+struct memory_info {
+	char name[DOM0_NAME_MAX]; /**< Name of the memory area. */
+	uint64_t size; /**< Size; the EAL fills it in MB. */
+};
+
+/**
+ * A structure used to store memory segment information.
+ */
+struct memseg_info {
+	uint32_t idx; /**< NOTE(review): presumably the segment index. */
+	uint64_t pfn; /**< Page frame number; EAL uses pfn * page size. */
+	uint64_t size; /**< Segment size in bytes (used as mmap length). */
+	uint64_t mfn[DOM0_NUM_MEMBLOCK]; /**< One MFN per 2M block. */
+};
+
+/**
+ * A structure used to store memory block information.
+ */
+struct memblock_info {
+	uint8_t exchange_flag; /* NOTE(review): meaning not visible here */
+	uint8_t used; /* NOTE(review): presumably an in-use marker */
+	uint64_t vir_addr; /* presumably the block's virtual address */
+	uint64_t pfn; /* presumably the block's page frame number */
+	uint64_t mfn; /* presumably the block's machine frame number */
+};
+#endif /* _RTE_DOM0_COMMON_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
new file mode 100644
index 00000000..3dacbff8
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -0,0 +1,228 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_INTERRUPTS_H_
+#error "don't include this file directly, please include generic <rte_interrupts.h>"
+#endif
+
+#ifndef _RTE_LINUXAPP_INTERRUPTS_H_
+#define _RTE_LINUXAPP_INTERRUPTS_H_
+
+#define RTE_MAX_RXTX_INTR_VEC_ID     32  /**< entries in rte_intr_handle efds[] and elist[] */
+#define RTE_INTR_VEC_ZERO_OFFSET      0  /**< NOTE(review): use not visible in this header */
+#define RTE_INTR_VEC_RXTX_OFFSET      1  /**< NOTE(review): use not visible in this header */
+
+enum rte_intr_handle_type {
+	RTE_INTR_HANDLE_UNKNOWN = 0,  /**< zero value, no known handle type */
+	RTE_INTR_HANDLE_UIO,          /**< uio device handle */
+	RTE_INTR_HANDLE_UIO_INTX,     /**< uio generic handle */
+	RTE_INTR_HANDLE_VFIO_LEGACY,  /**< vfio device handle (legacy) */
+	RTE_INTR_HANDLE_VFIO_MSI,     /**< vfio device handle (MSI) */
+	RTE_INTR_HANDLE_VFIO_MSIX,    /**< vfio device handle (MSIX) */
+	RTE_INTR_HANDLE_ALARM,        /**< alarm handle */
+	RTE_INTR_HANDLE_EXT,          /**< external handler */
+	RTE_INTR_HANDLE_MAX           /**< last enumerator; equals the number of types above */
+};
+
+#define RTE_INTR_EVENT_ADD            1UL  /**< presumably the ADD op of rte_intr_rx_ctl() - confirm */
+#define RTE_INTR_EVENT_DEL            2UL  /**< presumably the DEL op of rte_intr_rx_ctl() - confirm */
+
+typedef void (*rte_intr_event_cb_t)(int fd, void *arg); /**< callback type of rte_epoll_data.cb_fun */
+
+struct rte_epoll_data {
+	uint32_t event;               /**< event type */
+	void *data;                   /**< User data */
+	rte_intr_event_cb_t cb_fun;   /**< IN: callback function */
+	void *cb_arg;	              /**< IN: callback argument */
+};
+
+enum { /* NOTE(review): presumably the values of rte_epoll_event.status - confirm */
+	RTE_EPOLL_INVALID = 0,
+	RTE_EPOLL_VALID,
+	RTE_EPOLL_EXEC,
+};
+
+/** interrupt epoll event object, taken by epoll_event.ptr */
+struct rte_epoll_event {
+	volatile uint32_t status;  /**< OUT: event status */
+	int fd;                    /**< OUT: event fd */
+	int epfd;       /**< OUT: epoll instance the event is associated with */
+	struct rte_epoll_data epdata;  /**< event type, user data and callback */
+};
+
+/** Handle for interrupts. */
+struct rte_intr_handle {
+	union { /* anonymous: both fds share the same storage */
+		int vfio_dev_fd;  /**< VFIO device file descriptor */
+		int uio_cfg_fd;  /**< UIO config file descriptor
+					for uio_pci_generic */
+	};
+	int fd;	 /**< interrupt event file descriptor */
+	enum rte_intr_handle_type type;  /**< handle type */
+	uint32_t max_intr;             /**< max interrupt requested */
+	uint32_t nb_efd;               /**< number of available efds (event fds) */
+	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
+	struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
+				       /**< epoll events of the intr vectors */
+	int *intr_vec;                 /**< intr vector number array */
+};
+
+#define RTE_EPOLL_PER_THREAD        -1  /**< epfd value that hints at using the per thread epfd */
+
+/**
+ * Wait for events on the epoll instance.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller waits for events.
+ * @param events
+ *   Memory area that contains the events made available to the caller.
+ * @param maxevents
+ *   Up to maxevents events are returned; must be greater than zero.
+ * @param timeout
+ *   Specifying a timeout of -1 causes the call to block indefinitely.
+ *   Specifying a timeout equal to zero causes it to return immediately.
+ * @return
+ *   - On success, returns the number of available events.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout);
+
+/**
+ * Perform a control operation on the epoll instance referred to by epfd.
+ * It requests that the operation op be performed for the target fd.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller performs control operations.
+ * @param op
+ *   The operation to be performed for the target fd.
+ * @param fd
+ *   The target fd on which the control operation is performed.
+ * @param event
+ *   Describes the object linked to the fd.
+ *   Note: The caller must take care of deleting the object after CTL_DEL.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event);
+
+/**
+ * Return the per thread epoll instance.
+ *
+ * @return
+ *   The epfd referring to the per thread epoll instance.
+ */
+int
+rte_intr_tls_epfd(void);
+
+/** Perform operation op (ADD or DEL) for an RX interrupt vector on an epoll instance.
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param epfd
+ *   Epoll instance fd which the interrupt vector is associated with.
+ * @param op
+ *   The operation to be performed for the vector.
+ *   Operation type of {ADD, DEL}.
+ * @param vec
+ *   RX interrupt vector number added to the epoll instance wait list.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec, void *data);
+
+/**
+ * Enable the packet I/O interrupt event if it is necessary.
+ * It creates an event fd for each interrupt vector when MSIX is used,
+ * otherwise it multiplexes a single event fd.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param nb_efd
+ *   Number of interrupt vectors the caller is trying to enable.
+ *   The value 0 is not allowed.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
+
+/**
+ * Disable the packet I/O interrupt event.
+ * It deletes the registered eventfds and closes the open fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
+
+/**
+ * Tell whether the packet I/O interrupt on the datapath is enabled or not.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle);
+
+/**
+ * Tell whether the interrupt handle instance allows other causes or not.
+ * Other causes stand for any non packet I/O interrupts.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle);
+
+/**
+ * Report the multiple interrupt vector capability of the interrupt handle instance.
+ * It returns zero if multiple interrupt vectors are not supported.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+int
+rte_intr_cap_multiple(struct rte_intr_handle *intr_handle);
+
+#endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
new file mode 100644
index 00000000..7e5e5984
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -0,0 +1,172 @@
+/*-
+ *   This file is provided under a dual BSD/LGPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GNU LESSER GENERAL PUBLIC LICENSE
+ *
+ *   Copyright(c) 2007-2014 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2.1 of the GNU Lesser General Public License
+ *   as published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ *   Contact Information:
+ *   Intel Corporation
+ *
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _RTE_KNI_COMMON_H_
+#define _RTE_KNI_COMMON_H_
+
+#ifdef __KERNEL__
+#include <linux/if.h>
+#endif
+
+/**
+ * KNI name is part of memzone name; this sizes rte_kni_device_info.name.
+ */
+#define RTE_KNI_NAMESIZE 32
+
+#define RTE_CACHE_LINE_MIN_SIZE 64 /**< alignment applied to pad3 in struct rte_kni_mbuf */
+
+/*
+ * Request id, see the req_id member of struct rte_kni_request.
+ */
+enum rte_kni_req_id {
+	RTE_KNI_REQ_UNKNOWN = 0,
+	RTE_KNI_REQ_CHANGE_MTU,      /**< presumably uses rte_kni_request.new_mtu - confirm */
+	RTE_KNI_REQ_CFG_NETWORK_IF,  /**< presumably uses rte_kni_request.if_up - confirm */
+	RTE_KNI_REQ_MAX,             /**< last enumerator; equals the number of ids above */
+};
+
+/*
+ * Structure for a KNI request.
+ */
+struct rte_kni_request {
+	uint32_t req_id;             /**< Request id */
+	union { /* anonymous: new_mtu and if_up share the same storage */
+		uint32_t new_mtu;    /**< New MTU */
+		uint8_t if_up;       /**< 1: interface up, 0: interface down */
+	};
+	int32_t result;               /**< Result of processing the request */
+} __attribute__((__packed__)); /* packed: members are laid out without padding */
+
+/*
+ * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO.
+ * Write and read should wrap around. Fifo is empty when write == read.
+ * Writing should never overwrite the read position.
+ */
+struct rte_kni_fifo {
+	volatile unsigned write;     /**< Next position to be written */
+	volatile unsigned read;      /**< Next position to be read */
+	unsigned len;                /**< Circular buffer length */
+	unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
+	void * volatile buffer[0];   /**< mbuf pointers; zero-length trailing array */
+};
+
+/*
+ * The kernel image of the rte_mbuf struct, with only the relevant fields.
+ * The padN members are necessary to assure the offsets of these fields.
+ */
+struct rte_kni_mbuf {
+	void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); /* RTE_CACHE_LINE_SIZE is not defined in this header */
+	char pad0[10];
+	uint16_t data_off;      /**< Start address of data in segment buffer. */
+	char pad1[4];
+	uint64_t ol_flags;      /**< Offload features. */
+	char pad2[4];
+	uint32_t pkt_len;       /**< Total pkt len: sum of all segment data_len. */
+	uint16_t data_len;      /**< Amount of data in segment buffer. */
+
+	/* fields on second cache line; pad3 is aligned to RTE_CACHE_LINE_MIN_SIZE */
+	char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)));
+	void *pool;             /* NOTE(review): presumably the owning mempool - confirm */
+	void *next;             /* NOTE(review): presumably the next segment - confirm */
+};
+
+/*
+ * Struct used to create a KNI device. Argument of RTE_KNI_IOCTL_CREATE/RELEASE.
+ */
+
+struct rte_kni_device_info {
+	char name[RTE_KNI_NAMESIZE];  /**< Network device name for KNI */
+
+	phys_addr_t tx_phys;
+	phys_addr_t rx_phys;
+	phys_addr_t alloc_phys;
+	phys_addr_t free_phys;
+
+	/* Used by Ethtool */
+	phys_addr_t req_phys;
+	phys_addr_t resp_phys;
+	phys_addr_t sync_phys;
+	void * sync_va;
+
+	/* mbuf mempool */
+	void * mbuf_va;
+	phys_addr_t mbuf_phys;
+
+	/* PCI info */
+	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
+	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
+	uint8_t bus;                  /**< Device bus */
+	uint8_t devid;                /**< Device ID (presumably the device number on the bus, unlike device_id - confirm) */
+	uint8_t function;             /**< Device function. */
+
+	uint16_t group_id;            /**< Group ID */
+	uint32_t core_id;             /**< core ID to bind for kernel thread */
+
+	uint8_t force_bind : 1;       /**< Flag for kernel thread binding (1-bit field) */
+
+	/* mbuf size */
+	unsigned mbuf_size;
+};
+
+#define KNI_DEVICE "kni" /**< name string of the KNI device */
+
+#define RTE_KNI_IOCTL_TEST    _IOWR(0, 1, int)
+#define RTE_KNI_IOCTL_CREATE  _IOWR(0, 2, struct rte_kni_device_info) /**< arg: device to create */
+#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) /**< arg: device to release */
+
+#endif /* _RTE_KNI_COMMON_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
new file mode 100644
index 00000000..12503efa
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -0,0 +1,156 @@
+DPDK_2.0 {
+	global:
+
+	__rte_panic;
+	devargs_list;
+	eal_parse_sysfs_value;
+	eal_timer_source;
+	lcore_config;
+	pci_device_list;
+	pci_driver_list;
+	per_lcore__lcore_id;
+	per_lcore__rte_errno;
+	rte_calloc;
+	rte_calloc_socket;
+	rte_cpu_check_supported;
+	rte_cpu_get_flag_enabled;
+	rte_cycles_vmware_tsc_map;
+	rte_delay_us;
+	rte_dump_physmem_layout;
+	rte_dump_registers;
+	rte_dump_stack;
+	rte_dump_tailq;
+	rte_eal_alarm_cancel;
+	rte_eal_alarm_set;
+	rte_eal_dev_init;
+	rte_eal_devargs_add;
+	rte_eal_devargs_dump;
+	rte_eal_devargs_type_count;
+	rte_eal_driver_register;
+	rte_eal_driver_unregister;
+	rte_eal_get_configuration;
+	rte_eal_get_lcore_state;
+	rte_eal_get_physmem_layout;
+	rte_eal_get_physmem_size;
+	rte_eal_has_hugepages;
+	rte_eal_hpet_init;
+	rte_eal_init;
+	rte_eal_iopl_init;
+	rte_eal_lcore_role;
+	rte_eal_mp_remote_launch;
+	rte_eal_mp_wait_lcore;
+	rte_eal_parse_devargs_str;
+	rte_eal_pci_dump;
+	rte_eal_pci_probe;
+	rte_eal_pci_probe_one;
+	rte_eal_pci_register;
+	rte_eal_pci_scan;
+	rte_eal_pci_unregister;
+	rte_eal_process_type;
+	rte_eal_remote_launch;
+	rte_eal_tailq_lookup;
+	rte_eal_tailq_register;
+	rte_eal_vdev_init;
+	rte_eal_vdev_uninit;
+	rte_eal_wait_lcore;
+	rte_exit;
+	rte_free;
+	rte_get_hpet_cycles;
+	rte_get_hpet_hz;
+	rte_get_log_level;
+	rte_get_log_type;
+	rte_get_tsc_hz;
+	rte_hexdump;
+	rte_intr_callback_register;
+	rte_intr_callback_unregister;
+	rte_intr_disable;
+	rte_intr_enable;
+	rte_log;
+	rte_log_add_in_history;
+	rte_log_cur_msg_loglevel;
+	rte_log_cur_msg_logtype;
+	rte_log_dump_history;
+	rte_log_set_history;
+	rte_logs;
+	rte_malloc;
+	rte_malloc_dump_stats;
+	rte_malloc_get_socket_stats;
+	rte_malloc_set_limit;
+	rte_malloc_socket;
+	rte_malloc_validate;
+	rte_malloc_virt2phy;
+	rte_mem_lock_page;
+	rte_mem_phy2mch;
+	rte_mem_virt2phy;
+	rte_memdump;
+	rte_memory_get_nchannel;
+	rte_memory_get_nrank;
+	rte_memzone_dump;
+	rte_memzone_lookup;
+	rte_memzone_reserve;
+	rte_memzone_reserve_aligned;
+	rte_memzone_reserve_bounded;
+	rte_memzone_walk;
+	rte_openlog_stream;
+	rte_realloc;
+	rte_set_application_usage_hook;
+	rte_set_log_level;
+	rte_set_log_type;
+	rte_socket_id;
+	rte_strerror;
+	rte_strsplit;
+	rte_sys_gettid;
+	rte_thread_get_affinity;
+	rte_thread_set_affinity;
+	rte_vlog;
+	rte_xen_dom0_memory_attach;
+	rte_xen_dom0_memory_init;
+	rte_zmalloc;
+	rte_zmalloc_socket;
+
+	local: *;
+};
+
+DPDK_2.1 {
+	global:
+
+	rte_eal_pci_detach;
+	rte_eal_pci_read_config;
+	rte_eal_pci_write_config;
+	rte_epoll_ctl;
+	rte_epoll_wait;
+	rte_intr_allow_others;
+	rte_intr_dp_is_en;
+	rte_intr_efd_disable;
+	rte_intr_efd_enable;
+	rte_intr_rx_ctl;
+	rte_intr_tls_epfd;
+	rte_memzone_free;
+
+} DPDK_2.0;
+
+DPDK_2.2 {
+	global:
+
+	rte_intr_cap_multiple;
+	rte_keepalive_create;
+	rte_keepalive_dispatch_pings;
+	rte_keepalive_mark_alive;
+	rte_keepalive_register_core;
+	rte_xen_dom0_supported;
+
+} DPDK_2.1;
+
+DPDK_16.04 {
+	global:
+
+	rte_cpu_get_flag_name;
+	rte_eal_pci_ioport_map;
+	rte_eal_pci_ioport_read;
+	rte_eal_pci_ioport_unmap;
+	rte_eal_pci_ioport_write;
+	rte_eal_pci_map_device;
+	rte_eal_pci_unmap_device;
+	rte_eal_primary_proc_alive;
+
+} DPDK_2.2;
author	C.J. Collier <cjcollier@linuxfoundation.org>	2016-06-14 07:50:17 -0700
committer	C.J. Collier <cjcollier@linuxfoundation.org>	2016-06-14 12:17:54 -0700
commit	97f17497d162afdb82c8704bf097f0fee3724b2e (patch)
tree	1c6269614c0c15ffef8451c58ae8f8b30a1bc804 /lib/librte_eal/linuxapp/eal
parent	e04be89c2409570e0055b2cda60bd11395bb93b0 (diff)