New upstream version 18.05

Change-Id: Icd4170ddc4f63aeae5d0559490e5195b5349f9c2 Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
author: Christian Ehrhardt <christian.ehrhardt@canonical.com> 2018-06-01 09:09:08 +0200
committer: Christian Ehrhardt <christian.ehrhardt@canonical.com> 2018-06-01 09:12:07 +0200
commit: 1bd9b61222f3a81ffe770fc00b70ded6e760c42b (patch)
tree: 0bf7d996cf0664796687c1be6d22958fcf6a8096 /lib/librte_eal/linuxapp/eal
parent: bb4e158029645f37809fcf81a3acddd6fa11f88a (diff)
14 files changed, 4164 insertions, 1140 deletions
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 7e5bbe88..3719ec9d 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH)
 EXPORT_MAP := ../../rte_eal_version.map
 VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
 
-LIBABIVER := 6
+LIBABIVER := 7
 
 VPATH += $(RTE_SDK)/lib/librte_eal/common
 
@@ -30,17 +30,20 @@ endif
 
 # specific to linuxapp exec-env
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_cpuflags.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c
 
 # from common dir
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c
@@ -48,6 +51,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c
@@ -61,9 +65,11 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
@@ -81,6 +87,7 @@ CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
 CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
 CFLAGS_eal_timer.o := -D_GNU_SOURCE
 CFLAGS_eal_lcore.o := -D_GNU_SOURCE
+CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
 CFLAGS_eal_thread.o := -D_GNU_SOURCE
 CFLAGS_eal_log.o := -D_GNU_SOURCE
 CFLAGS_eal_common_log.o := -D_GNU_SOURCE
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 38306bf5..8655b869 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -74,8 +74,8 @@ static int mem_cfg_fd = -1;
 static struct flock wr_lock = {
 		.l_type = F_WRLCK,
 		.l_whence = SEEK_SET,
-		.l_start = offsetof(struct rte_mem_config, memseg),
-		.l_len = sizeof(early_mem_config.memseg),
+		.l_start = offsetof(struct rte_mem_config, memsegs),
+		.l_len = sizeof(early_mem_config.memsegs),
 };
 
 /* Address of global and public configuration */
@@ -92,6 +92,68 @@ struct internal_config internal_config;
 /* used by rte_rdtsc() */
 int rte_cycles_vmware_tsc_map;
 
+/* platform-specific runtime dir */
+static char runtime_dir[PATH_MAX];
+
+static const char *default_runtime_dir = "/var/run";
+
+int
+eal_create_runtime_dir(void)
+{
+	const char *directory = default_runtime_dir;
+	const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR");
+	const char *fallback = "/tmp";
+	char tmp[PATH_MAX];
+	int ret;
+
+	if (getuid() != 0) {
+		/* try XDG path first, fall back to /tmp */
+		if (xdg_runtime_dir != NULL)
+			directory = xdg_runtime_dir;
+		else
+			directory = fallback;
+	}
+	/* create DPDK subdirectory under runtime dir */
+	ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory);
+	if (ret < 0 || ret == sizeof(tmp)) {
+		RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n");
+		return -1;
+	}
+
+	/* create prefix-specific subdirectory under DPDK runtime dir */
+	ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s",
+			tmp, internal_config.hugefile_prefix);
+	if (ret < 0 || ret == sizeof(runtime_dir)) {
+		RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n");
+		return -1;
+	}
+
+	/* create the path if it doesn't exist. no "mkdir -p" here, so do it
+	 * step by step.
+	 */
+	ret = mkdir(tmp, 0700);
+	if (ret < 0 && errno != EEXIST) {
+		RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
+			tmp, strerror(errno));
+		return -1;
+	}
+
+	ret = mkdir(runtime_dir, 0700);
+	if (ret < 0 && errno != EEXIST) {
+		RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
+			runtime_dir, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+const char *
+eal_get_runtime_dir(void)
+{
+	return runtime_dir;
+}
+
 /* Return user provided mbuf pool ops name */
 const char * __rte_experimental
 rte_eal_mbuf_user_pool_ops(void)
@@ -348,6 +410,8 @@ eal_usage(const char *prgname)
 	       "  --"OPT_BASE_VIRTADDR"     Base virtual address\n"
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
+	       "  --"OPT_LEGACY_MEM"        Legacy memory mode (no dynamic allocation, contiguous segments)\n"
+	       "  --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
@@ -591,7 +655,8 @@ eal_parse_args(int argc, char **argv)
 			break;
 
 		case OPT_MBUF_POOL_OPS_NAME_NUM:
-			internal_config.user_mbuf_pool_ops_name = optarg;
+			internal_config.user_mbuf_pool_ops_name =
+			    strdup(optarg);
 			break;
 
 		default:
@@ -638,23 +703,23 @@ out:
 	return ret;
 }
 
+static int
+check_socket(const struct rte_memseg_list *msl, void *arg)
+{
+	int *socket_id = arg;
+
+	return *socket_id == msl->socket_id;
+}
+
 static void
 eal_check_mem_on_local_socket(void)
 {
-	const struct rte_memseg *ms;
-	int i, socket_id;
+	int socket_id;
 
 	socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
 
-	ms = rte_eal_get_physmem_layout();
-
-	for (i = 0; i < RTE_MAX_MEMSEG; i++)
-		if (ms[i].socket_id == socket_id &&
-				ms[i].len > 0)
-			return;
-
-	RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
-			"memory on local socket!\n");
+	if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
+		RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
 }
 
 static int
@@ -669,6 +734,8 @@ rte_eal_mcfg_complete(void)
 	/* ALL shared mem_config related INIT DONE */
 	if (rte_config.process_type == RTE_PROC_PRIMARY)
 		rte_config.mem_config->magic = RTE_MAGIC;
+
+	internal_config.init_complete = 1;
 }
 
 /*
@@ -689,24 +756,8 @@ rte_eal_iopl_init(void)
 #ifdef VFIO_PRESENT
 static int rte_eal_vfio_setup(void)
 {
-	int vfio_enabled = 0;
-
 	if (rte_vfio_enable("vfio"))
 		return -1;
-	vfio_enabled = rte_vfio_is_enabled("vfio");
-
-	if (vfio_enabled) {
-
-		/* if we are primary process, create a thread to communicate with
-		 * secondary processes. the thread will use a socket to wait for
-		 * requests from secondary process to send open file descriptors,
-		 * because VFIO does not allow multiple open descriptors on a group or
-		 * VFIO container.
-		 */
-		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_mp_sync_setup() < 0)
-			return -1;
-	}
 
 	return 0;
 }
@@ -766,6 +817,13 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	/* create runtime data directory */
+	if (eal_create_runtime_dir() < 0) {
+		rte_eal_init_alert("Cannot create runtime directory\n");
+		rte_errno = EACCES;
+		return -1;
+	}
+
 	if (eal_plugins_init() < 0) {
 		rte_eal_init_alert("Cannot init plugins\n");
 		rte_errno = EINVAL;
@@ -779,6 +837,19 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	rte_config_init();
+
+	/* Put mp channel init before bus scan so that we can init the vdev
+	 * bus through mp channel in the secondary process before the bus scan.
+	 */
+	if (rte_mp_channel_init() < 0) {
+		rte_eal_init_alert("failed to init mp channel\n");
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			rte_errno = EFAULT;
+			return -1;
+		}
+	}
+
 	if (rte_bus_scan()) {
 		rte_eal_init_alert("Cannot scan the buses for devices\n");
 		rte_errno = ENODEV;
@@ -798,13 +869,17 @@ rte_eal_init(int argc, char **argv)
 			"KNI module inserted\n");
 	}
 
-	if (internal_config.no_hugetlbfs == 0 &&
-			internal_config.process_type != RTE_PROC_SECONDARY &&
-			eal_hugepage_info_init() < 0) {
-		rte_eal_init_alert("Cannot get hugepage information.");
-		rte_errno = EACCES;
-		rte_atomic32_clear(&run_once);
-		return -1;
+	if (internal_config.no_hugetlbfs == 0) {
+		/* rte_config isn't initialized yet */
+		ret = internal_config.process_type == RTE_PROC_PRIMARY ?
+				eal_hugepage_info_init() :
+				eal_hugepage_info_read();
+		if (ret < 0) {
+			rte_eal_init_alert("Cannot get hugepage information.");
+			rte_errno = EACCES;
+			rte_atomic32_clear(&run_once);
+			return -1;
+		}
 	}
 
 	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
@@ -825,8 +900,6 @@ rte_eal_init(int argc, char **argv)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
-
 	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
 		rte_eal_init_alert("Cannot init logging.");
 		rte_errno = ENOMEM;
@@ -834,14 +907,6 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
-	if (rte_mp_channel_init() < 0) {
-		rte_eal_init_alert("failed to init mp channel\n");
-		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
-			rte_errno = EFAULT;
-			return -1;
-		}
-	}
-
 #ifdef VFIO_PRESENT
 	if (rte_eal_vfio_setup() < 0) {
 		rte_eal_init_alert("Cannot init VFIO\n");
@@ -850,6 +915,15 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 #endif
+	/* in secondary processes, memory init may allocate additional fbarrays
+	 * not present in primary processes, so to avoid any potential issues,
+	 * initialize memzones first.
+	 */
+	if (rte_eal_memzone_init() < 0) {
+		rte_eal_init_alert("Cannot init memzone\n");
+		rte_errno = ENODEV;
+		return -1;
+	}
 
 	if (rte_eal_memory_init() < 0) {
 		rte_eal_init_alert("Cannot init memory\n");
@@ -860,8 +934,8 @@ rte_eal_init(int argc, char **argv)
 	/* the directories are locked during eal_hugepage_info_init */
 	eal_hugedirs_unlock();
 
-	if (rte_eal_memzone_init() < 0) {
-		rte_eal_init_alert("Cannot init memzone\n");
+	if (rte_eal_malloc_heap_init() < 0) {
+		rte_eal_init_alert("Cannot init malloc heap\n");
 		rte_errno = ENODEV;
 		return -1;
 	}
@@ -888,7 +962,7 @@ rte_eal_init(int argc, char **argv)
 
 	eal_thread_init_master(rte_config.master_lcore);
 
-	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
+	ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
 
 	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
 		rte_config.master_lcore, (int)thread_id, cpuset,
@@ -919,7 +993,7 @@ rte_eal_init(int argc, char **argv)
 			rte_panic("Cannot create thread\n");
 
 		/* Set thread_name for aid in debugging. */
-		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+		snprintf(thread_name, sizeof(thread_name),
 			"lcore-slave-%d", i);
 		ret = rte_thread_setname(lcore_config[i].thread_id,
 						thread_name);
@@ -950,6 +1024,12 @@ rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+#ifdef VFIO_PRESENT
+	/* Register mp action after probe() so that we got enough info */
+	if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+		return -1;
+#endif
+
 	/* initialize default service/lcore mappings and start running. Ignore
 	 * -ENOTSUP, as it indicates no service coremask passed to EAL.
 	 */
diff --git a/lib/librte_eal/linuxapp/eal/eal_cpuflags.c b/lib/librte_eal/linuxapp/eal/eal_cpuflags.c
new file mode 100644
index 00000000..d38296e1
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_cpuflags.c
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 Red Hat, Inc.
+ */
+
+#include <elf.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 16)
+#include <sys/auxv.h>
+#define HAS_AUXV 1
+#endif
+#endif
+
+#include <rte_cpuflags.h>
+
+#ifndef HAS_AUXV
+static unsigned long
+getauxval(unsigned long type __rte_unused)
+{
+	errno = ENOTSUP;
+	return 0;
+}
+#endif
+
+#ifdef RTE_ARCH_64
+typedef Elf64_auxv_t Internal_Elfx_auxv_t;
+#else
+typedef Elf32_auxv_t Internal_Elfx_auxv_t;
+#endif
+
+/**
+ * Provides a method for retrieving values from the auxiliary vector and
+ * possibly running a string comparison.
+ *
+ * @return Always returns a result.  When the result is 0, check errno
+ * to see if an error occurred during processing.
+ */
+static unsigned long
+_rte_cpu_getauxval(unsigned long type, const char *str)
+{
+	unsigned long val;
+
+	errno = 0;
+	val = getauxval(type);
+
+	if (!val && (errno == ENOTSUP || errno == ENOENT)) {
+		int auxv_fd = open("/proc/self/auxv", O_RDONLY);
+		Internal_Elfx_auxv_t auxv;
+
+		if (auxv_fd == -1)
+			return 0;
+
+		errno = ENOENT;
+		while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) {
+			if (auxv.a_type == type) {
+				errno = 0;
+				val = auxv.a_un.a_val;
+				if (str)
+					val = strcmp((const char *)val, str);
+				break;
+			}
+		}
+		close(auxv_fd);
+	}
+
+	return val;
+}
+
+unsigned long
+rte_cpu_getauxval(unsigned long type)
+{
+	return _rte_cpu_getauxval(type, NULL);
+}
+
+int
+rte_cpu_strcmp_auxval(unsigned long type, const char *str)
+{
+	return _rte_cpu_getauxval(type, str);
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c
new file mode 100644
index 00000000..1cf6aebf
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+
+#include <rte_string_fns.h>
+#include <rte_log.h>
+#include <rte_compat.h>
+#include <rte_dev.h>
+#include <rte_malloc.h>
+#include <rte_interrupts.h>
+#include <rte_alarm.h>
+
+#include "eal_private.h"
+
+static struct rte_intr_handle intr_handle = {.fd = -1 };
+static bool monitor_started;
+
+#define EAL_UEV_MSG_LEN 4096
+#define EAL_UEV_MSG_ELEM_LEN 128
+
+static void dev_uev_handler(__rte_unused void *param);
+
+/* identify the system layer which reports this event. */
+enum eal_dev_event_subsystem {
+	EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */
+	EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */
+	EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */
+	EAL_DEV_EVENT_SUBSYSTEM_MAX
+};
+
+static int
+dev_uev_socket_fd_create(void)
+{
+	struct sockaddr_nl addr;
+	int ret;
+
+	intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
+			SOCK_NONBLOCK,
+			NETLINK_KOBJECT_UEVENT);
+	if (intr_handle.fd < 0) {
+		RTE_LOG(ERR, EAL, "create uevent fd failed.\n");
+		return -1;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.nl_family = AF_NETLINK;
+	addr.nl_pid = 0;
+	addr.nl_groups = 0xffffffff;
+
+	ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr));
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n");
+		goto err;
+	}
+
+	return 0;
+err:
+	close(intr_handle.fd);
+	intr_handle.fd = -1;
+	return ret;
+}
+
+static int
+dev_uev_parse(const char *buf, struct rte_dev_event *event, int length)
+{
+	char action[EAL_UEV_MSG_ELEM_LEN];
+	char subsystem[EAL_UEV_MSG_ELEM_LEN];
+	char pci_slot_name[EAL_UEV_MSG_ELEM_LEN];
+	int i = 0;
+
+	memset(action, 0, EAL_UEV_MSG_ELEM_LEN);
+	memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN);
+	memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN);
+
+	while (i < length) {
+		for (; i < length; i++) {
+			if (*buf)
+				break;
+			buf++;
+		}
+		/**
+		 * check device uevent from kernel side, no need to check
+		 * uevent from udev.
+		 */
+		if (!strncmp(buf, "libudev", 7)) {
+			buf += 7;
+			i += 7;
+			return -1;
+		}
+		if (!strncmp(buf, "ACTION=", 7)) {
+			buf += 7;
+			i += 7;
+			strlcpy(action, buf, sizeof(action));
+		} else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+			buf += 10;
+			i += 10;
+			strlcpy(subsystem, buf, sizeof(subsystem));
+		} else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
+			buf += 14;
+			i += 14;
+			strlcpy(pci_slot_name, buf, sizeof(subsystem));
+			event->devname = strdup(pci_slot_name);
+		}
+		for (; i < length; i++) {
+			if (*buf == '\0')
+				break;
+			buf++;
+		}
+	}
+
+	/* parse the subsystem layer */
+	if (!strncmp(subsystem, "uio", 3))
+		event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO;
+	else if (!strncmp(subsystem, "pci", 3))
+		event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI;
+	else if (!strncmp(subsystem, "vfio", 4))
+		event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO;
+	else
+		return -1;
+
+	/* parse the action type */
+	if (!strncmp(action, "add", 3))
+		event->type = RTE_DEV_EVENT_ADD;
+	else if (!strncmp(action, "remove", 6))
+		event->type = RTE_DEV_EVENT_REMOVE;
+	else
+		return -1;
+	return 0;
+}
+
+static void
+dev_delayed_unregister(void *param)
+{
+	rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param);
+	close(intr_handle.fd);
+	intr_handle.fd = -1;
+}
+
+static void
+dev_uev_handler(__rte_unused void *param)
+{
+	struct rte_dev_event uevent;
+	int ret;
+	char buf[EAL_UEV_MSG_LEN];
+
+	memset(&uevent, 0, sizeof(struct rte_dev_event));
+	memset(buf, 0, EAL_UEV_MSG_LEN);
+
+	ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT);
+	if (ret < 0 && errno == EAGAIN)
+		return;
+	else if (ret <= 0) {
+		/* connection is closed or broken, can not up again. */
+		RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n");
+		rte_eal_alarm_set(1, dev_delayed_unregister, NULL);
+		return;
+	}
+
+	ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN);
+	if (ret < 0) {
+		RTE_LOG(DEBUG, EAL, "It is not an valid event "
+			"that need to be handle.\n");
+		return;
+	}
+
+	RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
+		uevent.devname, uevent.type, uevent.subsystem);
+
+	if (uevent.devname)
+		dev_callback_process(uevent.devname, uevent.type);
+}
+
+int __rte_experimental
+rte_dev_event_monitor_start(void)
+{
+	int ret;
+
+	if (monitor_started)
+		return 0;
+
+	ret = dev_uev_socket_fd_create();
+	if (ret) {
+		RTE_LOG(ERR, EAL, "error create device event fd.\n");
+		return -1;
+	}
+
+	intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT;
+	ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL);
+
+	if (ret) {
+		RTE_LOG(ERR, EAL, "fail to register uevent callback.\n");
+		return -1;
+	}
+
+	monitor_started = true;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_dev_event_monitor_stop(void)
+{
+	int ret;
+
+	if (!monitor_started)
+		return 0;
+
+	ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler,
+					   (void *)-1);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n");
+		return ret;
+	}
+
+	close(intr_handle.fd);
+	intr_handle.fd = -1;
+	monitor_started = false;
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 8bbf771a..7eca711b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -14,7 +14,9 @@
 #include <stdarg.h>
 #include <unistd.h>
 #include <errno.h>
+#include <sys/mman.h>
 #include <sys/queue.h>
+#include <sys/stat.h>
 
 #include <rte_memory.h>
 #include <rte_eal.h>
@@ -30,6 +32,40 @@
 #include "eal_filesystem.h"
 
 static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
+static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
+
+/*
+ * Uses mmap to create a shared memory area for storage of data
+ * Used in this file to store the hugepage file map on disk
+ */
+static void *
+map_shared_memory(const char *filename, const size_t mem_size, int flags)
+{
+	void *retval;
+	int fd = open(filename, flags, 0666);
+	if (fd < 0)
+		return NULL;
+	if (ftruncate(fd, mem_size) < 0) {
+		close(fd);
+		return NULL;
+	}
+	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
+			MAP_SHARED, fd, 0);
+	close(fd);
+	return retval;
+}
+
+static void *
+open_shared_memory(const char *filename, const size_t mem_size)
+{
+	return map_shared_memory(filename, mem_size, O_RDWR);
+}
+
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+	return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
+}
 
 /* this function is only called from eal_hugepage_info_init which itself
  * is only called from a primary process */
@@ -70,6 +106,45 @@ get_num_hugepages(const char *subdir)
 	return num_pages;
 }
 
+static uint32_t
+get_num_hugepages_on_node(const char *subdir, unsigned int socket)
+{
+	char path[PATH_MAX], socketpath[PATH_MAX];
+	DIR *socketdir;
+	unsigned long num_pages = 0;
+	const char *nr_hp_file = "free_hugepages";
+
+	snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
+		sys_pages_numa_dir_path, socket);
+
+	socketdir = opendir(socketpath);
+	if (socketdir) {
+		/* Keep calm and carry on */
+		closedir(socketdir);
+	} else {
+		/* Can't find socket dir, so ignore it */
+		return 0;
+	}
+
+	snprintf(path, sizeof(path), "%s/%s/%s",
+			socketpath, subdir, nr_hp_file);
+	if (eal_parse_sysfs_value(path, &num_pages) < 0)
+		return 0;
+
+	if (num_pages == 0)
+		RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+				subdir);
+
+	/*
+	 * we want to return a uint32_t and more than this looks suspicious
+	 * anyway ...
+	 */
+	if (num_pages > UINT32_MAX)
+		num_pages = UINT32_MAX;
+
+	return num_pages;
+}
+
 static uint64_t
 get_default_hp_size(void)
 {
@@ -94,8 +169,8 @@ get_default_hp_size(void)
 	return size;
 }
 
-static const char *
-get_hugepage_dir(uint64_t hugepage_sz)
+static int
+get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
 {
 	enum proc_mount_fieldnames {
 		DEVICE = 0,
@@ -113,7 +188,7 @@ get_hugepage_dir(uint64_t hugepage_sz)
 	const char split_tok = ' ';
 	char *splitstr[_FIELDNAME_MAX];
 	char buf[BUFSIZ];
-	char *retval = NULL;
+	int retval = -1;
 
 	FILE *fd = fopen(proc_mounts, "r");
 	if (fd == NULL)
@@ -140,7 +215,8 @@ get_hugepage_dir(uint64_t hugepage_sz)
 			/* if no explicit page size, the default page size is compared */
 			if (pagesz_str == NULL){
 				if (hugepage_sz == default_size){
-					retval = strdup(splitstr[MOUNTPT]);
+					strlcpy(hugedir, splitstr[MOUNTPT], len);
+					retval = 0;
 					break;
 				}
 			}
@@ -148,7 +224,8 @@ get_hugepage_dir(uint64_t hugepage_sz)
 			else {
 				uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);
 				if (pagesz == hugepage_sz) {
-					retval = strdup(splitstr[MOUNTPT]);
+					strlcpy(hugedir, splitstr[MOUNTPT], len);
+					retval = 0;
 					break;
 				}
 			}
@@ -207,11 +284,9 @@ clear_hugedir(const char * hugedir)
 		/* non-blocking lock */
 		lck_result = flock(fd, LOCK_EX | LOCK_NB);
 
-		/* if lock succeeds, unlock and remove the file */
-		if (lck_result != -1) {
-			flock(fd, LOCK_UN);
+		/* if lock succeeds, remove the file */
+		if (lck_result != -1)
 			unlinkat(dir_fd, dirent->d_name, 0);
-		}
 		close (fd);
 		dirent = readdir(dir);
 	}
@@ -238,17 +313,11 @@ compare_hpi(const void *a, const void *b)
 	return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
 }
 
-/*
- * when we initialize the hugepage info, everything goes
- * to socket 0 by default. it will later get sorted by memory
- * initialization procedure.
- */
-int
-eal_hugepage_info_init(void)
-{
-	const char dirent_start_text[] = "hugepages-";
+static int
+hugepage_info_init(void)
+{	const char dirent_start_text[] = "hugepages-";
 	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
-	unsigned i, num_sizes = 0;
+	unsigned int i, total_pages, num_sizes = 0;
 	DIR *dir;
 	struct dirent *dirent;
 
@@ -273,10 +342,10 @@ eal_hugepage_info_init(void)
 		hpi = &internal_config.hugepage_info[num_sizes];
 		hpi->hugepage_sz =
 			rte_str_to_size(&dirent->d_name[dirent_start_len]);
-		hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);
 
 		/* first, check if we have a mountpoint */
-		if (hpi->hugedir == NULL) {
+		if (get_hugepage_dir(hpi->hugepage_sz,
+			hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
 			uint32_t num_pages;
 
 			num_pages = get_num_hugepages(dirent->d_name);
@@ -302,9 +371,28 @@ eal_hugepage_info_init(void)
 		if (clear_hugedir(hpi->hugedir) == -1)
 			break;
 
-		/* for now, put all pages into socket 0,
-		 * later they will be sorted */
-		hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+		/*
+		 * first, try to put all hugepages into relevant sockets, but
+		 * if first attempts fails, fall back to collecting all pages
+		 * in one socket and sorting them later
+		 */
+		total_pages = 0;
+		/* we also don't want to do this for legacy init */
+		if (!internal_config.legacy_mem)
+			for (i = 0; i < rte_socket_count(); i++) {
+				int socket = rte_socket_id_by_idx(i);
+				unsigned int num_pages =
+						get_num_hugepages_on_node(
+							dirent->d_name, socket);
+				hpi->num_pages[socket] = num_pages;
+				total_pages += num_pages;
+			}
+		/*
+		 * we failed to sort memory from the get go, so fall
+		 * back to old way
+		 */
+		if (total_pages == 0)
+			hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
 
 #ifndef RTE_ARCH_64
 		/* for 32-bit systems, limit number of hugepages to
@@ -328,11 +416,79 @@ eal_hugepage_info_init(void)
 	      sizeof(internal_config.hugepage_info[0]), compare_hpi);
 
 	/* now we have all info, check we have at least one valid size */
-	for (i = 0; i < num_sizes; i++)
-		if (internal_config.hugepage_info[i].hugedir != NULL &&
-		    internal_config.hugepage_info[i].num_pages[0] > 0)
+	for (i = 0; i < num_sizes; i++) {
+		/* pages may no longer all be on socket 0, so check all */
+		unsigned int j, num_pages = 0;
+		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+
+		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
+			num_pages += hpi->num_pages[j];
+		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 &&
+				num_pages > 0)
 			return 0;
+	}
 
 	/* no valid hugepage mounts available, return error */
 	return -1;
 }
+
+/*
+ * when we initialize the hugepage info, everything goes
+ * to socket 0 by default. it will later get sorted by memory
+ * initialization procedure.
+ */
+int
+eal_hugepage_info_init(void)
+{
+	struct hugepage_info *hpi, *tmp_hpi;
+	unsigned int i;
+
+	if (hugepage_info_init() < 0)
+		return -1;
+
+	hpi = &internal_config.hugepage_info[0];
+
+	tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
+			sizeof(internal_config.hugepage_info));
+	if (tmp_hpi == NULL) {
+		RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+		return -1;
+	}
+
+	memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
+
+	/* we've copied file descriptors along with everything else, but they
+	 * will be invalid in secondary process, so overwrite them
+	 */
+	for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+		struct hugepage_info *tmp = &tmp_hpi[i];
+		tmp->lock_descriptor = -1;
+	}
+
+	if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+		RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+		return -1;
+	}
+	return 0;
+}
+
+int eal_hugepage_info_read(void)
+{
+	struct hugepage_info *hpi = &internal_config.hugepage_info[0];
+	struct hugepage_info *tmp_hpi;
+
+	tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
+				  sizeof(internal_config.hugepage_info));
+	if (tmp_hpi == NULL) {
+		RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
+		return -1;
+	}
+
+	memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
+
+	if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+		RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+		return -1;
+	}
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index f86f22f7..056d41c1 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -559,6 +559,9 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle)
 			return -1;
 		break;
 #endif
+	/* not used at this moment */
+	case RTE_INTR_HANDLE_DEV_EVENT:
+		return -1;
 	/* unknown handle type */
 	default:
 		RTE_LOG(ERR, EAL,
@@ -606,6 +609,9 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle)
 			return -1;
 		break;
 #endif
+	/* not used at this moment */
+	case RTE_INTR_HANDLE_DEV_EVENT:
+		return -1;
 	/* unknown handle type */
 	default:
 		RTE_LOG(ERR, EAL,
@@ -674,7 +680,10 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
 			bytes_read = 0;
 			call = true;
 			break;
-
+		case RTE_INTR_HANDLE_DEV_EVENT:
+			bytes_read = 0;
+			call = true;
+			break;
 		default:
 			bytes_read = 1;
 			break;
@@ -844,8 +853,7 @@ eal_intr_thread_main(__rte_unused void *arg)
 int
 rte_eal_intr_init(void)
 {
-	int ret = 0, ret_1 = 0;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+	int ret = 0;
 
 	/* init the global interrupt source head */
 	TAILQ_INIT(&intr_sources);
@@ -860,23 +868,15 @@ rte_eal_intr_init(void)
 	}
 
 	/* create the host thread to wait/handle the interrupt */
-	ret = pthread_create(&intr_thread, NULL,
+	ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
 			eal_intr_thread_main, NULL);
 	if (ret != 0) {
-		rte_errno = ret;
+		rte_errno = -ret;
 		RTE_LOG(ERR, EAL,
 			"Failed to create thread for interrupt handling\n");
-	} else {
-		/* Set thread_name for aid in debugging. */
-		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
-			"eal-intr-thread");
-		ret_1 = rte_thread_setname(intr_thread, thread_name);
-		if (ret_1 != 0)
-			RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for interrupt handling\n");
 	}
 
-	return -ret;
+	return ret;
 }
 
 static void
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
new file mode 100644
index 00000000..8c11f98c
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -0,0 +1,1309 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+#include <linux/falloc.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_eal_memconfig.h>
+#include <rte_eal.h>
+#include <rte_memory.h>
+#include <rte_spinlock.h>
+
+#include "eal_filesystem.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+/*
+ * not all kernel version support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/* for single-file segments, we need some kind of mechanism to keep track of
+ * which hugepages can be freed back to the system, and which cannot. we cannot
+ * use flock() because they don't allow locking parts of a file, and we cannot
+ * use fcntl() due to issues with their semantics, so we will have to rely on a
+ * bunch of lockfiles for each page.
+ *
+ * we cannot know how many pages a system will have in advance, but we do know
+ * that they come in lists, and we know lengths of these lists. so, simply store
+ * a malloc'd array of fd's indexed by list and segment index.
+ *
+ * they will be initialized at startup, and filled as we allocate/deallocate
+ * segments. also, use this to track memseg list proper fd.
+ */
+static struct {
+	int *fds; /**< dynamically allocated array of segment lock fd's */
+	int memseg_list_fd; /**< memseg list fd */
+	int len; /**< total length of the array */
+	int count; /**< entries used in an array */
+} lock_fds[RTE_MAX_MEMSEG_LISTS];
+
+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
+static sigjmp_buf huge_jmpenv;
+
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+	return sigsetjmp(huge_jmpenv, 1);
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void __rte_unused
+huge_register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = huge_sigbus_handler;
+
+	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+	if (huge_need_recover) {
+		sigaction(SIGBUS, &huge_action_old, NULL);
+		huge_need_recover = 0;
+	}
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+static bool
+check_numa(void)
+{
+	bool ret = true;
+	/* Check if kernel supports NUMA. */
+	if (numa_available() != 0) {
+		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+		ret = false;
+	}
+	return ret;
+}
+
+static void
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
+{
+	RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+	if (get_mempolicy(oldpolicy, oldmask->maskp,
+			  oldmask->size + 1, 0, 0) < 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to get current mempolicy: %s. "
+			"Assuming MPOL_DEFAULT.\n", strerror(errno));
+		oldpolicy = MPOL_DEFAULT;
+	}
+	RTE_LOG(DEBUG, EAL,
+		"Setting policy MPOL_PREFERRED for socket %d\n",
+		socket_id);
+	numa_set_preferred(socket_id);
+}
+
+static void
+restore_numa(int *oldpolicy, struct bitmask *oldmask)
+{
+	RTE_LOG(DEBUG, EAL,
+		"Restoring previous memory policy: %d\n", *oldpolicy);
+	if (*oldpolicy == MPOL_DEFAULT) {
+		numa_set_localalloc();
+	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+				 oldmask->size + 1) < 0) {
+		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+			strerror(errno));
+		numa_set_localalloc();
+	}
+	numa_free_cpumask(oldmask);
+}
+#endif
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+get_file_size(int fd)
+{
+	struct stat st;
+	if (fstat(fd, &st) < 0)
+		return 0;
+	return st.st_size;
+}
+
+/* we cannot use rte_memseg_list_walk() here because we will be holding a
+ * write lock whenever we enter every function in this file, however copying
+ * the same iteration code everywhere is not ideal as well. so, use a lockless
+ * copy of memseg list walk here.
+ */
+static int
+memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	int i, ret = 0;
+
+	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+		struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+		if (msl->base_va == NULL)
+			continue;
+
+		ret = func(msl, arg);
+		if (ret < 0)
+			return -1;
+		if (ret > 0)
+			return 1;
+	}
+	return 0;
+}
+
+/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
+static int lock(int fd, int type)
+{
+	int ret;
+
+	/* flock may be interrupted */
+	do {
+		ret = flock(fd, type | LOCK_NB);
+	} while (ret && errno == EINTR);
+
+	if (ret && errno == EWOULDBLOCK) {
+		/* couldn't lock */
+		return 0;
+	} else if (ret) {
+		RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+	/* lock was successful */
+	return 1;
+}
+
+static int get_segment_lock_fd(int list_idx, int seg_idx)
+{
+	char path[PATH_MAX] = {0};
+	int fd;
+
+	if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+		return -1;
+	if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+		return -1;
+
+	fd = lock_fds[list_idx].fds[seg_idx];
+	/* does this lock already exist? */
+	if (fd >= 0)
+		return fd;
+
+	eal_get_hugefile_lock_path(path, sizeof(path),
+			list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+
+	fd = open(path, O_CREAT | O_RDWR, 0660);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n",
+			__func__, path, strerror(errno));
+		return -1;
+	}
+	/* take out a read lock */
+	if (lock(fd, LOCK_SH) != 1) {
+		RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n",
+			__func__, path, strerror(errno));
+		close(fd);
+		return -1;
+	}
+	/* store it for future reference */
+	lock_fds[list_idx].fds[seg_idx] = fd;
+	lock_fds[list_idx].count++;
+	return fd;
+}
+
+static int unlock_segment(int list_idx, int seg_idx)
+{
+	int fd, ret;
+
+	if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+		return -1;
+	if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+		return -1;
+
+	fd = lock_fds[list_idx].fds[seg_idx];
+
+	/* upgrade lock to exclusive to see if we can remove the lockfile */
+	ret = lock(fd, LOCK_EX);
+	if (ret == 1) {
+		/* we've succeeded in taking exclusive lock, this lockfile may
+		 * be removed.
+		 */
+		char path[PATH_MAX] = {0};
+		eal_get_hugefile_lock_path(path, sizeof(path),
+				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+		if (unlink(path)) {
+			RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n",
+					__func__, path, strerror(errno));
+		}
+	}
+	/* we don't want to leak the fd, so even if we fail to lock, close fd
+	 * and remove it from list anyway.
+	 */
+	close(fd);
+	lock_fds[list_idx].fds[seg_idx] = -1;
+	lock_fds[list_idx].count--;
+
+	if (ret < 0)
+		return -1;
+	return 0;
+}
+
+static int
+get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
+		unsigned int list_idx, unsigned int seg_idx)
+{
+	int fd;
+
+	if (internal_config.single_file_segments) {
+		/* create a hugepage file path */
+		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
+
+		fd = lock_fds[list_idx].memseg_list_fd;
+
+		if (fd < 0) {
+			fd = open(path, O_CREAT | O_RDWR, 0600);
+			if (fd < 0) {
+				RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
+					__func__, strerror(errno));
+				return -1;
+			}
+			/* take out a read lock and keep it indefinitely */
+			if (lock(fd, LOCK_SH) < 0) {
+				RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+					__func__, strerror(errno));
+				close(fd);
+				return -1;
+			}
+			lock_fds[list_idx].memseg_list_fd = fd;
+		}
+	} else {
+		/* create a hugepage file path */
+		eal_get_hugefile_path(path, buflen, hi->hugedir,
+				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+		fd = open(path, O_CREAT | O_RDWR, 0600);
+		if (fd < 0) {
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+					strerror(errno));
+			return -1;
+		}
+		/* take out a read lock */
+		if (lock(fd, LOCK_SH) < 0) {
+			RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+				__func__, strerror(errno));
+			close(fd);
+			return -1;
+		}
+	}
+	return fd;
+}
+
+static int
+resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
+		uint64_t fa_offset, uint64_t page_sz, bool grow)
+{
+	bool again = false;
+	do {
+		if (fallocate_supported == 0) {
+			/* we cannot deallocate memory if fallocate() is not
+			 * supported, and hugepage file is already locked at
+			 * creation, so no further synchronization needed.
+			 */
+
+			if (!grow) {
+				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+					__func__);
+				return -1;
+			}
+			uint64_t new_size = fa_offset + page_sz;
+			uint64_t cur_size = get_file_size(fd);
+
+			/* fallocate isn't supported, fall back to ftruncate */
+			if (new_size > cur_size &&
+					ftruncate(fd, new_size) < 0) {
+				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+					__func__, strerror(errno));
+				return -1;
+			}
+		} else {
+			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+					FALLOC_FL_KEEP_SIZE;
+			int ret, lock_fd;
+
+			/* if fallocate() is supported, we need to take out a
+			 * read lock on allocate (to prevent other processes
+			 * from deallocating this page), and take out a write
+			 * lock on deallocate (to ensure nobody else is using
+			 * this page).
+			 *
+			 * read locks on page itself are already taken out at
+			 * file creation, in get_seg_fd().
+			 *
+			 * we cannot rely on simple use of flock() call, because
+			 * we need to be able to lock a section of the file,
+			 * and we cannot use fcntl() locks, because of numerous
+			 * problems with their semantics, so we will use
+			 * deterministically named lock files for each section
+			 * of the file.
+			 *
+			 * if we're shrinking the file, we want to upgrade our
+			 * lock from shared to exclusive.
+			 *
+			 * lock_fd is an fd for a lockfile, not for the segment
+			 * list.
+			 */
+			lock_fd = get_segment_lock_fd(list_idx, seg_idx);
+
+			if (!grow) {
+				/* we are using this lockfile to determine
+				 * whether this particular page is locked, as we
+				 * are in single file segments mode and thus
+				 * cannot use regular flock() to get this info.
+				 *
+				 * we want to try and take out an exclusive lock
+				 * on the lock file to determine if we're the
+				 * last ones using this page, and if not, we
+				 * won't be shrinking it, and will instead exit
+				 * prematurely.
+				 */
+				ret = lock(lock_fd, LOCK_EX);
+
+				/* drop the lock on the lockfile, so that even
+				 * if we couldn't shrink the file ourselves, we
+				 * are signalling to other processes that we're
+				 * no longer using this page.
+				 */
+				if (unlock_segment(list_idx, seg_idx))
+					RTE_LOG(ERR, EAL, "Could not unlock segment\n");
+
+				/* additionally, if this was the last lock on
+				 * this segment list, we can safely close the
+				 * page file fd, so that one of the processes
+				 * could then delete the file after shrinking.
+				 */
+				if (ret < 1 && lock_fds[list_idx].count == 0) {
+					close(fd);
+					lock_fds[list_idx].memseg_list_fd = -1;
+				}
+
+				if (ret < 0) {
+					RTE_LOG(ERR, EAL, "Could not lock segment\n");
+					return -1;
+				}
+				if (ret == 0)
+					/* failed to lock, not an error. */
+					return 0;
+			}
+
+			/* grow or shrink the file */
+			ret = fallocate(fd, flags, fa_offset, page_sz);
+
+			if (ret < 0) {
+				if (fallocate_supported == -1 &&
+						errno == ENOTSUP) {
+					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+						__func__);
+					again = true;
+					fallocate_supported = 0;
+				} else {
+					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+						__func__,
+						strerror(errno));
+					return -1;
+				}
+			} else {
+				fallocate_supported = 1;
+
+				/* we've grew/shrunk the file, and we hold an
+				 * exclusive lock now. check if there are no
+				 * more segments active in this segment list,
+				 * and remove the file if there aren't.
+				 */
+				if (lock_fds[list_idx].count == 0) {
+					if (unlink(path))
+						RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
+							__func__, path,
+							strerror(errno));
+					close(fd);
+					lock_fds[list_idx].memseg_list_fd = -1;
+				}
+			}
+		}
+	} while (again);
+	return 0;
+}
+
+static int
+alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
+		struct hugepage_info *hi, unsigned int list_idx,
+		unsigned int seg_idx)
+{
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	int cur_socket_id = 0;
+#endif
+	uint64_t map_offset;
+	char path[PATH_MAX];
+	int ret = 0;
+	int fd;
+	size_t alloc_sz;
+
+	/* takes out a read lock on segment or segment list */
+	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
+		return -1;
+	}
+
+	alloc_sz = hi->hugepage_sz;
+	if (internal_config.single_file_segments) {
+		map_offset = seg_idx * alloc_sz;
+		ret = resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
+				alloc_sz, true);
+		if (ret < 0)
+			goto resized;
+	} else {
+		map_offset = 0;
+		if (ftruncate(fd, alloc_sz) < 0) {
+			RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+				__func__, strerror(errno));
+			goto resized;
+		}
+	}
+
+	/*
+	 * map the segment, and populate page tables, the kernel fills this
+	 * segment with zeros if it's a new page.
+	 */
+	void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
+
+	if (va == MAP_FAILED) {
+		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
+			strerror(errno));
+		goto resized;
+	}
+	if (va != addr) {
+		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
+		munmap(va, alloc_sz);
+		goto resized;
+	}
+
+	rte_iova_t iova = rte_mem_virt2iova(addr);
+	if (iova == RTE_BAD_PHYS_ADDR) {
+		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+			__func__);
+		goto mapped;
+	}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+	if (cur_socket_id != socket_id) {
+		RTE_LOG(DEBUG, EAL,
+				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+			__func__, socket_id, cur_socket_id);
+		goto mapped;
+	}
+#endif
+
+	/* In linux, hugetlb limitations, like cgroup, are
+	 * enforced at fault time instead of mmap(), even
+	 * with the option of MAP_POPULATE. Kernel will send
+	 * a SIGBUS signal. To avoid to be killed, save stack
+	 * environment here, if SIGBUS happens, we can jump
+	 * back here.
+	 */
+	if (huge_wrap_sigsetjmp()) {
+		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
+			(unsigned int)(alloc_sz >> 20));
+		goto mapped;
+	}
+	/* for non-single file segments, we can close fd here */
+	if (!internal_config.single_file_segments)
+		close(fd);
+
+	/* we need to trigger a write to the page to enforce page fault and
+	 * ensure that page is accessible to us, but we can't overwrite value
+	 * that is already there, so read the old value, and write itback.
+	 * kernel populates the page with zeroes initially.
+	 */
+	*(volatile int *)addr = *(volatile int *)addr;
+
+	ms->addr = addr;
+	ms->hugepage_sz = alloc_sz;
+	ms->len = alloc_sz;
+	ms->nchannel = rte_memory_get_nchannel();
+	ms->nrank = rte_memory_get_nrank();
+	ms->iova = iova;
+	ms->socket_id = socket_id;
+
+	return 0;
+
+mapped:
+	munmap(addr, alloc_sz);
+resized:
+	if (internal_config.single_file_segments) {
+		resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
+				alloc_sz, false);
+		/* ignore failure, can't make it any worse */
+	} else {
+		/* only remove file if we can take out a write lock */
+		if (lock(fd, LOCK_EX) == 1)
+			unlink(path);
+		close(fd);
+	}
+	return -1;
+}
+
+static int
+free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
+		unsigned int list_idx, unsigned int seg_idx)
+{
+	uint64_t map_offset;
+	char path[PATH_MAX];
+	int fd, ret;
+
+	/* erase page data */
+	memset(ms->addr, 0, ms->len);
+
+	if (mmap(ms->addr, ms->len, PROT_READ,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+				MAP_FAILED) {
+		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+		return -1;
+	}
+
+	/* if we are not in single file segments mode, we're going to unmap the
+	 * segment and thus drop the lock on original fd, but hugepage dir is
+	 * now locked so we can take out another one without races.
+	 */
+	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+	if (fd < 0)
+		return -1;
+
+	if (internal_config.single_file_segments) {
+		map_offset = seg_idx * ms->len;
+		if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
+				ms->len, false))
+			return -1;
+		ret = 0;
+	} else {
+		/* if we're able to take out a write lock, we're the last one
+		 * holding onto this page.
+		 */
+		ret = lock(fd, LOCK_EX);
+		if (ret >= 0) {
+			/* no one else is using this page */
+			if (ret == 1)
+				unlink(path);
+		}
+		/* closing fd will drop the lock */
+		close(fd);
+	}
+
+	memset(ms, 0, sizeof(*ms));
+
+	return ret < 0 ? -1 : 0;
+}
+
+struct alloc_walk_param {
+	struct hugepage_info *hi;
+	struct rte_memseg **ms;
+	size_t page_sz;
+	unsigned int segs_allocated;
+	unsigned int n_segs;
+	int socket;
+	bool exact;
+};
+static int
+alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct alloc_walk_param *wa = arg;
+	struct rte_memseg_list *cur_msl;
+	size_t page_sz;
+	int cur_idx, start_idx, j, dir_fd = -1;
+	unsigned int msl_idx, need, i;
+
+	if (msl->page_sz != wa->page_sz)
+		return 0;
+	if (msl->socket_id != wa->socket)
+		return 0;
+
+	page_sz = (size_t)msl->page_sz;
+
+	msl_idx = msl - mcfg->memsegs;
+	cur_msl = &mcfg->memsegs[msl_idx];
+
+	need = wa->n_segs;
+
+	/* try finding space in memseg list */
+	cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
+	if (cur_idx < 0)
+		return 0;
+	start_idx = cur_idx;
+
+	/* do not allow any page allocations during the time we're allocating,
+	 * because file creation and locking operations are not atomic,
+	 * and we might be the first or the last ones to use a particular page,
+	 * so we need to ensure atomicity of every operation.
+	 *
+	 * during init, we already hold a write lock, so don't try to take out
+	 * another one.
+	 */
+	if (wa->hi->lock_descriptor == -1) {
+		dir_fd = open(wa->hi->hugedir, O_RDONLY);
+		if (dir_fd < 0) {
+			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
+				__func__, wa->hi->hugedir, strerror(errno));
+			return -1;
+		}
+		/* blocking writelock */
+		if (flock(dir_fd, LOCK_EX)) {
+			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
+				__func__, wa->hi->hugedir, strerror(errno));
+			close(dir_fd);
+			return -1;
+		}
+	}
+
+	for (i = 0; i < need; i++, cur_idx++) {
+		struct rte_memseg *cur;
+		void *map_addr;
+
+		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
+		map_addr = RTE_PTR_ADD(cur_msl->base_va,
+				cur_idx * page_sz);
+
+		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
+				msl_idx, cur_idx)) {
+			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
+				need, i);
+
+			/* if exact number wasn't requested, stop */
+			if (!wa->exact)
+				goto out;
+
+			/* clean up */
+			for (j = start_idx; j < cur_idx; j++) {
+				struct rte_memseg *tmp;
+				struct rte_fbarray *arr =
+						&cur_msl->memseg_arr;
+
+				tmp = rte_fbarray_get(arr, j);
+				rte_fbarray_set_free(arr, j);
+
+				/* free_seg may attempt to create a file, which
+				 * may fail.
+				 */
+				if (free_seg(tmp, wa->hi, msl_idx, j))
+					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
+			}
+			/* clear the list */
+			if (wa->ms)
+				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
+
+			if (dir_fd >= 0)
+				close(dir_fd);
+			return -1;
+		}
+		if (wa->ms)
+			wa->ms[i] = cur;
+
+		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
+	}
+out:
+	wa->segs_allocated = i;
+	if (i > 0)
+		cur_msl->version++;
+	if (dir_fd >= 0)
+		close(dir_fd);
+	return 1;
+}
+
+struct free_walk_param {
+	struct hugepage_info *hi;
+	struct rte_memseg *ms;
+};
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *found_msl;
+	struct free_walk_param *wa = arg;
+	uintptr_t start_addr, end_addr;
+	int msl_idx, seg_idx, ret, dir_fd = -1;
+
+	start_addr = (uintptr_t) msl->base_va;
+	end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
+
+	if ((uintptr_t)wa->ms->addr < start_addr ||
+			(uintptr_t)wa->ms->addr >= end_addr)
+		return 0;
+
+	msl_idx = msl - mcfg->memsegs;
+	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
+
+	/* msl is const */
+	found_msl = &mcfg->memsegs[msl_idx];
+
+	/* do not allow any page allocations during the time we're freeing,
+	 * because file creation and locking operations are not atomic,
+	 * and we might be the first or the last ones to use a particular page,
+	 * so we need to ensure atomicity of every operation.
+	 *
+	 * during init, we already hold a write lock, so don't try to take out
+	 * another one.
+	 */
+	if (wa->hi->lock_descriptor == -1) {
+		dir_fd = open(wa->hi->hugedir, O_RDONLY);
+		if (dir_fd < 0) {
+			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
+				__func__, wa->hi->hugedir, strerror(errno));
+			return -1;
+		}
+		/* blocking writelock */
+		if (flock(dir_fd, LOCK_EX)) {
+			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
+				__func__, wa->hi->hugedir, strerror(errno));
+			close(dir_fd);
+			return -1;
+		}
+	}
+
+	found_msl->version++;
+
+	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+
+	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
+
+	if (dir_fd >= 0)
+		close(dir_fd);
+
+	if (ret < 0)
+		return -1;
+
+	return 1;
+}
+
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
+		int socket, bool exact)
+{
+	int i, ret = -1;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	bool have_numa = false;
+	int oldpolicy;
+	struct bitmask *oldmask;
+#endif
+	struct alloc_walk_param wa;
+	struct hugepage_info *hi = NULL;
+
+	memset(&wa, 0, sizeof(wa));
+
+	/* dynamic allocation not supported in legacy mode */
+	if (internal_config.legacy_mem)
+		return -1;
+
+	for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+		if (page_sz ==
+				internal_config.hugepage_info[i].hugepage_sz) {
+			hi = &internal_config.hugepage_info[i];
+			break;
+		}
+	}
+	if (!hi) {
+		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
+			__func__);
+		return -1;
+	}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	if (check_numa()) {
+		oldmask = numa_allocate_nodemask();
+		prepare_numa(&oldpolicy, oldmask, socket);
+		have_numa = true;
+	}
+#endif
+
+	wa.exact = exact;
+	wa.hi = hi;
+	wa.ms = ms;
+	wa.n_segs = n_segs;
+	wa.page_sz = page_sz;
+	wa.socket = socket;
+	wa.segs_allocated = 0;
+
+	ret = memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
+	if (ret == 0) {
+		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
+			__func__);
+		ret = -1;
+	} else if (ret > 0) {
+		ret = (int)wa.segs_allocated;
+	}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	if (have_numa)
+		restore_numa(&oldpolicy, oldmask);
+#endif
+	return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t page_sz, int socket)
+{
+	struct rte_memseg *ms;
+	if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
+		return NULL;
+	/* return pointer to newly allocated memseg */
+	return ms;
+}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+	int seg, ret = 0;
+
+	/* dynamic free not supported in legacy mode */
+	if (internal_config.legacy_mem)
+		return -1;
+
+	for (seg = 0; seg < n_segs; seg++) {
+		struct rte_memseg *cur = ms[seg];
+		struct hugepage_info *hi = NULL;
+		struct free_walk_param wa;
+		int i, walk_res;
+
+		/* if this page is marked as unfreeable, fail */
+		if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
+			RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
+			ret = -1;
+			continue;
+		}
+
+		memset(&wa, 0, sizeof(wa));
+
+		for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
+				i++) {
+			hi = &internal_config.hugepage_info[i];
+			if (cur->hugepage_sz == hi->hugepage_sz)
+				break;
+		}
+		if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
+			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+			ret = -1;
+			continue;
+		}
+
+		wa.ms = cur;
+		wa.hi = hi;
+
+		walk_res = memseg_list_walk_thread_unsafe(free_seg_walk, &wa);
+		if (walk_res == 1)
+			continue;
+		if (walk_res == 0)
+			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+		ret = -1;
+	}
+	return ret;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+	/* dynamic free not supported in legacy mode */
+	if (internal_config.legacy_mem)
+		return -1;
+
+	return eal_memalloc_free_seg_bulk(&ms, 1);
+}
+
+static int
+sync_chunk(struct rte_memseg_list *primary_msl,
+		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+		unsigned int msl_idx, bool used, int start, int end)
+{
+	struct rte_fbarray *l_arr, *p_arr;
+	int i, ret, chunk_len, diff_len;
+
+	l_arr = &local_msl->memseg_arr;
+	p_arr = &primary_msl->memseg_arr;
+
+	/* we need to aggregate allocations/deallocations into bigger chunks,
+	 * as we don't want to spam the user with per-page callbacks.
+	 *
+	 * to avoid any potential issues, we also want to trigger
+	 * deallocation callbacks *before* we actually deallocate
+	 * memory, so that the user application could wrap up its use
+	 * before it goes away.
+	 */
+
+	chunk_len = end - start;
+
+	/* find how many contiguous pages we can map/unmap for this chunk */
+	diff_len = used ?
+			rte_fbarray_find_contig_free(l_arr, start) :
+			rte_fbarray_find_contig_used(l_arr, start);
+
+	/* has to be at least one page */
+	if (diff_len < 1)
+		return -1;
+
+	diff_len = RTE_MIN(chunk_len, diff_len);
+
+	/* if we are freeing memory, notify the application */
+	if (!used) {
+		struct rte_memseg *ms;
+		void *start_va;
+		size_t len, page_sz;
+
+		ms = rte_fbarray_get(l_arr, start);
+		start_va = ms->addr;
+		page_sz = (size_t)primary_msl->page_sz;
+		len = page_sz * diff_len;
+
+		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+				start_va, len);
+	}
+
+	for (i = 0; i < diff_len; i++) {
+		struct rte_memseg *p_ms, *l_ms;
+		int seg_idx = start + i;
+
+		l_ms = rte_fbarray_get(l_arr, seg_idx);
+		p_ms = rte_fbarray_get(p_arr, seg_idx);
+
+		if (l_ms == NULL || p_ms == NULL)
+			return -1;
+
+		if (used) {
+			ret = alloc_seg(l_ms, p_ms->addr,
+					p_ms->socket_id, hi,
+					msl_idx, seg_idx);
+			if (ret < 0)
+				return -1;
+			rte_fbarray_set_used(l_arr, seg_idx);
+		} else {
+			ret = free_seg(l_ms, hi, msl_idx, seg_idx);
+			rte_fbarray_set_free(l_arr, seg_idx);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	/* if we just allocated memory, notify the application */
+	if (used) {
+		struct rte_memseg *ms;
+		void *start_va;
+		size_t len, page_sz;
+
+		ms = rte_fbarray_get(l_arr, start);
+		start_va = ms->addr;
+		page_sz = (size_t)primary_msl->page_sz;
+		len = page_sz * diff_len;
+
+		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+				start_va, len);
+	}
+
+	/* calculate how much we can advance until next chunk */
+	diff_len = used ?
+			rte_fbarray_find_contig_used(l_arr, start) :
+			rte_fbarray_find_contig_free(l_arr, start);
+	ret = RTE_MIN(chunk_len, diff_len);
+
+	return ret;
+}
+
+static int
+sync_status(struct rte_memseg_list *primary_msl,
+		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+		unsigned int msl_idx, bool used)
+{
+	struct rte_fbarray *l_arr, *p_arr;
+	int p_idx, l_chunk_len, p_chunk_len, ret;
+	int start, end;
+
+	/* this is a little bit tricky, but the basic idea is - walk both lists
+	 * and spot any places where there are discrepancies. walking both lists
+	 * and noting discrepancies in a single go is a hard problem, so we do
+	 * it in two passes - first we spot any places where allocated segments
+	 * mismatch (i.e. ensure that everything that's allocated in the primary
+	 * is also allocated in the secondary), and then we do it by looking at
+	 * free segments instead.
+	 *
+	 * we also need to aggregate changes into chunks, as we have to call
+	 * callbacks per allocation, not per page.
+	 */
+	l_arr = &local_msl->memseg_arr;
+	p_arr = &primary_msl->memseg_arr;
+
+	if (used)
+		p_idx = rte_fbarray_find_next_used(p_arr, 0);
+	else
+		p_idx = rte_fbarray_find_next_free(p_arr, 0);
+
+	while (p_idx >= 0) {
+		int next_chunk_search_idx;
+
+		if (used) {
+			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
+					p_idx);
+			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
+					p_idx);
+		} else {
+			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
+					p_idx);
+			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
+					p_idx);
+		}
+		/* best case scenario - no differences (or bigger, which will be
+		 * fixed during next iteration), look for next chunk
+		 */
+		if (l_chunk_len >= p_chunk_len) {
+			next_chunk_search_idx = p_idx + p_chunk_len;
+			goto next_chunk;
+		}
+
+		/* if both chunks start at the same point, skip parts we know
+		 * are identical, and sync the rest. each call to sync_chunk
+		 * will only sync contiguous segments, so we need to call this
+		 * until we are sure there are no more differences in this
+		 * chunk.
+		 */
+		start = p_idx + l_chunk_len;
+		end = p_idx + p_chunk_len;
+		do {
+			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
+					used, start, end);
+			start += ret;
+		} while (start < end && ret >= 0);
+		/* if ret is negative, something went wrong */
+		if (ret < 0)
+			return -1;
+
+		next_chunk_search_idx = p_idx + p_chunk_len;
+next_chunk:
+		/* skip to end of this chunk */
+		if (used) {
+			p_idx = rte_fbarray_find_next_used(p_arr,
+					next_chunk_search_idx);
+		} else {
+			p_idx = rte_fbarray_find_next_free(p_arr,
+					next_chunk_search_idx);
+		}
+	}
+	return 0;
+}
+
+static int
+sync_existing(struct rte_memseg_list *primary_msl,
+		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+		unsigned int msl_idx)
+{
+	int ret, dir_fd;
+
+	/* do not allow any page allocations during the time we're allocating,
+	 * because file creation and locking operations are not atomic,
+	 * and we might be the first or the last ones to use a particular page,
+	 * so we need to ensure atomicity of every operation.
+	 */
+	dir_fd = open(hi->hugedir, O_RDONLY);
+	if (dir_fd < 0) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
+			hi->hugedir, strerror(errno));
+		return -1;
+	}
+	/* blocking writelock */
+	if (flock(dir_fd, LOCK_EX)) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
+			hi->hugedir, strerror(errno));
+		close(dir_fd);
+		return -1;
+	}
+
+	/* ensure all allocated space is the same in both lists */
+	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
+	if (ret < 0)
+		goto fail;
+
+	/* ensure all unallocated space is the same in both lists */
+	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
+	if (ret < 0)
+		goto fail;
+
+	/* update version number */
+	local_msl->version = primary_msl->version;
+
+	close(dir_fd);
+
+	return 0;
+fail:
+	close(dir_fd);
+	return -1;
+}
+
+static int
+sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *primary_msl, *local_msl;
+	struct hugepage_info *hi = NULL;
+	unsigned int i;
+	int msl_idx;
+
+	msl_idx = msl - mcfg->memsegs;
+	primary_msl = &mcfg->memsegs[msl_idx];
+	local_msl = &local_memsegs[msl_idx];
+
+	for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+		uint64_t cur_sz =
+			internal_config.hugepage_info[i].hugepage_sz;
+		uint64_t msl_sz = primary_msl->page_sz;
+		if (msl_sz == cur_sz) {
+			hi = &internal_config.hugepage_info[i];
+			break;
+		}
+	}
+	if (!hi) {
+		RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+		return -1;
+	}
+
+	/* if versions don't match, synchronize everything */
+	if (local_msl->version != primary_msl->version &&
+			sync_existing(primary_msl, local_msl, hi, msl_idx))
+		return -1;
+	return 0;
+}
+
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+	/* nothing to be done in primary */
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return 0;
+
+	if (memseg_list_walk_thread_unsafe(sync_walk, NULL))
+		return -1;
+	return 0;
+}
+
+static int
+secondary_msl_create_walk(const struct rte_memseg_list *msl,
+		void *arg __rte_unused)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *primary_msl, *local_msl;
+	char name[PATH_MAX];
+	int msl_idx, ret;
+
+	msl_idx = msl - mcfg->memsegs;
+	primary_msl = &mcfg->memsegs[msl_idx];
+	local_msl = &local_memsegs[msl_idx];
+
+	/* create distinct fbarrays for each secondary */
+	snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
+		primary_msl->memseg_arr.name, getpid());
+
+	ret = rte_fbarray_init(&local_msl->memseg_arr, name,
+		primary_msl->memseg_arr.len,
+		primary_msl->memseg_arr.elt_sz);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
+		return -1;
+	}
+	local_msl->base_va = primary_msl->base_va;
+
+	return 0;
+}
+
+static int
+secondary_lock_list_create_walk(const struct rte_memseg_list *msl,
+		void *arg __rte_unused)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	unsigned int i, len;
+	int msl_idx;
+	int *data;
+
+	msl_idx = msl - mcfg->memsegs;
+	len = msl->memseg_arr.len;
+
+	/* ensure we have space to store lock fd per each possible segment */
+	data = malloc(sizeof(int) * len);
+	if (data == NULL) {
+		RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n");
+		return -1;
+	}
+	/* set all fd's as invalid */
+	for (i = 0; i < len; i++)
+		data[i] = -1;
+
+	lock_fds[msl_idx].fds = data;
+	lock_fds[msl_idx].len = len;
+	lock_fds[msl_idx].count = 0;
+	lock_fds[msl_idx].memseg_list_fd = -1;
+
+	return 0;
+}
+
+int
+eal_memalloc_init(void)
+{
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
+			return -1;
+
+	/* initialize all of the lock fd lists */
+	if (internal_config.single_file_segments)
+		if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL))
+			return -1;
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 38853b75..c917de1c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -28,6 +28,7 @@
 #include <numaif.h>
 #endif
 
+#include <rte_errno.h>
 #include <rte_log.h>
 #include <rte_memory.h>
 #include <rte_launch.h>
@@ -39,6 +40,7 @@
 #include <rte_string_fns.h>
 
 #include "eal_private.h"
+#include "eal_memalloc.h"
 #include "eal_internal_cfg.h"
 #include "eal_filesystem.h"
 #include "eal_hugepages.h"
@@ -57,8 +59,6 @@
  * zone as well as a physical contiguous zone.
  */
 
-static uint64_t baseaddr_offset;
-
 static bool phys_addrs_available = true;
 
 #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
@@ -221,82 +221,6 @@ aslr_enabled(void)
 	}
 }
 
-/*
- * Try to mmap *size bytes in /dev/zero. If it is successful, return the
- * pointer to the mmap'd area and keep *size unmodified. Else, retry
- * with a smaller zone: decrease *size by hugepage_sz until it reaches
- * 0. In this case, return NULL. Note: this function returns an address
- * which is a multiple of hugepage size.
- */
-static void *
-get_virtual_area(size_t *size, size_t hugepage_sz)
-{
-	void *addr;
-	void *addr_hint;
-	int fd;
-	long aligned_addr;
-
-	if (internal_config.base_virtaddr != 0) {
-		int page_size = sysconf(_SC_PAGE_SIZE);
-		addr_hint = (void *) (uintptr_t)
-			(internal_config.base_virtaddr + baseaddr_offset);
-		addr_hint = RTE_PTR_ALIGN_FLOOR(addr_hint, page_size);
-	} else {
-		addr_hint = NULL;
-	}
-
-	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
-
-
-	fd = open("/dev/zero", O_RDONLY);
-	if (fd < 0){
-		RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
-		return NULL;
-	}
-	do {
-		addr = mmap(addr_hint, (*size) + hugepage_sz, PROT_READ,
-#ifdef RTE_ARCH_PPC_64
-				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
-				MAP_PRIVATE,
-#endif
-				fd, 0);
-		if (addr == MAP_FAILED) {
-			*size -= hugepage_sz;
-		} else if (addr_hint != NULL && addr != addr_hint) {
-			RTE_LOG(WARNING, EAL, "WARNING! Base virtual address "
-				"hint (%p != %p) not respected!\n",
-				addr_hint, addr);
-			RTE_LOG(WARNING, EAL, "   This may cause issues with "
-				"mapping memory into secondary processes\n");
-		}
-	} while (addr == MAP_FAILED && *size > 0);
-
-	if (addr == MAP_FAILED) {
-		close(fd);
-		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
-			strerror(errno));
-		return NULL;
-	}
-
-	munmap(addr, (*size) + hugepage_sz);
-	close(fd);
-
-	/* align addr to a huge page size boundary */
-	aligned_addr = (long)addr;
-	aligned_addr += (hugepage_sz - 1);
-	aligned_addr &= (~(hugepage_sz - 1));
-	addr = (void *)(aligned_addr);
-
-	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
-		addr, *size);
-
-	/* increment offset */
-	baseaddr_offset += *size;
-
-	return addr;
-}
-
 static sigjmp_buf huge_jmpenv;
 
 static void huge_sigbus_handler(int signo __rte_unused)
@@ -330,13 +254,11 @@ void numa_error(char *where)
  */
 static unsigned
 map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
-		  uint64_t *essential_memory __rte_unused, int orig)
+		  uint64_t *essential_memory __rte_unused)
 {
 	int fd;
 	unsigned i;
 	void *virtaddr;
-	void *vma_addr = NULL;
-	size_t vma_len = 0;
 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
 	int node_id = -1;
 	int essential_prev = 0;
@@ -351,7 +273,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 		have_numa = false;
 	}
 
-	if (orig && have_numa) {
+	if (have_numa) {
 		RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
 		if (get_mempolicy(&oldpolicy, oldmask->maskp,
 				  oldmask->size + 1, 0, 0) < 0) {
@@ -367,6 +289,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 #endif
 
 	for (i = 0; i < hpi->num_pages[0]; i++) {
+		struct hugepage_file *hf = &hugepg_tbl[i];
 		uint64_t hugepage_sz = hpi->hugepage_sz;
 
 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
@@ -401,57 +324,14 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 		}
 #endif
 
-		if (orig) {
-			hugepg_tbl[i].file_id = i;
-			hugepg_tbl[i].size = hugepage_sz;
-			eal_get_hugefile_path(hugepg_tbl[i].filepath,
-					sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
-					hugepg_tbl[i].file_id);
-			hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
-		}
-#ifndef RTE_ARCH_64
-		/* for 32-bit systems, don't remap 1G and 16G pages, just reuse
-		 * original map address as final map address.
-		 */
-		else if ((hugepage_sz == RTE_PGSIZE_1G)
-			|| (hugepage_sz == RTE_PGSIZE_16G)) {
-			hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
-			hugepg_tbl[i].orig_va = NULL;
-			continue;
-		}
-#endif
-		else if (vma_len == 0) {
-			unsigned j, num_pages;
-
-			/* reserve a virtual area for next contiguous
-			 * physical block: count the number of
-			 * contiguous physical pages. */
-			for (j = i+1; j < hpi->num_pages[0] ; j++) {
-#ifdef RTE_ARCH_PPC_64
-				/* The physical addresses are sorted in
-				 * descending order on PPC64 */
-				if (hugepg_tbl[j].physaddr !=
-				    hugepg_tbl[j-1].physaddr - hugepage_sz)
-					break;
-#else
-				if (hugepg_tbl[j].physaddr !=
-				    hugepg_tbl[j-1].physaddr + hugepage_sz)
-					break;
-#endif
-			}
-			num_pages = j - i;
-			vma_len = num_pages * hugepage_sz;
-
-			/* get the biggest virtual memory area up to
-			 * vma_len. If it fails, vma_addr is NULL, so
-			 * let the kernel provide the address. */
-			vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
-			if (vma_addr == NULL)
-				vma_len = hugepage_sz;
-		}
+		hf->file_id = i;
+		hf->size = hugepage_sz;
+		eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
+				hpi->hugedir, hf->file_id);
+		hf->filepath[sizeof(hf->filepath) - 1] = '\0';
 
 		/* try to create hugepage file */
-		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0600);
+		fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
 		if (fd < 0) {
 			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
@@ -459,8 +339,11 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 		}
 
 		/* map the segment, and populate page tables,
-		 * the kernel fills this segment with zeros */
-		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
+		 * the kernel fills this segment with zeros. we don't care where
+		 * this gets mapped - we already have contiguous memory areas
+		 * ready for us to map into.
+		 */
+		virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
 			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
@@ -469,41 +352,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 			goto out;
 		}
 
-		if (orig) {
-			hugepg_tbl[i].orig_va = virtaddr;
-		}
-		else {
-			hugepg_tbl[i].final_va = virtaddr;
-		}
+		hf->orig_va = virtaddr;
 
-		if (orig) {
-			/* In linux, hugetlb limitations, like cgroup, are
-			 * enforced at fault time instead of mmap(), even
-			 * with the option of MAP_POPULATE. Kernel will send
-			 * a SIGBUS signal. To avoid to be killed, save stack
-			 * environment here, if SIGBUS happens, we can jump
-			 * back here.
-			 */
-			if (huge_wrap_sigsetjmp()) {
-				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
-					"hugepages of size %u MB\n",
-					(unsigned)(hugepage_sz / 0x100000));
-				munmap(virtaddr, hugepage_sz);
-				close(fd);
-				unlink(hugepg_tbl[i].filepath);
+		/* In linux, hugetlb limitations, like cgroup, are
+		 * enforced at fault time instead of mmap(), even
+		 * with the option of MAP_POPULATE. Kernel will send
+		 * a SIGBUS signal. To avoid to be killed, save stack
+		 * environment here, if SIGBUS happens, we can jump
+		 * back here.
+		 */
+		if (huge_wrap_sigsetjmp()) {
+			RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+				"hugepages of size %u MB\n",
+				(unsigned int)(hugepage_sz / 0x100000));
+			munmap(virtaddr, hugepage_sz);
+			close(fd);
+			unlink(hugepg_tbl[i].filepath);
 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-				if (maxnode)
-					essential_memory[node_id] =
-						essential_prev;
+			if (maxnode)
+				essential_memory[node_id] =
+					essential_prev;
 #endif
-				goto out;
-			}
-			*(int *)virtaddr = 0;
+			goto out;
 		}
+		*(int *)virtaddr = 0;
 
-
-		/* set shared flock on the file. */
-		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+		/* set shared lock on the file. */
+		if (flock(fd, LOCK_SH) < 0) {
 			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
@@ -511,9 +386,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 		}
 
 		close(fd);
-
-		vma_addr = (char *)vma_addr + hugepage_sz;
-		vma_len -= hugepage_sz;
 	}
 
 out:
@@ -535,20 +407,6 @@ out:
 	return i;
 }
 
-/* Unmap all hugepages from original mapping */
-static int
-unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
-        unsigned i;
-        for (i = 0; i < hpi->num_pages[0]; i++) {
-                if (hugepg_tbl[i].orig_va) {
-                        munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
-                        hugepg_tbl[i].orig_va = NULL;
-                }
-        }
-        return 0;
-}
-
 /*
  * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
  * page.
@@ -688,7 +546,7 @@ copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
 	int src_pos, dst_pos = 0;
 
 	for (src_pos = 0; src_pos < src_size; src_pos++) {
-		if (src[src_pos].final_va != NULL) {
+		if (src[src_pos].orig_va != NULL) {
 			/* error on overflow attempt */
 			if (dst_pos == dest_size)
 				return -1;
@@ -759,9 +617,10 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
 						unmap_len = hp->size;
 
 						/* get start addr and len of the remaining segment */
-						munmap(hp->final_va, (size_t) unmap_len);
+						munmap(hp->orig_va,
+							(size_t)unmap_len);
 
-						hp->final_va = NULL;
+						hp->orig_va = NULL;
 						if (unlink(hp->filepath) == -1) {
 							RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
 									__func__, hp->filepath, strerror(errno));
@@ -780,6 +639,408 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
 	return 0;
 }
 
+static int
+remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *msl;
+	struct rte_fbarray *arr;
+	int cur_page, seg_len;
+	unsigned int msl_idx;
+	int ms_idx;
+	uint64_t page_sz;
+	size_t memseg_len;
+	int socket_id;
+
+	page_sz = hugepages[seg_start].size;
+	socket_id = hugepages[seg_start].socket_id;
+	seg_len = seg_end - seg_start;
+
+	RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n",
+			(seg_len * page_sz) >> 20ULL, socket_id);
+
+	/* find free space in memseg lists */
+	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+		bool empty;
+		msl = &mcfg->memsegs[msl_idx];
+		arr = &msl->memseg_arr;
+
+		if (msl->page_sz != page_sz)
+			continue;
+		if (msl->socket_id != socket_id)
+			continue;
+
+		/* leave space for a hole if array is not empty */
+		empty = arr->count == 0;
+		ms_idx = rte_fbarray_find_next_n_free(arr, 0,
+				seg_len + (empty ? 0 : 1));
+
+		/* memseg list is full? */
+		if (ms_idx < 0)
+			continue;
+
+		/* leave some space between memsegs, they are not IOVA
+		 * contiguous, so they shouldn't be VA contiguous either.
+		 */
+		if (!empty)
+			ms_idx++;
+		break;
+	}
+	if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+		RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+				RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+				RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
+		return -1;
+	}
+
+#ifdef RTE_ARCH_PPC64
+	/* for PPC64 we go through the list backwards */
+	for (cur_page = seg_end - 1; cur_page >= seg_start;
+			cur_page--, ms_idx++) {
+#else
+	for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
+#endif
+		struct hugepage_file *hfile = &hugepages[cur_page];
+		struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+		void *addr;
+		int fd;
+
+		fd = open(hfile->filepath, O_RDWR);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "Could not open '%s': %s\n",
+					hfile->filepath, strerror(errno));
+			return -1;
+		}
+		/* set shared lock on the file. */
+		if (flock(fd, LOCK_SH) < 0) {
+			RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n",
+					hfile->filepath, strerror(errno));
+			close(fd);
+			return -1;
+		}
+		memseg_len = (size_t)page_sz;
+		addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
+
+		/* we know this address is already mmapped by memseg list, so
+		 * using MAP_FIXED here is safe
+		 */
+		addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
+				MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
+		if (addr == MAP_FAILED) {
+			RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n",
+					hfile->filepath, strerror(errno));
+			close(fd);
+			return -1;
+		}
+
+		/* we have a new address, so unmap previous one */
+#ifndef RTE_ARCH_64
+		/* in 32-bit legacy mode, we have already unmapped the page */
+		if (!internal_config.legacy_mem)
+			munmap(hfile->orig_va, page_sz);
+#else
+		munmap(hfile->orig_va, page_sz);
+#endif
+
+		hfile->orig_va = NULL;
+		hfile->final_va = addr;
+
+		/* rewrite physical addresses in IOVA as VA mode */
+		if (rte_eal_iova_mode() == RTE_IOVA_VA)
+			hfile->physaddr = (uintptr_t)addr;
+
+		/* set up memseg data */
+		ms->addr = addr;
+		ms->hugepage_sz = page_sz;
+		ms->len = memseg_len;
+		ms->iova = hfile->physaddr;
+		ms->socket_id = hfile->socket_id;
+		ms->nchannel = rte_memory_get_nchannel();
+		ms->nrank = rte_memory_get_nrank();
+
+		rte_fbarray_set_used(arr, ms_idx);
+
+		close(fd);
+	}
+	RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
+			(seg_len * page_sz) >> 20, socket_id);
+	return 0;
+}
+
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+		int n_segs, int socket_id, int type_msl_idx)
+{
+	char name[RTE_FBARRAY_NAME_LEN];
+
+	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+		 type_msl_idx);
+	if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
+			sizeof(struct rte_memseg))) {
+		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+			rte_strerror(rte_errno));
+		return -1;
+	}
+
+	msl->page_sz = page_sz;
+	msl->socket_id = socket_id;
+	msl->base_va = NULL;
+
+	RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+			(size_t)page_sz >> 10, socket_id);
+
+	return 0;
+}
+
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+	uint64_t page_sz;
+	size_t mem_sz;
+	void *addr;
+	int flags = 0;
+
+#ifdef RTE_ARCH_PPC_64
+	flags |= MAP_HUGETLB;
+#endif
+
+	page_sz = msl->page_sz;
+	mem_sz = page_sz * msl->memseg_arr.len;
+
+	addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+	if (addr == NULL) {
+		if (rte_errno == EADDRNOTAVAIL)
+			RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+				(unsigned long long)mem_sz, msl->base_va);
+		else
+			RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+		return -1;
+	}
+	msl->base_va = addr;
+
+	return 0;
+}
+
+/*
+ * Our VA space is not preallocated yet, so preallocate it here. We need to know
+ * how many segments there are in order to map all pages into one address space,
+ * and leave appropriate holes between segments so that rte_malloc does not
+ * concatenate them into one big segment.
+ *
+ * we also need to unmap original pages to free up address space.
+ */
+static int __rte_unused
+prealloc_segments(struct hugepage_file *hugepages, int n_pages)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	int cur_page, seg_start_page, end_seg, new_memseg;
+	unsigned int hpi_idx, socket, i;
+	int n_contig_segs, n_segs;
+	int msl_idx;
+
+	/* before we preallocate segments, we need to free up our VA space.
+	 * we're not removing files, and we already have information about
+	 * PA-contiguousness, so it is safe to unmap everything.
+	 */
+	for (cur_page = 0; cur_page < n_pages; cur_page++) {
+		struct hugepage_file *hpi = &hugepages[cur_page];
+		munmap(hpi->orig_va, hpi->size);
+		hpi->orig_va = NULL;
+	}
+
+	/* we cannot know how many page sizes and sockets we have discovered, so
+	 * loop over all of them
+	 */
+	for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes;
+			hpi_idx++) {
+		uint64_t page_sz =
+			internal_config.hugepage_info[hpi_idx].hugepage_sz;
+
+		for (i = 0; i < rte_socket_count(); i++) {
+			struct rte_memseg_list *msl;
+
+			socket = rte_socket_id_by_idx(i);
+			n_contig_segs = 0;
+			n_segs = 0;
+			seg_start_page = -1;
+
+			for (cur_page = 0; cur_page < n_pages; cur_page++) {
+				struct hugepage_file *prev, *cur;
+				int prev_seg_start_page = -1;
+
+				cur = &hugepages[cur_page];
+				prev = cur_page == 0 ? NULL :
+						&hugepages[cur_page - 1];
+
+				new_memseg = 0;
+				end_seg = 0;
+
+				if (cur->size == 0)
+					end_seg = 1;
+				else if (cur->socket_id != (int) socket)
+					end_seg = 1;
+				else if (cur->size != page_sz)
+					end_seg = 1;
+				else if (cur_page == 0)
+					new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+				/* On PPC64 architecture, the mmap always start
+				 * from higher address to lower address. Here,
+				 * physical addresses are in descending order.
+				 */
+				else if ((prev->physaddr - cur->physaddr) !=
+						cur->size)
+					new_memseg = 1;
+#else
+				else if ((cur->physaddr - prev->physaddr) !=
+						cur->size)
+					new_memseg = 1;
+#endif
+				if (new_memseg) {
+					/* if we're already inside a segment,
+					 * new segment means end of current one
+					 */
+					if (seg_start_page != -1) {
+						end_seg = 1;
+						prev_seg_start_page =
+								seg_start_page;
+					}
+					seg_start_page = cur_page;
+				}
+
+				if (end_seg) {
+					if (prev_seg_start_page != -1) {
+						/* we've found a new segment */
+						n_contig_segs++;
+						n_segs += cur_page -
+							prev_seg_start_page;
+					} else if (seg_start_page != -1) {
+						/* we didn't find new segment,
+						 * but did end current one
+						 */
+						n_contig_segs++;
+						n_segs += cur_page -
+								seg_start_page;
+						seg_start_page = -1;
+						continue;
+					} else {
+						/* we're skipping this page */
+						continue;
+					}
+				}
+				/* segment continues */
+			}
+			/* check if we missed last segment */
+			if (seg_start_page != -1) {
+				n_contig_segs++;
+				n_segs += cur_page - seg_start_page;
+			}
+
+			/* if no segments were found, do not preallocate */
+			if (n_segs == 0)
+				continue;
+
+			/* we now have total number of pages that we will
+			 * allocate for this segment list. add separator pages
+			 * to the total count, and preallocate VA space.
+			 */
+			n_segs += n_contig_segs - 1;
+
+			/* now, preallocate VA space for these segments */
+
+			/* first, find suitable memseg list for this */
+			for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+					msl_idx++) {
+				msl = &mcfg->memsegs[msl_idx];
+
+				if (msl->base_va != NULL)
+					continue;
+				break;
+			}
+			if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+				RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n",
+					RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+				return -1;
+			}
+
+			/* now, allocate fbarray itself */
+			if (alloc_memseg_list(msl, page_sz, n_segs, socket,
+						msl_idx) < 0)
+				return -1;
+
+			/* finally, allocate VA space */
+			if (alloc_va_space(msl) < 0)
+				return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * We cannot reallocate memseg lists on the fly because PPC64 stores pages
+ * backwards, therefore we have to process the entire memseg first before
+ * remapping it into memseg list VA space.
+ */
+static int
+remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
+{
+	int cur_page, seg_start_page, new_memseg, ret;
+
+	seg_start_page = 0;
+	for (cur_page = 0; cur_page < n_pages; cur_page++) {
+		struct hugepage_file *prev, *cur;
+
+		new_memseg = 0;
+
+		cur = &hugepages[cur_page];
+		prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
+
+		/* if size is zero, no more pages left */
+		if (cur->size == 0)
+			break;
+
+		if (cur_page == 0)
+			new_memseg = 1;
+		else if (cur->socket_id != prev->socket_id)
+			new_memseg = 1;
+		else if (cur->size != prev->size)
+			new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+		/* On PPC64 architecture, the mmap always start from higher
+		 * address to lower address. Here, physical addresses are in
+		 * descending order.
+		 */
+		else if ((prev->physaddr - cur->physaddr) != cur->size)
+			new_memseg = 1;
+#else
+		else if ((cur->physaddr - prev->physaddr) != cur->size)
+			new_memseg = 1;
+#endif
+
+		if (new_memseg) {
+			/* if this isn't the first time, remap segment */
+			if (cur_page != 0) {
+				ret = remap_segment(hugepages, seg_start_page,
+						cur_page);
+				if (ret != 0)
+					return -1;
+			}
+			/* remember where we started */
+			seg_start_page = cur_page;
+		}
+		/* continuation of previous memseg */
+	}
+	/* we were stopped, but we didn't remap the last segment, do it now */
+	if (cur_page != 0) {
+		ret = remap_segment(hugepages, seg_start_page,
+				cur_page);
+		if (ret != 0)
+			return -1;
+	}
+	return 0;
+}
+
 static inline uint64_t
 get_socket_mem_size(int socket)
 {
@@ -788,7 +1049,7 @@ get_socket_mem_size(int socket)
 
 	for (i = 0; i < internal_config.num_hugepage_sizes; i++){
 		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (hpi->hugedir != NULL)
+		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0)
 			size += hpi->hugepage_sz * hpi->num_pages[socket];
 	}
 
@@ -818,8 +1079,10 @@ calc_num_pages_per_socket(uint64_t * memory,
 
 	/* if specific memory amounts per socket weren't requested */
 	if (internal_config.force_sockets == 0) {
+		size_t total_size;
+#ifdef RTE_ARCH_64
 		int cpu_per_socket[RTE_MAX_NUMA_NODES];
-		size_t default_size, total_size;
+		size_t default_size;
 		unsigned lcore_id;
 
 		/* Compute number of cores per socket */
@@ -837,7 +1100,7 @@ calc_num_pages_per_socket(uint64_t * memory,
 
 			/* Set memory amount per socket */
 			default_size = (internal_config.memory * cpu_per_socket[socket])
-			                / rte_lcore_count();
+					/ rte_lcore_count();
 
 			/* Limit to maximum available memory on socket */
 			default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
@@ -854,18 +1117,40 @@ calc_num_pages_per_socket(uint64_t * memory,
 		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
 			/* take whatever is available */
 			default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
-			                       total_size);
+					       total_size);
 
 			/* Update sizes */
 			memory[socket] += default_size;
 			total_size -= default_size;
 		}
+#else
+		/* in 32-bit mode, allocate all of the memory only on master
+		 * lcore socket
+		 */
+		total_size = internal_config.memory;
+		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
+				socket++) {
+			struct rte_config *cfg = rte_eal_get_configuration();
+			unsigned int master_lcore_socket;
+
+			master_lcore_socket =
+				rte_lcore_to_socket_id(cfg->master_lcore);
+
+			if (master_lcore_socket != socket)
+				continue;
+
+			/* Update sizes */
+			memory[socket] = total_size;
+			break;
+		}
+#endif
 	}
 
 	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
 		/* skips if the memory on specific socket wasn't requested */
 		for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
-			hp_used[i].hugedir = hp_info[i].hugedir;
+			strlcpy(hp_used[i].hugedir, hp_info[i].hugedir,
+				sizeof(hp_used[i].hugedir));
 			hp_used[i].num_pages[socket] = RTE_MIN(
 					memory[socket] / hp_info[i].hugepage_sz,
 					hp_info[i].num_pages[socket]);
@@ -907,7 +1192,8 @@ calc_num_pages_per_socket(uint64_t * memory,
 			}
 		}
 		/* if we didn't satisfy all memory requirements per socket */
-		if (memory[socket] > 0) {
+		if (memory[socket] > 0 &&
+				internal_config.socket_mem[socket] != 0) {
 			/* to prevent icc errors */
 			requested = (unsigned) (internal_config.socket_mem[socket] /
 					0x100000);
@@ -939,7 +1225,7 @@ eal_get_hugepage_mem_size(void)
 
 	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
 		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (hpi->hugedir != NULL) {
+		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
 			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
 				size += hpi->hugepage_sz * hpi->num_pages[j];
 			}
@@ -987,17 +1273,19 @@ huge_recover_sigbus(void)
  *  6. unmap the first mapping
  *  7. fill memsegs in configuration with contiguous zones
  */
-int
-rte_eal_hugepage_init(void)
+static int
+eal_legacy_hugepage_init(void)
 {
 	struct rte_mem_config *mcfg;
 	struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
 	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+	struct rte_fbarray *arr;
+	struct rte_memseg *ms;
 
 	uint64_t memory[RTE_MAX_NUMA_NODES];
 
 	unsigned hp_offset;
-	int i, j, new_memseg;
+	int i, j;
 	int nr_hugefiles, nr_hugepages = 0;
 	void *addr;
 
@@ -1010,21 +1298,54 @@ rte_eal_hugepage_init(void)
 
 	/* hugetlbfs can be disabled */
 	if (internal_config.no_hugetlbfs) {
+		struct rte_memseg_list *msl;
+		uint64_t page_sz;
+		int n_segs, cur_seg;
+
+		/* nohuge mode is legacy mode */
+		internal_config.legacy_mem = 1;
+
+		/* create a memseg list */
+		msl = &mcfg->memsegs[0];
+
+		page_sz = RTE_PGSIZE_4K;
+		n_segs = internal_config.memory / page_sz;
+
+		if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+					sizeof(struct rte_memseg))) {
+			RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+			return -1;
+		}
+
 		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
-				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 		if (addr == MAP_FAILED) {
 			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
 					strerror(errno));
 			return -1;
 		}
-		if (rte_eal_iova_mode() == RTE_IOVA_VA)
-			mcfg->memseg[0].iova = (uintptr_t)addr;
-		else
-			mcfg->memseg[0].iova = RTE_BAD_IOVA;
-		mcfg->memseg[0].addr = addr;
-		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
-		mcfg->memseg[0].len = internal_config.memory;
-		mcfg->memseg[0].socket_id = 0;
+		msl->base_va = addr;
+		msl->page_sz = page_sz;
+		msl->socket_id = 0;
+
+		/* populate memsegs. each memseg is one page long */
+		for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+			arr = &msl->memseg_arr;
+
+			ms = rte_fbarray_get(arr, cur_seg);
+			if (rte_eal_iova_mode() == RTE_IOVA_VA)
+				ms->iova = (uintptr_t)addr;
+			else
+				ms->iova = RTE_BAD_IOVA;
+			ms->addr = addr;
+			ms->hugepage_sz = page_sz;
+			ms->socket_id = 0;
+			ms->len = page_sz;
+
+			rte_fbarray_set_used(arr, cur_seg);
+
+			addr = RTE_PTR_ADD(addr, (size_t)page_sz);
+		}
 		return 0;
 	}
 
@@ -1057,7 +1378,6 @@ rte_eal_hugepage_init(void)
 	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
 		memory[i] = internal_config.socket_mem[i];
 
-
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
 		unsigned pages_old, pages_new;
@@ -1075,8 +1395,7 @@ rte_eal_hugepage_init(void)
 
 		/* map all hugepages available */
 		pages_old = hpi->num_pages[0];
-		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,
-					      memory, 1);
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
 		if (pages_new < pages_old) {
 			RTE_LOG(DEBUG, EAL,
 				"%d not %d hugepages of size %u MB allocated\n",
@@ -1091,7 +1410,8 @@ rte_eal_hugepage_init(void)
 				continue;
 		}
 
-		if (phys_addrs_available) {
+		if (phys_addrs_available &&
+				rte_eal_iova_mode() != RTE_IOVA_VA) {
 			/* find physical addresses for each hugepage */
 			if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
 				RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
@@ -1118,18 +1438,6 @@ rte_eal_hugepage_init(void)
 		qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
 		      sizeof(struct hugepage_file), cmp_physaddr);
 
-		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=
-		    hpi->num_pages[0]) {
-			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
-		}
-
-		/* unmap original mappings */
-		if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
-			goto fail;
-
 		/* we have processed a num of hugepages of this size, so inc offset */
 		hp_offset += hpi->num_pages[0];
 	}
@@ -1191,7 +1499,7 @@ rte_eal_hugepage_init(void)
 	}
 
 	/* create shared memory */
-	hugepage = create_shared_memory(eal_hugepage_info_path(),
+	hugepage = create_shared_memory(eal_hugepage_data_path(),
 			nr_hugefiles * sizeof(struct hugepage_file));
 
 	if (hugepage == NULL) {
@@ -1212,7 +1520,7 @@ rte_eal_hugepage_init(void)
 
 	/*
 	 * copy stuff from malloc'd hugepage* to the actual shared memory.
-	 * this procedure only copies those hugepages that have final_va
+	 * this procedure only copies those hugepages that have orig_va
 	 * not NULL. has overflow protection.
 	 */
 	if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
@@ -1221,6 +1529,23 @@ rte_eal_hugepage_init(void)
 		goto fail;
 	}
 
+#ifndef RTE_ARCH_64
+	/* for legacy 32-bit mode, we did not preallocate VA space, so do it */
+	if (internal_config.legacy_mem &&
+			prealloc_segments(hugepage, nr_hugefiles)) {
+		RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
+		goto fail;
+	}
+#endif
+
+	/* remap all pages we do need into memseg list VA space, so that those
+	 * pages become first-class citizens in DPDK memory subsystem
+	 */
+	if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
+		RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
+		goto fail;
+	}
+
 	/* free the hugepage backing files */
 	if (internal_config.hugepage_unlink &&
 		unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
@@ -1232,75 +1557,30 @@ rte_eal_hugepage_init(void)
 	free(tmp_hp);
 	tmp_hp = NULL;
 
-	/* first memseg index shall be 0 after incrementing it below */
-	j = -1;
-	for (i = 0; i < nr_hugefiles; i++) {
-		new_memseg = 0;
-
-		/* if this is a new section, create a new memseg */
-		if (i == 0)
-			new_memseg = 1;
-		else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
-			new_memseg = 1;
-		else if (hugepage[i].size != hugepage[i-1].size)
-			new_memseg = 1;
-
-#ifdef RTE_ARCH_PPC_64
-		/* On PPC64 architecture, the mmap always start from higher
-		 * virtual address to lower address. Here, both the physical
-		 * address and virtual address are in descending order */
-		else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
-		    hugepage[i].size)
-			new_memseg = 1;
-		else if (((unsigned long)hugepage[i-1].final_va -
-		    (unsigned long)hugepage[i].final_va) != hugepage[i].size)
-			new_memseg = 1;
-#else
-		else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
-		    hugepage[i].size)
-			new_memseg = 1;
-		else if (((unsigned long)hugepage[i].final_va -
-		    (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
-			new_memseg = 1;
-#endif
+	munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
 
-		if (new_memseg) {
-			j += 1;
-			if (j == RTE_MAX_MEMSEG)
-				break;
+	/* we're not going to allocate more pages, so release VA space for
+	 * unused memseg lists
+	 */
+	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+		struct rte_memseg_list *msl = &mcfg->memsegs[i];
+		size_t mem_sz;
 
-			mcfg->memseg[j].iova = hugepage[i].physaddr;
-			mcfg->memseg[j].addr = hugepage[i].final_va;
-			mcfg->memseg[j].len = hugepage[i].size;
-			mcfg->memseg[j].socket_id = hugepage[i].socket_id;
-			mcfg->memseg[j].hugepage_sz = hugepage[i].size;
-		}
-		/* continuation of previous memseg */
-		else {
-#ifdef RTE_ARCH_PPC_64
-		/* Use the phy and virt address of the last page as segment
-		 * address for IBM Power architecture */
-			mcfg->memseg[j].iova = hugepage[i].physaddr;
-			mcfg->memseg[j].addr = hugepage[i].final_va;
-#endif
-			mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
-		}
-		hugepage[i].memseg_id = j;
-	}
+		/* skip inactive lists */
+		if (msl->base_va == NULL)
+			continue;
+		/* skip lists where there is at least one page allocated */
+		if (msl->memseg_arr.count > 0)
+			continue;
+		/* this is an unused list, deallocate it */
+		mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len;
+		munmap(msl->base_va, mem_sz);
+		msl->base_va = NULL;
 
-	if (i < nr_hugefiles) {
-		RTE_LOG(ERR, EAL, "Can only reserve %d pages "
-			"from %d requested\n"
-			"Current %s=%d is not enough\n"
-			"Please either increase it or request less amount "
-			"of memory.\n",
-			i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
-			RTE_MAX_MEMSEG);
-		goto fail;
+		/* destroy backing fbarray */
+		rte_fbarray_destroy(&msl->memseg_arr);
 	}
 
-	munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
-
 	return 0;
 
 fail:
@@ -1312,6 +1592,104 @@ fail:
 	return -1;
 }
 
+static int __rte_unused
+hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
+{
+	struct hugepage_info *hpi = arg;
+
+	if (msl->page_sz != hpi->hugepage_sz)
+		return 0;
+
+	hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
+	return 0;
+}
+
+static int
+eal_hugepage_init(void)
+{
+	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+	uint64_t memory[RTE_MAX_NUMA_NODES];
+	int hp_sz_idx, socket_id;
+
+	test_phys_addrs_available();
+
+	memset(used_hp, 0, sizeof(used_hp));
+
+	for (hp_sz_idx = 0;
+			hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+			hp_sz_idx++) {
+#ifndef RTE_ARCH_64
+		struct hugepage_info dummy;
+		unsigned int i;
+#endif
+		/* also initialize used_hp hugepage sizes in used_hp */
+		struct hugepage_info *hpi;
+		hpi = &internal_config.hugepage_info[hp_sz_idx];
+		used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
+
+#ifndef RTE_ARCH_64
+		/* for 32-bit, limit number of pages on socket to whatever we've
+		 * preallocated, as we cannot allocate more.
+		 */
+		memset(&dummy, 0, sizeof(dummy));
+		dummy.hugepage_sz = hpi->hugepage_sz;
+		if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
+			return -1;
+
+		for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
+			hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
+					dummy.num_pages[i]);
+		}
+#endif
+	}
+
+	/* make a copy of socket_mem, needed for balanced allocation. */
+	for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
+		memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];
+
+	/* calculate final number of pages */
+	if (calc_num_pages_per_socket(memory,
+			internal_config.hugepage_info, used_hp,
+			internal_config.num_hugepage_sizes) < 0)
+		return -1;
+
+	for (hp_sz_idx = 0;
+			hp_sz_idx < (int)internal_config.num_hugepage_sizes;
+			hp_sz_idx++) {
+		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
+				socket_id++) {
+			struct rte_memseg **pages;
+			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
+			unsigned int num_pages = hpi->num_pages[socket_id];
+			int num_pages_alloc, i;
+
+			if (num_pages == 0)
+				continue;
+
+			pages = malloc(sizeof(*pages) * num_pages);
+
+			RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
+				num_pages, hpi->hugepage_sz >> 20, socket_id);
+
+			num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages,
+					num_pages, hpi->hugepage_sz,
+					socket_id, true);
+			if (num_pages_alloc < 0) {
+				free(pages);
+				return -1;
+			}
+
+			/* mark preallocated pages as unfreeable */
+			for (i = 0; i < num_pages_alloc; i++) {
+				struct rte_memseg *ms = pages[i];
+				ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
+			}
+			free(pages);
+		}
+	}
+	return 0;
+}
+
 /*
  * uses fstat to report the size of a file on disk
  */
@@ -1330,16 +1708,15 @@ getFileSize(int fd)
  * configuration and finds the hugepages which form that segment, mapping them
  * in order to form a contiguous block in the virtual memory space
  */
-int
-rte_eal_hugepage_attach(void)
+static int
+eal_legacy_hugepage_attach(void)
 {
-	const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	struct hugepage_file *hp = NULL;
-	unsigned num_hp = 0;
-	unsigned i, s = 0; /* s used to track the segment number */
-	unsigned max_seg = RTE_MAX_MEMSEG;
+	unsigned int num_hp = 0;
+	unsigned int i = 0;
+	unsigned int cur_seg;
 	off_t size = 0;
-	int fd, fd_zero = -1, fd_hugepage = -1;
+	int fd, fd_hugepage = -1;
 
 	if (aslr_enabled() > 0) {
 		RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
@@ -1350,137 +1727,114 @@ rte_eal_hugepage_attach(void)
 
 	test_phys_addrs_available();
 
-	fd_zero = open("/dev/zero", O_RDONLY);
-	if (fd_zero < 0) {
-		RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
-		goto error;
-	}
-	fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
+	fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
 	if (fd_hugepage < 0) {
-		RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
+		RTE_LOG(ERR, EAL, "Could not open %s\n",
+				eal_hugepage_data_path());
 		goto error;
 	}
 
-	/* map all segments into memory to make sure we get the addrs */
-	for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
-		void *base_addr;
-
-		/*
-		 * the first memory segment with len==0 is the one that
-		 * follows the last valid segment.
-		 */
-		if (mcfg->memseg[s].len == 0)
-			break;
-
-		/*
-		 * fdzero is mmapped to get a contiguous block of virtual
-		 * addresses of the appropriate memseg size.
-		 * use mmap to get identical addresses as the primary process.
-		 */
-		base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
-				 PROT_READ,
-#ifdef RTE_ARCH_PPC_64
-				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-#else
-				 MAP_PRIVATE,
-#endif
-				 fd_zero, 0);
-		if (base_addr == MAP_FAILED ||
-		    base_addr != mcfg->memseg[s].addr) {
-			max_seg = s;
-			if (base_addr != MAP_FAILED) {
-				/* errno is stale, don't use */
-				RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
-					"in /dev/zero at [%p], got [%p] - "
-					"please use '--base-virtaddr' option\n",
-					(unsigned long long)mcfg->memseg[s].len,
-					mcfg->memseg[s].addr, base_addr);
-				munmap(base_addr, mcfg->memseg[s].len);
-			} else {
-				RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
-					"in /dev/zero at [%p]: '%s'\n",
-					(unsigned long long)mcfg->memseg[s].len,
-					mcfg->memseg[s].addr, strerror(errno));
-			}
-			if (aslr_enabled() > 0) {
-				RTE_LOG(ERR, EAL, "It is recommended to "
-					"disable ASLR in the kernel "
-					"and retry running both primary "
-					"and secondary processes\n");
-			}
-			goto error;
-		}
-	}
-
 	size = getFileSize(fd_hugepage);
 	hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
 	if (hp == MAP_FAILED) {
-		RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
+		RTE_LOG(ERR, EAL, "Could not mmap %s\n",
+				eal_hugepage_data_path());
 		goto error;
 	}
 
 	num_hp = size / sizeof(struct hugepage_file);
 	RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
 
-	s = 0;
-	while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
-		void *addr, *base_addr;
-		uintptr_t offset = 0;
-		size_t mapping_size;
-		/*
-		 * free previously mapped memory so we can map the
-		 * hugepages into the space
-		 */
-		base_addr = mcfg->memseg[s].addr;
-		munmap(base_addr, mcfg->memseg[s].len);
-
-		/* find the hugepages for this segment and map them
-		 * we don't need to worry about order, as the server sorted the
-		 * entries before it did the second mmap of them */
-		for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
-			if (hp[i].memseg_id == (int)s){
-				fd = open(hp[i].filepath, O_RDWR);
-				if (fd < 0) {
-					RTE_LOG(ERR, EAL, "Could not open %s\n",
-						hp[i].filepath);
-					goto error;
-				}
-				mapping_size = hp[i].size;
-				addr = mmap(RTE_PTR_ADD(base_addr, offset),
-						mapping_size, PROT_READ | PROT_WRITE,
-						MAP_SHARED, fd, 0);
-				close(fd); /* close file both on success and on failure */
-				if (addr == MAP_FAILED ||
-						addr != RTE_PTR_ADD(base_addr, offset)) {
-					RTE_LOG(ERR, EAL, "Could not mmap %s\n",
-						hp[i].filepath);
-					goto error;
-				}
-				offset+=mapping_size;
-			}
+	/* map all segments into memory to make sure we get the addrs. the
+	 * segments themselves are already in memseg list (which is shared and
+	 * has its VA space already preallocated), so we just need to map
+	 * everything into correct addresses.
+	 */
+	for (i = 0; i < num_hp; i++) {
+		struct hugepage_file *hf = &hp[i];
+		size_t map_sz = hf->size;
+		void *map_addr = hf->final_va;
+
+		/* if size is zero, no more pages left */
+		if (map_sz == 0)
+			break;
+
+		fd = open(hf->filepath, O_RDWR);
+		if (fd < 0) {
+			RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
+				hf->filepath, strerror(errno));
+			goto error;
+		}
+
+		map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
+				MAP_SHARED | MAP_FIXED, fd, 0);
+		if (map_addr == MAP_FAILED) {
+			RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
+				hf->filepath, strerror(errno));
+			close(fd);
+			goto error;
+		}
+
+		/* set shared lock on the file. */
+		if (flock(fd, LOCK_SH) < 0) {
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
+				__func__, strerror(errno));
+			close(fd);
+			goto error;
 		}
-		RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
-				(unsigned long long)mcfg->memseg[s].len);
-		s++;
+
+		close(fd);
 	}
 	/* unmap the hugepage config file, since we are done using it */
 	munmap(hp, size);
-	close(fd_zero);
 	close(fd_hugepage);
 	return 0;
 
 error:
-	for (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++)
-		munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
+	/* map all segments into memory to make sure we get the addrs */
+	cur_seg = 0;
+	for (cur_seg = 0; cur_seg < i; cur_seg++) {
+		struct hugepage_file *hf = &hp[i];
+		size_t map_sz = hf->size;
+		void *map_addr = hf->final_va;
+
+		munmap(map_addr, map_sz);
+	}
 	if (hp != NULL && hp != MAP_FAILED)
 		munmap(hp, size);
-	if (fd_zero >= 0)
-		close(fd_zero);
 	if (fd_hugepage >= 0)
 		close(fd_hugepage);
 	return -1;
 }
 
+static int
+eal_hugepage_attach(void)
+{
+	if (eal_memalloc_sync_with_primary()) {
+		RTE_LOG(ERR, EAL, "Could not map memory from primary process\n");
+		if (aslr_enabled() > 0)
+			RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n");
+		return -1;
+	}
+	return 0;
+}
+
+int
+rte_eal_hugepage_init(void)
+{
+	return internal_config.legacy_mem ?
+			eal_legacy_hugepage_init() :
+			eal_hugepage_init();
+}
+
+int
+rte_eal_hugepage_attach(void)
+{
+	return internal_config.legacy_mem ?
+			eal_legacy_hugepage_attach() :
+			eal_hugepage_attach();
+}
+
 int
 rte_eal_using_phys_addrs(void)
 {
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index 08e150b7..f652ff98 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -119,7 +119,7 @@ eal_thread_loop(__attribute__((unused)) void *arg)
 	if (eal_thread_set_affinity() < 0)
 		rte_panic("cannot set affinity\n");
 
-	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
+	ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
 
 	RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
 		lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "...");
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
index 161322f2..2766bd78 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -137,7 +137,6 @@ int
 rte_eal_hpet_init(int make_default)
 {
 	int fd, ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
 
 	if (internal_config.no_hpet) {
 		RTE_LOG(NOTICE, EAL, "HPET is disabled\n");
@@ -178,7 +177,7 @@ rte_eal_hpet_init(int make_default)
 
 	/* create a thread that will increment a global variable for
 	 * msb (hpet is 32 bits by default under linux) */
-	ret = pthread_create(&msb_inc_thread_id, NULL,
+	ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
 			(void *(*)(void *))hpet_msb_inc, NULL);
 	if (ret != 0) {
 		RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
@@ -186,15 +185,6 @@ rte_eal_hpet_init(int make_default)
 		return -1;
 	}
 
-	/*
-	 * Set thread_name for aid in debugging.
-	 */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "hpet-msb-inc");
-	ret = rte_thread_setname(msb_inc_thread_id, thread_name);
-	if (ret != 0)
-		RTE_LOG(DEBUG, EAL,
-			"Cannot set HPET timer thread name!\n");
-
 	if (make_default)
 		eal_timer_source = EAL_TIMER_HPET;
 	return 0;
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d0..a2bbdfbf 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,12 +1,14 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
+#include <inttypes.h>
 #include <string.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 
+#include <rte_errno.h>
 #include <rte_log.h>
 #include <rte_memory.h>
 #include <rte_eal_memconfig.h>
@@ -18,59 +20,294 @@
 
 #ifdef VFIO_PRESENT
 
+#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
+
+/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
+ * recreate the mappings for DPDK segments, but we cannot do so for memory that
+ * was registered by the user themselves, so we need to store the user mappings
+ * somewhere, to recreate them later.
+ */
+#define VFIO_MAX_USER_MEM_MAPS 256
+struct user_mem_map {
+	uint64_t addr;
+	uint64_t iova;
+	uint64_t len;
+};
+
+struct user_mem_maps {
+	rte_spinlock_recursive_t lock;
+	int n_maps;
+	struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
+};
+
+struct vfio_config {
+	int vfio_enabled;
+	int vfio_container_fd;
+	int vfio_active_groups;
+	const struct vfio_iommu_type *vfio_iommu_type;
+	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
+	struct user_mem_maps mem_maps;
+};
+
 /* per-process VFIO config */
-static struct vfio_config vfio_cfg;
+static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
+static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
 
 static int vfio_type1_dma_map(int);
+static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
 static int vfio_spapr_dma_map(int);
+static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
 static int vfio_noiommu_dma_map(int);
+static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
+static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
+		uint64_t iova, uint64_t len, int do_map);
 
 /* IOMMU types we support */
 static const struct vfio_iommu_type iommu_types[] = {
 	/* x86 IOMMU, otherwise known as type 1 */
-	{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+	{
+		.type_id = RTE_VFIO_TYPE1,
+		.name = "Type 1",
+		.dma_map_func = &vfio_type1_dma_map,
+		.dma_user_map_func = &vfio_type1_dma_mem_map
+	},
 	/* ppc64 IOMMU, otherwise known as spapr */
-	{ RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
+	{
+		.type_id = RTE_VFIO_SPAPR,
+		.name = "sPAPR",
+		.dma_map_func = &vfio_spapr_dma_map,
+		.dma_user_map_func = &vfio_spapr_dma_mem_map
+	},
 	/* IOMMU-less mode */
-	{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+	{
+		.type_id = RTE_VFIO_NOIOMMU,
+		.name = "No-IOMMU",
+		.dma_map_func = &vfio_noiommu_dma_map,
+		.dma_user_map_func = &vfio_noiommu_dma_mem_map
+	},
 };
 
-int
-vfio_get_group_fd(int iommu_group_no)
+/* for sPAPR IOMMU, we will need to walk memseg list, but we cannot use
+ * rte_memseg_walk() because by the time we enter callback we will be holding a
+ * write lock, so regular rte-memseg_walk will deadlock. copying the same
+ * iteration code everywhere is not ideal as well. so, use a lockless copy of
+ * memseg walk here.
+ */
+static int
+memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
 {
-	int i;
-	int vfio_group_fd;
-	char filename[PATH_MAX];
-	struct vfio_group *cur_grp;
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	int i, ms_idx, ret = 0;
 
-	/* check if we already have the group descriptor open */
-	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
-			return vfio_cfg.vfio_groups[i].fd;
+	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+		struct rte_memseg_list *msl = &mcfg->memsegs[i];
+		const struct rte_memseg *ms;
+		struct rte_fbarray *arr;
 
-	/* Lets see first if there is room for a new group */
-	if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
-		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
-		return -1;
-	}
+		if (msl->memseg_arr.count == 0)
+			continue;
 
-	/* Now lets get an index for the new group */
-	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].group_no == -1) {
-			cur_grp = &vfio_cfg.vfio_groups[i];
-			break;
+		arr = &msl->memseg_arr;
+
+		ms_idx = rte_fbarray_find_next_used(arr, 0);
+		while (ms_idx >= 0) {
+			ms = rte_fbarray_get(arr, ms_idx);
+			ret = func(msl, ms, arg);
+			if (ret < 0)
+				return -1;
+			if (ret > 0)
+				return 1;
+			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
 		}
+	}
+	return 0;
+}
 
-	/* This should not happen */
-	if (i == VFIO_MAX_GROUPS) {
-		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+static int
+is_null_map(const struct user_mem_map *map)
+{
+	return map->addr == 0 && map->iova == 0 && map->len == 0;
+}
+
+/* we may need to merge user mem maps together in case of user mapping/unmapping
+ * chunks of memory, so we'll need a comparator function to sort segments.
+ */
+static int
+user_mem_map_cmp(const void *a, const void *b)
+{
+	const struct user_mem_map *umm_a = a;
+	const struct user_mem_map *umm_b = b;
+
+	/* move null entries to end */
+	if (is_null_map(umm_a))
+		return 1;
+	if (is_null_map(umm_b))
+		return -1;
+
+	/* sort by iova first */
+	if (umm_a->iova < umm_b->iova)
 		return -1;
+	if (umm_a->iova > umm_b->iova)
+		return 1;
+
+	if (umm_a->addr < umm_b->addr)
+		return -1;
+	if (umm_a->addr > umm_b->addr)
+		return 1;
+
+	if (umm_a->len < umm_b->len)
+		return -1;
+	if (umm_a->len > umm_b->len)
+		return 1;
+
+	return 0;
+}
+
+/* adjust user map entry. this may result in shortening of existing map, or in
+ * splitting existing map in two pieces.
+ */
+static void
+adjust_map(struct user_mem_map *src, struct user_mem_map *end,
+		uint64_t remove_va_start, uint64_t remove_len)
+{
+	/* if va start is same as start address, we're simply moving start */
+	if (remove_va_start == src->addr) {
+		src->addr += remove_len;
+		src->iova += remove_len;
+		src->len -= remove_len;
+	} else if (remove_va_start + remove_len == src->addr + src->len) {
+		/* we're shrinking mapping from the end */
+		src->len -= remove_len;
+	} else {
+		/* we're blowing a hole in the middle */
+		struct user_mem_map tmp;
+		uint64_t total_len = src->len;
+
+		/* adjust source segment length */
+		src->len = remove_va_start - src->addr;
+
+		/* create temporary segment in the middle */
+		tmp.addr = src->addr + src->len;
+		tmp.iova = src->iova + src->len;
+		tmp.len = remove_len;
+
+		/* populate end segment - this one we will be keeping */
+		end->addr = tmp.addr + tmp.len;
+		end->iova = tmp.iova + tmp.len;
+		end->len = total_len - src->len - tmp.len;
 	}
+}
+
+/* try merging two maps into one, return 1 if succeeded */
+static int
+merge_map(struct user_mem_map *left, struct user_mem_map *right)
+{
+	if (left->addr + left->len != right->addr)
+		return 0;
+	if (left->iova + left->len != right->iova)
+		return 0;
+
+	left->len += right->len;
+
+	memset(right, 0, sizeof(*right));
+
+	return 1;
+}
+
+static struct user_mem_map *
+find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
+		uint64_t iova, uint64_t len)
+{
+	uint64_t va_end = addr + len;
+	uint64_t iova_end = iova + len;
+	int i;
+
+	for (i = 0; i < user_mem_maps->n_maps; i++) {
+		struct user_mem_map *map = &user_mem_maps->maps[i];
+		uint64_t map_va_end = map->addr + map->len;
+		uint64_t map_iova_end = map->iova + map->len;
+
+		/* check start VA */
+		if (addr < map->addr || addr >= map_va_end)
+			continue;
+		/* check if VA end is within boundaries */
+		if (va_end <= map->addr || va_end > map_va_end)
+			continue;
+
+		/* check start IOVA */
+		if (iova < map->iova || iova >= map_iova_end)
+			continue;
+		/* check if IOVA end is within boundaries */
+		if (iova_end <= map->iova || iova_end > map_iova_end)
+			continue;
+
+		/* we've found our map */
+		return map;
+	}
+	return NULL;
+}
+
+/* this will sort all user maps, and merge/compact any adjacent maps */
+static void
+compact_user_maps(struct user_mem_maps *user_mem_maps)
+{
+	int i, n_merged, cur_idx;
+
+	qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+			sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
+
+	/* we'll go over the list backwards when merging */
+	n_merged = 0;
+	for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+		struct user_mem_map *l, *r;
+
+		l = &user_mem_maps->maps[i];
+		r = &user_mem_maps->maps[i + 1];
+
+		if (is_null_map(l) || is_null_map(r))
+			continue;
+
+		if (merge_map(l, r))
+			n_merged++;
+	}
+
+	/* the entries are still sorted, but now they have holes in them, so
+	 * walk through the list and remove the holes
+	 */
+	if (n_merged > 0) {
+		cur_idx = 0;
+		for (i = 0; i < user_mem_maps->n_maps; i++) {
+			if (!is_null_map(&user_mem_maps->maps[i])) {
+				struct user_mem_map *src, *dst;
+
+				src = &user_mem_maps->maps[i];
+				dst = &user_mem_maps->maps[cur_idx++];
+
+				if (src != dst) {
+					memcpy(dst, src, sizeof(*src));
+					memset(src, 0, sizeof(*src));
+				}
+			}
+		}
+		user_mem_maps->n_maps = cur_idx;
+	}
+}
+
+static int
+vfio_open_group_fd(int iommu_group_num)
+{
+	int vfio_group_fd;
+	char filename[PATH_MAX];
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 		/* try regular group format */
 		snprintf(filename, sizeof(filename),
-				 VFIO_GROUP_FMT, iommu_group_no);
+				 VFIO_GROUP_FMT, iommu_group_num);
 		vfio_group_fd = open(filename, O_RDWR);
 		if (vfio_group_fd < 0) {
 			/* if file not found, it's not an error */
@@ -82,7 +319,8 @@ vfio_get_group_fd(int iommu_group_no)
 
 			/* special case: try no-IOMMU path as well */
 			snprintf(filename, sizeof(filename),
-					VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+					VFIO_NOIOMMU_GROUP_FMT,
+					iommu_group_num);
 			vfio_group_fd = open(filename, O_RDWR);
 			if (vfio_group_fd < 0) {
 				if (errno != ENOENT) {
@@ -95,162 +333,293 @@ vfio_get_group_fd(int iommu_group_no)
 			/* noiommu group found */
 		}
 
-		cur_grp->group_no = iommu_group_no;
-		cur_grp->fd = vfio_group_fd;
-		vfio_cfg.vfio_active_groups++;
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
-	 * process via our socket
+	 * process via mp channel.
 	 */
-	else {
-		int socket_fd, ret;
+	p->req = SOCKET_REQ_GROUP;
+	p->group_num = iommu_group_num;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	vfio_group_fd = -1;
+	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+			vfio_group_fd = mp_rep->fds[0];
+		} else if (p->result == SOCKET_NO_FD) {
+			RTE_LOG(ERR, EAL, "  bad VFIO group fd\n");
+			vfio_group_fd = 0;
+		}
+		free(mp_reply.msgs);
+	}
 
-		socket_fd = vfio_mp_sync_connect_to_primary();
+	if (vfio_group_fd < 0)
+		RTE_LOG(ERR, EAL, "  cannot request group fd\n");
+	return vfio_group_fd;
+}
 
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
-			close(socket_fd);
-			return -1;
-		}
-		ret = vfio_mp_sync_receive_request(socket_fd);
-		switch (ret) {
-		case SOCKET_NO_FD:
-			close(socket_fd);
-			return 0;
-		case SOCKET_OK:
-			vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
-			/* if we got the fd, store it and return it */
-			if (vfio_group_fd > 0) {
-				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
-				return vfio_group_fd;
-			}
-			/* fall-through on error */
-		default:
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+static struct vfio_config *
+get_vfio_cfg_by_group_num(int iommu_group_num)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		vfio_cfg = &vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+			if (vfio_cfg->vfio_groups[j].group_num ==
+					iommu_group_num)
+				return vfio_cfg;
 		}
 	}
-	return -1;
+
+	return NULL;
 }
 
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		vfio_cfg = &vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++)
+			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+				return vfio_cfg;
+	}
 
-static int
-get_vfio_group_idx(int vfio_group_fd)
+	return NULL;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_container_fd(int container_fd)
 {
 	int i;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		if (vfio_cfgs[i].vfio_container_fd == container_fd)
+			return &vfio_cfgs[i];
+	}
+
+	return NULL;
+}
+
+int
+rte_vfio_get_group_fd(int iommu_group_num)
+{
+	int i;
+	int vfio_group_fd;
+	struct vfio_group *cur_grp;
+	struct vfio_config *vfio_cfg;
+
+	/* get the vfio_config it belongs to */
+	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+	/* check if we already have the group descriptor open */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd)
-			return i;
+		if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
+			return vfio_cfg->vfio_groups[i].fd;
+
+	/* Lets see first if there is room for a new group */
+	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+		return -1;
+	}
+
+	/* Now lets get an index for the new group */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_num == -1) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+		return -1;
+	}
+
+	vfio_group_fd = vfio_open_group_fd(iommu_group_num);
+	if (vfio_group_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
+		return -1;
+	}
+
+	cur_grp->group_num = iommu_group_num;
+	cur_grp->fd = vfio_group_fd;
+	vfio_cfg->vfio_active_groups++;
+
+	return vfio_group_fd;
+}
+
+static int
+get_vfio_group_idx(int vfio_group_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		vfio_cfg = &vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++)
+			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+				return j;
+	}
+
 	return -1;
 }
 
 static void
 vfio_group_device_get(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "  invalid group fd!\n");
+		return;
+	}
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 	else
-		vfio_cfg.vfio_groups[i].devices++;
+		vfio_cfg->vfio_groups[i].devices++;
 }
 
 static void
 vfio_group_device_put(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "  invalid group fd!\n");
+		return;
+	}
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 	else
-		vfio_cfg.vfio_groups[i].devices--;
+		vfio_cfg->vfio_groups[i].devices--;
 }
 
 static int
 vfio_group_device_count(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "  invalid group fd!\n");
+		return -1;
+	}
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 		return -1;
 	}
 
-	return vfio_cfg.vfio_groups[i].devices;
+	return vfio_cfg->vfio_groups[i].devices;
+}
+
+static void
+vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
+		void *arg __rte_unused)
+{
+	struct rte_memseg_list *msl;
+	struct rte_memseg *ms;
+	size_t cur_len = 0;
+
+	msl = rte_mem_virt2memseg_list(addr);
+
+	/* for IOVA as VA mode, no need to care for IOVA addresses */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
+		if (type == RTE_MEM_EVENT_ALLOC)
+			vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
+					len, 1);
+		else
+			vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
+					len, 0);
+		return;
+	}
+
+	/* memsegs are contiguous in memory */
+	ms = rte_mem_virt2memseg(addr, msl);
+	while (cur_len < len) {
+		if (type == RTE_MEM_EVENT_ALLOC)
+			vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
+					ms->iova, ms->len, 1);
+		else
+			vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
+					ms->iova, ms->len, 0);
+
+		cur_len += ms->len;
+		++ms;
+	}
 }
 
 int
 rte_vfio_clear_group(int vfio_group_fd)
 {
 	int i;
-	int socket_fd, ret;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+	struct vfio_config *vfio_cfg;
+
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "  invalid group fd!\n");
+		return -1;
+	}
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
 		i = get_vfio_group_idx(vfio_group_fd);
 		if (i < 0)
 			return -1;
-		vfio_cfg.vfio_groups[i].group_no = -1;
-		vfio_cfg.vfio_groups[i].fd = -1;
-		vfio_cfg.vfio_groups[i].devices = 0;
-		vfio_cfg.vfio_active_groups--;
+		vfio_cfg->vfio_groups[i].group_num = -1;
+		vfio_cfg->vfio_groups[i].fd = -1;
+		vfio_cfg->vfio_groups[i].devices = 0;
+		vfio_cfg->vfio_active_groups--;
 		return 0;
 	}
 
-	/* This is just for SECONDARY processes */
-	socket_fd = vfio_mp_sync_connect_to_primary();
-
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-		return -1;
-	}
-
-	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-		close(socket_fd);
-		return -1;
-	}
+	p->req = SOCKET_CLR_GROUP;
+	p->group_num = vfio_group_fd;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK) {
+			free(mp_reply.msgs);
+			return 0;
+		} else if (p->result == SOCKET_NO_FD)
+			RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
+		else
+			RTE_LOG(ERR, EAL, "  no such VFIO group fd!\n");
 
-	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-		close(socket_fd);
-		return -1;
+		free(mp_reply.msgs);
 	}
 
-	ret = vfio_mp_sync_receive_request(socket_fd);
-	switch (ret) {
-	case SOCKET_NO_FD:
-		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-		close(socket_fd);
-		break;
-	case SOCKET_OK:
-		close(socket_fd);
-		return 0;
-	case SOCKET_ERR:
-		RTE_LOG(ERR, EAL, "  Socket error\n");
-		close(socket_fd);
-		break;
-	default:
-		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-		close(socket_fd);
-	}
 	return -1;
 }
 
@@ -258,15 +627,20 @@ int
 rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info)
 {
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
 	struct vfio_group_status group_status = {
 			.argsz = sizeof(group_status)
 	};
+	struct vfio_config *vfio_cfg;
+	struct user_mem_maps *user_mem_maps;
+	int vfio_container_fd;
 	int vfio_group_fd;
-	int iommu_group_no;
-	int ret;
+	int iommu_group_num;
+	int i, ret;
 
 	/* get group number */
-	ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
+	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
 	if (ret == 0) {
 		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
 			dev_addr);
@@ -278,7 +652,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		return -1;
 
 	/* get the actual group fd */
-	vfio_group_fd = vfio_get_group_fd(iommu_group_no);
+	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
 	if (vfio_group_fd < 0)
 		return -1;
 
@@ -309,12 +683,18 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		return -1;
 	}
 
+	/* get the vfio_config it belongs to */
+	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+	vfio_container_fd = vfio_cfg->vfio_container_fd;
+	user_mem_maps = &vfio_cfg->mem_maps;
+
 	/* check if group does not have a container yet */
 	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
 
 		/* add group to a container */
 		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
-				&vfio_cfg.vfio_container_fd);
+				&vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container, "
 					"error %i (%s)\n", dev_addr, errno, strerror(errno));
@@ -332,10 +712,12 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		 * functionality.
 		 */
 		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_cfg.vfio_active_groups == 1) {
+				vfio_cfg->vfio_active_groups == 1 &&
+				vfio_group_device_count(vfio_group_fd) == 0) {
+			const struct vfio_iommu_type *t;
+
 			/* select an IOMMU type which we will be using */
-			const struct vfio_iommu_type *t =
-				vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+			t = vfio_set_iommu_type(vfio_container_fd);
 			if (!t) {
 				RTE_LOG(ERR, EAL,
 					"  %s failed to select IOMMU type\n",
@@ -344,15 +726,75 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 				rte_vfio_clear_group(vfio_group_fd);
 				return -1;
 			}
-			ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
+			/* lock memory hotplug before mapping and release it
+			 * after registering callback, to prevent races
+			 */
+			rte_rwlock_read_lock(mem_lock);
+			if (vfio_cfg == default_vfio_cfg)
+				ret = t->dma_map_func(vfio_container_fd);
+			else
+				ret = 0;
 			if (ret) {
 				RTE_LOG(ERR, EAL,
 					"  %s DMA remapping failed, error %i (%s)\n",
 					dev_addr, errno, strerror(errno));
 				close(vfio_group_fd);
 				rte_vfio_clear_group(vfio_group_fd);
+				rte_rwlock_read_unlock(mem_lock);
 				return -1;
 			}
+
+			vfio_cfg->vfio_iommu_type = t;
+
+			/* re-map all user-mapped segments */
+			rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+			/* this IOMMU type may not support DMA mapping, but
+			 * if we have mappings in the list - that means we have
+			 * previously mapped something successfully, so we can
+			 * be sure that DMA mapping is supported.
+			 */
+			for (i = 0; i < user_mem_maps->n_maps; i++) {
+				struct user_mem_map *map;
+				map = &user_mem_maps->maps[i];
+
+				ret = t->dma_user_map_func(
+						vfio_container_fd,
+						map->addr, map->iova, map->len,
+						1);
+				if (ret) {
+					RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
+							"va: 0x%" PRIx64 " "
+							"iova: 0x%" PRIx64 " "
+							"len: 0x%" PRIu64 "\n",
+							map->addr, map->iova,
+							map->len);
+					rte_spinlock_recursive_unlock(
+							&user_mem_maps->lock);
+					rte_rwlock_read_unlock(mem_lock);
+					return -1;
+				}
+			}
+			rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+
+			/* register callback for mem events */
+			if (vfio_cfg == default_vfio_cfg)
+				ret = rte_mem_event_callback_register(
+					VFIO_MEM_EVENT_CLB_NAME,
+					vfio_mem_event_callback, NULL);
+			else
+				ret = 0;
+			/* unlock memory hotplug */
+			rte_rwlock_read_unlock(mem_lock);
+
+			if (ret && rte_errno != ENOTSUP) {
+				RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
+				return -1;
+			}
+			if (ret)
+				RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
+			else
+				RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
 		}
 	}
 
@@ -390,30 +832,45 @@ int
 rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
 		    int vfio_dev_fd)
 {
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
 	struct vfio_group_status group_status = {
 			.argsz = sizeof(group_status)
 	};
+	struct vfio_config *vfio_cfg;
 	int vfio_group_fd;
-	int iommu_group_no;
+	int iommu_group_num;
 	int ret;
 
+	/* we don't want any DMA mapping messages to come while we're detaching
+	 * VFIO device, because this might be the last device and we might need
+	 * to unregister the callback.
+	 */
+	rte_rwlock_read_lock(mem_lock);
+
 	/* get group number */
-	ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
+	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
 	if (ret <= 0) {
 		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver\n",
 			dev_addr);
 		/* This is an error at this point. */
-		return -1;
+		ret = -1;
+		goto out;
 	}
 
 	/* get the actual group fd */
-	vfio_group_fd = vfio_get_group_fd(iommu_group_no);
+	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
 	if (vfio_group_fd <= 0) {
-		RTE_LOG(INFO, EAL, "vfio_get_group_fd failed for %s\n",
+		RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
 				   dev_addr);
-		return -1;
+		ret = -1;
+		goto out;
 	}
 
+	/* get the vfio_config it belongs to */
+	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
 	/* At this point we got an active group. Closing it will make the
 	 * container detachment. If this is the last active group, VFIO kernel
 	 * code will unset the container and the IOMMU mappings.
@@ -423,7 +880,8 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
 	if (close(vfio_dev_fd) < 0) {
 		RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
 				   dev_addr);
-		return -1;
+		ret = -1;
+		goto out;
 	}
 
 	/* An VFIO group can have several devices attached. Just when there is
@@ -435,30 +893,53 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
 		if (close(vfio_group_fd) < 0) {
 			RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
 				dev_addr);
-			return -1;
+			ret = -1;
+			goto out;
 		}
 
 		if (rte_vfio_clear_group(vfio_group_fd) < 0) {
 			RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
 					   dev_addr);
-			return -1;
+			ret = -1;
+			goto out;
 		}
 	}
 
-	return 0;
+	/* if there are no active device groups, unregister the callback to
+	 * avoid spurious attempts to map/unmap memory from VFIO.
+	 */
+	if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0)
+		rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
+				NULL);
+
+	/* success */
+	ret = 0;
+
+out:
+	rte_rwlock_read_unlock(mem_lock);
+	return ret;
 }
 
 int
 rte_vfio_enable(const char *modname)
 {
 	/* initialize group list */
-	int i;
+	int i, j;
 	int vfio_available;
 
-	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
-		vfio_cfg.vfio_groups[i].fd = -1;
-		vfio_cfg.vfio_groups[i].group_no = -1;
-		vfio_cfg.vfio_groups[i].devices = 0;
+	rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		vfio_cfgs[i].vfio_container_fd = -1;
+		vfio_cfgs[i].vfio_active_groups = 0;
+		vfio_cfgs[i].vfio_iommu_type = NULL;
+		vfio_cfgs[i].mem_maps.lock = lock;
+
+		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+			vfio_cfgs[i].vfio_groups[j].fd = -1;
+			vfio_cfgs[i].vfio_groups[j].group_num = -1;
+			vfio_cfgs[i].vfio_groups[j].devices = 0;
+		}
 	}
 
 	/* inform the user that we are probing for VFIO */
@@ -480,12 +961,12 @@ rte_vfio_enable(const char *modname)
 		return 0;
 	}
 
-	vfio_cfg.vfio_container_fd = vfio_get_container_fd();
+	default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1) {
+	if (default_vfio_cfg->vfio_container_fd != -1) {
 		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
-		vfio_cfg.vfio_enabled = 1;
+		default_vfio_cfg->vfio_enabled = 1;
 	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
 	}
@@ -497,7 +978,7 @@ int
 rte_vfio_is_enabled(const char *modname)
 {
 	const int mod_available = rte_eal_check_module(modname) > 0;
-	return vfio_cfg.vfio_enabled && mod_available;
+	return default_vfio_cfg->vfio_enabled && mod_available;
 }
 
 const struct vfio_iommu_type *
@@ -558,9 +1039,14 @@ vfio_has_supported_extensions(int vfio_container_fd)
 }
 
 int
-vfio_get_container_fd(void)
+rte_vfio_get_container_fd(void)
 {
 	int ret, vfio_container_fd;
+	struct rte_mp_msg mp_req, *mp_rep;
+	struct rte_mp_reply mp_reply;
+	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
 
 	/* if we're in a primary process, try to open the container */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,39 +1077,35 @@ vfio_get_container_fd(void)
 		}
 
 		return vfio_container_fd;
-	} else {
-		/*
-		 * if we're in a secondary process, request container fd from the
-		 * primary process via our socket
-		 */
-		int socket_fd;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
-		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-			return -1;
-		}
-		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
-			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-			close(socket_fd);
-			return -1;
-		}
-		vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
-		if (vfio_container_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-			close(socket_fd);
-			return -1;
+	}
+	/*
+	 * if we're in a secondary process, request container fd from the
+	 * primary process via mp channel
+	 */
+	p->req = SOCKET_REQ_CONTAINER;
+	strcpy(mp_req.name, EAL_VFIO_MP);
+	mp_req.len_param = sizeof(*p);
+	mp_req.num_fds = 0;
+
+	vfio_container_fd = -1;
+	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+	    mp_reply.nb_received == 1) {
+		mp_rep = &mp_reply.msgs[0];
+		p = (struct vfio_mp_param *)mp_rep->param;
+		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+			free(mp_reply.msgs);
+			return mp_rep->fds[0];
 		}
-		close(socket_fd);
-		return vfio_container_fd;
+		free(mp_reply.msgs);
 	}
 
+	RTE_LOG(ERR, EAL, "  cannot request container fd\n");
 	return -1;
 }
 
 int
-vfio_get_group_no(const char *sysfs_base,
-		const char *dev_addr, int *iommu_group_no)
+rte_vfio_get_group_num(const char *sysfs_base,
+		const char *dev_addr, int *iommu_group_num)
 {
 	char linkname[PATH_MAX];
 	char filename[PATH_MAX];
@@ -655,7 +1137,7 @@ vfio_get_group_no(const char *sysfs_base,
 	errno = 0;
 	group_tok = tok[ret - 1];
 	end = group_tok;
-	*iommu_group_no = strtol(group_tok, &end, 10);
+	*iommu_group_num = strtol(group_tok, &end, 10);
 	if ((end != group_tok && *end != '\0') || errno != 0) {
 		RTE_LOG(ERR, EAL, "  %s error parsing IOMMU number!\n", dev_addr);
 		return -1;
@@ -665,34 +1147,49 @@ vfio_get_group_no(const char *sysfs_base,
 }
 
 static int
-vfio_type1_dma_map(int vfio_container_fd)
+type1_map(const struct rte_memseg_list *msl __rte_unused,
+		const struct rte_memseg *ms, void *arg)
 {
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
+	int *vfio_container_fd = arg;
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
+	return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+			ms->len, 1);
+}
 
-		if (ms[i].addr == NULL)
-			break;
+static int
+vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map)
+{
+	struct vfio_iommu_type1_dma_map dma_map;
+	struct vfio_iommu_type1_dma_unmap dma_unmap;
+	int ret;
 
+	if (do_map != 0) {
 		memset(&dma_map, 0, sizeof(dma_map));
 		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		if (rte_eal_iova_mode() == RTE_IOVA_VA)
-			dma_map.iova = dma_map.vaddr;
-		else
-			dma_map.iova = ms[i].iova;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+		dma_map.vaddr = vaddr;
+		dma_map.size = len;
+		dma_map.iova = iova;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE;
 
 		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
+				errno, strerror(errno));
+				return -1;
+		}
+	} else {
+		memset(&dma_unmap, 0, sizeof(dma_unmap));
+		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+		dma_unmap.size = len;
+		dma_unmap.iova = iova;
 
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+				&dma_unmap);
 		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					  "error %i (%s)\n", errno,
-					  strerror(errno));
+			RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
+					errno, strerror(errno));
 			return -1;
 		}
 	}
@@ -701,24 +1198,107 @@ vfio_type1_dma_map(int vfio_container_fd)
 }
 
 static int
-vfio_spapr_dma_map(int vfio_container_fd)
+vfio_type1_dma_map(int vfio_container_fd)
 {
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
+	return rte_memseg_walk(type1_map, &vfio_container_fd);
+}
+
+static int
+vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map)
+{
+	struct vfio_iommu_type1_dma_map dma_map;
+	struct vfio_iommu_type1_dma_unmap dma_unmap;
+	int ret;
+
+	if (do_map != 0) {
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = vaddr;
+		dma_map.size = len;
+		dma_map.iova = iova;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
+				errno, strerror(errno));
+				return -1;
+		}
+
+	} else {
+		struct vfio_iommu_spapr_register_memory reg = {
+			.argsz = sizeof(reg),
+			.flags = 0
+		};
+		reg.vaddr = (uintptr_t) vaddr;
+		reg.size = len;
+
+		ret = ioctl(vfio_container_fd,
+				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
+					errno, strerror(errno));
+			return -1;
+		}
+
+		memset(&dma_unmap, 0, sizeof(dma_unmap));
+		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+		dma_unmap.size = len;
+		dma_unmap.iova = iova;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+				&dma_unmap);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
+					errno, strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int
+vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
+		const struct rte_memseg *ms, void *arg)
+{
+	int *vfio_container_fd = arg;
+
+	return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+			ms->len, 1);
+}
+
+struct spapr_walk_param {
+	uint64_t window_size;
+	uint64_t hugepage_sz;
+};
+static int
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
+		const struct rte_memseg *ms, void *arg)
+{
+	struct spapr_walk_param *param = arg;
+	uint64_t max = ms->iova + ms->len;
+
+	if (max > param->window_size) {
+		param->hugepage_sz = ms->hugepage_sz;
+		param->window_size = max;
+	}
+
+	return 0;
+}
 
-	struct vfio_iommu_spapr_register_memory reg = {
-		.argsz = sizeof(reg),
-		.flags = 0
+static int
+vfio_spapr_create_new_dma_window(int vfio_container_fd,
+		struct vfio_iommu_spapr_tce_create *create) {
+	struct vfio_iommu_spapr_tce_remove remove = {
+		.argsz = sizeof(remove),
 	};
 	struct vfio_iommu_spapr_tce_info info = {
 		.argsz = sizeof(info),
 	};
-	struct vfio_iommu_spapr_tce_create create = {
-		.argsz = sizeof(create),
-	};
-	struct vfio_iommu_spapr_tce_remove remove = {
-		.argsz = sizeof(remove),
-	};
+	int ret;
 
 	/* query spapr iommu info */
 	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
@@ -737,69 +1317,155 @@ vfio_spapr_dma_map(int vfio_container_fd)
 		return -1;
 	}
 
-	/* create DMA window from 0 to max(phys_addr + len) */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		if (ms[i].addr == NULL)
-			break;
-
-		create.window_size = RTE_MAX(create.window_size,
-				ms[i].iova + ms[i].len);
-	}
-
-	/* sPAPR requires window size to be a power of 2 */
-	create.window_size = rte_align64pow2(create.window_size);
-	create.page_shift = __builtin_ctzll(ms->hugepage_sz);
-	create.levels = 1;
-
-	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+	/* create new DMA window */
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
 	if (ret) {
 		RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
 				"error %i (%s)\n", errno, strerror(errno));
 		return -1;
 	}
 
-	if (create.start_addr != 0) {
+	if (create->start_addr != 0) {
 		RTE_LOG(ERR, EAL, "  DMA window start address != 0\n");
 		return -1;
 	}
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
+	return 0;
+}
 
-		if (ms[i].addr == NULL)
-			break;
+static int
+vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map)
+{
+	struct spapr_walk_param param;
+	struct vfio_iommu_spapr_tce_create create = {
+		.argsz = sizeof(create),
+	};
+	struct vfio_config *vfio_cfg;
+	struct user_mem_maps *user_mem_maps;
+	int i, ret = 0;
 
-		reg.vaddr = (uintptr_t) ms[i].addr;
-		reg.size = ms[i].len;
-		ret = ioctl(vfio_container_fd,
-			VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, "
-				"error %i (%s)\n", errno, strerror(errno));
-			return -1;
-		}
+	vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "  invalid container fd!\n");
+		return -1;
+	}
 
-		memset(&dma_map, 0, sizeof(dma_map));
-		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		if (rte_eal_iova_mode() == RTE_IOVA_VA)
-			dma_map.iova = dma_map.vaddr;
-		else
-			dma_map.iova = ms[i].iova;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
-				 VFIO_DMA_MAP_FLAG_WRITE;
+	user_mem_maps = &vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
 
-		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+	/* check if window size needs to be adjusted */
+	memset(&param, 0, sizeof(param));
 
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-				"error %i (%s)\n", errno, strerror(errno));
-			return -1;
+	if (memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
+				&param) < 0) {
+		RTE_LOG(ERR, EAL, "Could not get window size\n");
+		ret = -1;
+		goto out;
+	}
+
+	/* also check user maps */
+	for (i = 0; i < user_mem_maps->n_maps; i++) {
+		uint64_t max = user_mem_maps->maps[i].iova +
+				user_mem_maps->maps[i].len;
+		create.window_size = RTE_MAX(create.window_size, max);
+	}
+
+	/* sPAPR requires window size to be a power of 2 */
+	create.window_size = rte_align64pow2(param.window_size);
+	create.page_shift = __builtin_ctzll(param.hugepage_sz);
+	create.levels = 1;
+
+	if (do_map) {
+		void *addr;
+		/* re-create window and remap the entire memory */
+		if (iova > create.window_size) {
+			if (vfio_spapr_create_new_dma_window(vfio_container_fd,
+					&create) < 0) {
+				RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
+				ret = -1;
+				goto out;
+			}
+			if (memseg_walk_thread_unsafe(vfio_spapr_map_walk,
+					&vfio_container_fd) < 0) {
+				RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
+				ret = -1;
+				goto out;
+			}
+			/* remap all user maps */
+			for (i = 0; i < user_mem_maps->n_maps; i++) {
+				struct user_mem_map *map =
+						&user_mem_maps->maps[i];
+				if (vfio_spapr_dma_do_map(vfio_container_fd,
+						map->addr, map->iova, map->len,
+						1)) {
+					RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n");
+					ret = -1;
+					goto out;
+				}
+			}
+		}
+
+		/* now that we've remapped all of the memory that was present
+		 * before, map the segment that we were requested to map.
+		 *
+		 * however, if we were called by the callback, the memory we
+		 * were called with was already in the memseg list, so previous
+		 * mapping should've mapped that segment already.
+		 *
+		 * virt2memseg_list is a relatively cheap check, so use that. if
+		 * memory is within any memseg list, it's a memseg, so it's
+		 * already mapped.
+		 */
+		addr = (void *)(uintptr_t)vaddr;
+		if (rte_mem_virt2memseg_list(addr) == NULL &&
+				vfio_spapr_dma_do_map(vfio_container_fd,
+					vaddr, iova, len, 1) < 0) {
+			RTE_LOG(ERR, EAL, "Could not map segment\n");
+			ret = -1;
+			goto out;
+		}
+	} else {
+		/* for unmap, check if iova within DMA window */
+		if (iova > create.window_size) {
+			RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap");
+			ret = -1;
+			goto out;
 		}
 
+		vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
 	}
+out:
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+	return ret;
+}
+
+static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+	struct vfio_iommu_spapr_tce_create create = {
+		.argsz = sizeof(create),
+	};
+	struct spapr_walk_param param;
+
+	memset(&param, 0, sizeof(param));
+
+	/* create DMA window from 0 to max(phys_addr + len) */
+	rte_memseg_walk(vfio_spapr_window_size_walk, &param);
+
+	/* sPAPR requires window size to be a power of 2 */
+	create.window_size = rte_align64pow2(param.window_size);
+	create.page_shift = __builtin_ctzll(param.hugepage_sz);
+	create.levels = 1;
+
+	if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) {
+		RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
+		return -1;
+	}
+
+	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+	if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
+		return -1;
 
 	return 0;
 }
@@ -811,6 +1477,175 @@ vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
 	return 0;
 }
 
+static int
+vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
+			 uint64_t __rte_unused vaddr,
+			 uint64_t __rte_unused iova, uint64_t __rte_unused len,
+			 int __rte_unused do_map)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
+static int
+vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map)
+{
+	const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
+
+	if (!t) {
+		RTE_LOG(ERR, EAL, "  VFIO support not initialized\n");
+		rte_errno = ENODEV;
+		return -1;
+	}
+
+	if (!t->dma_user_map_func) {
+		RTE_LOG(ERR, EAL,
+			"  VFIO custom DMA region maping not supported by IOMMU %s\n",
+			t->name);
+		rte_errno = ENOTSUP;
+		return -1;
+	}
+
+	return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
+			len, do_map);
+}
+
+static int
+container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+		uint64_t len)
+{
+	struct user_mem_map *new_map;
+	struct user_mem_maps *user_mem_maps;
+	int ret = 0;
+
+	user_mem_maps = &vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
+	if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+		RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
+		rte_errno = ENOMEM;
+		ret = -1;
+		goto out;
+	}
+	/* map the entry */
+	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
+		/* technically, this will fail if there are currently no devices
+		 * plugged in, even if a device were added later, this mapping
+		 * might have succeeded. however, since we cannot verify if this
+		 * is a valid mapping without having a device attached, consider
+		 * this to be unsupported, because we can't just store any old
+		 * mapping and pollute list of active mappings willy-nilly.
+		 */
+		RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
+		ret = -1;
+		goto out;
+	}
+	/* create new user mem map entry */
+	new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+	new_map->addr = vaddr;
+	new_map->iova = iova;
+	new_map->len = len;
+
+	compact_user_maps(user_mem_maps);
+out:
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+	return ret;
+}
+
+static int
+container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+		uint64_t len)
+{
+	struct user_mem_map *map, *new_map = NULL;
+	struct user_mem_maps *user_mem_maps;
+	int ret = 0;
+
+	user_mem_maps = &vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+	/* find our mapping */
+	map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
+	if (!map) {
+		RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
+		rte_errno = EINVAL;
+		ret = -1;
+		goto out;
+	}
+	if (map->addr != vaddr || map->iova != iova || map->len != len) {
+		/* we're partially unmapping a previously mapped region, so we
+		 * need to split entry into two.
+		 */
+		if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+			RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+			rte_errno = ENOMEM;
+			ret = -1;
+			goto out;
+		}
+		new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+	}
+
+	/* unmap the entry */
+	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
+		/* there may not be any devices plugged in, so unmapping will
+		 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
+		 * stop us from removing the mapping, as the assumption is we
+		 * won't be needing this memory any more and thus will want to
+		 * prevent it from being remapped again on hotplug. so, only
+		 * fail if we indeed failed to unmap (e.g. if the mapping was
+		 * within our mapped range but had invalid alignment).
+		 */
+		if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
+			RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
+			ret = -1;
+			goto out;
+		} else {
+			RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
+		}
+	}
+	/* remove map from the list of active mappings */
+	if (new_map != NULL) {
+		adjust_map(map, new_map, vaddr, len);
+
+		/* if we've created a new map by splitting, sort everything */
+		if (!is_null_map(new_map)) {
+			compact_user_maps(user_mem_maps);
+		} else {
+			/* we've created a new mapping, but it was unused */
+			user_mem_maps->n_maps--;
+		}
+	} else {
+		memset(map, 0, sizeof(*map));
+		compact_user_maps(user_mem_maps);
+		user_mem_maps->n_maps--;
+	}
+
+out:
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+	return ret;
+}
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	return container_dma_map(default_vfio_cfg, vaddr, iova, len);
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	return container_dma_unmap(default_vfio_cfg, vaddr, iova, len);
+}
+
 int
 rte_vfio_noiommu_is_enabled(void)
 {
@@ -843,4 +1678,299 @@ rte_vfio_noiommu_is_enabled(void)
 	return c == 'Y';
 }
 
-#endif
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+	int i;
+
+	/* Find an empty slot to store new vfio config */
+	for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
+		if (vfio_cfgs[i].vfio_container_fd == -1)
+			break;
+	}
+
+	if (i == VFIO_MAX_CONTAINERS) {
+		RTE_LOG(ERR, EAL, "exceed max vfio container limit\n");
+		return -1;
+	}
+
+	vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
+	if (vfio_cfgs[i].vfio_container_fd < 0) {
+		RTE_LOG(NOTICE, EAL, "fail to create a new container\n");
+		return -1;
+	}
+
+	return vfio_cfgs[i].vfio_container_fd;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(int container_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_num != -1)
+			rte_vfio_container_group_unbind(container_fd,
+				vfio_cfg->vfio_groups[i].group_num);
+
+	close(container_fd);
+	vfio_cfg->vfio_container_fd = -1;
+	vfio_cfg->vfio_active_groups = 0;
+	vfio_cfg->vfio_iommu_type = NULL;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
+{
+	struct vfio_config *vfio_cfg;
+	struct vfio_group *cur_grp;
+	int vfio_group_fd;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	/* Check room for new group */
+	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+		return -1;
+	}
+
+	/* Get an index for the new group */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_num == -1) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+		return -1;
+	}
+
+	vfio_group_fd = vfio_open_group_fd(iommu_group_num);
+	if (vfio_group_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
+		return -1;
+	}
+	cur_grp->group_num = iommu_group_num;
+	cur_grp->fd = vfio_group_fd;
+	cur_grp->devices = 0;
+	vfio_cfg->vfio_active_groups++;
+
+	return vfio_group_fd;
+}
+
+int __rte_experimental
+rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
+{
+	struct vfio_config *vfio_cfg;
+	struct vfio_group *cur_grp;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+		if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+	}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Specified group number not found\n");
+		return -1;
+	}
+
+	if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
+		RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
+			" iommu_group_num %d\n", iommu_group_num);
+		return -1;
+	}
+	cur_grp->group_num = -1;
+	cur_grp->fd = -1;
+	cur_grp->devices = 0;
+	vfio_cfg->vfio_active_groups--;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len)
+{
+	struct vfio_config *vfio_cfg;
+
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	return container_dma_map(vfio_cfg, vaddr, iova, len);
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len)
+{
+	struct vfio_config *vfio_cfg;
+
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	return container_dma_unmap(vfio_cfg, vaddr, iova, len);
+}
+
+#else
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+		  __rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+		    __rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int
+rte_vfio_setup_device(__rte_unused const char *sysfs_base,
+		__rte_unused const char *dev_addr,
+		__rte_unused int *vfio_dev_fd,
+		__rte_unused struct vfio_device_info *device_info)
+{
+	return -1;
+}
+
+int
+rte_vfio_release_device(__rte_unused const char *sysfs_base,
+		__rte_unused const char *dev_addr, __rte_unused int fd)
+{
+	return -1;
+}
+
+int
+rte_vfio_enable(__rte_unused const char *modname)
+{
+	return -1;
+}
+
+int
+rte_vfio_is_enabled(__rte_unused const char *modname)
+{
+	return -1;
+}
+
+int
+rte_vfio_noiommu_is_enabled(void)
+{
+	return -1;
+}
+
+int
+rte_vfio_clear_group(__rte_unused int vfio_group_fd)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
+		__rte_unused const char *dev_addr,
+		__rte_unused int *iommu_group_num)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_get_container_fd(void)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_group_bind(__rte_unused int container_fd,
+		__rte_unused int iommu_group_num)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_group_unbind(__rte_unused int container_fd,
+		__rte_unused int iommu_group_num)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+		__rte_unused uint64_t vaddr,
+		__rte_unused uint64_t iova,
+		__rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+		__rte_unused uint64_t vaddr,
+		__rte_unused uint64_t iova,
+		__rte_unused uint64_t len)
+{
+	return -1;
+}
+
+#endif /* VFIO_PRESENT */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 80595773..e65b1037 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -19,6 +19,7 @@
 
 #ifdef VFIO_PRESENT
 
+#include <stdint.h>
 #include <linux/vfio.h>
 
 #define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
@@ -26,6 +27,7 @@
 #ifndef VFIO_SPAPR_TCE_v2_IOMMU
 #define RTE_VFIO_SPAPR 7
 #define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
 #define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
 #define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
 
@@ -79,49 +81,37 @@ struct vfio_iommu_spapr_tce_info {
 #define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
 #endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
-#define RTE_VFIO_NOIOMMU 8
-#else
-#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
-#endif
-
 #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
-
-/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
+#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS
 
 /*
  * we don't need to store device fd's anywhere since they can be obtained from
  * the group fd via an ioctl() call.
  */
 struct vfio_group {
-	int group_no;
+	int group_num;
 	int fd;
 	int devices;
 };
 
-struct vfio_config {
-	int vfio_enabled;
-	int vfio_container_fd;
-	int vfio_active_groups;
-	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
-};
-
 /* DMA mapping function prototype.
  * Takes VFIO container fd as a parameter.
  * Returns 0 on success, -1 on error.
  * */
 typedef int (*vfio_dma_func_t)(int);
 
+/* Custom memory region DMA mapping function prototype.
+ * Takes VFIO container fd, virtual address, phisical address, length and
+ * operation type (0 to unmap 1 for map) as a parameters.
+ * Returns 0 on success, -1 on error.
+ **/
+typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map);
+
 struct vfio_iommu_type {
 	int type_id;
 	const char *name;
+	vfio_dma_user_func_t dma_user_map_func;
 	vfio_dma_func_t dma_map_func;
 };
 
@@ -133,23 +123,10 @@ vfio_set_iommu_type(int vfio_container_fd);
 int
 vfio_has_supported_extensions(int vfio_container_fd);
 
-/* open container fd or get an existing one */
-int
-vfio_get_container_fd(void);
-
-/* parse IOMMU group number for a device
- * returns 1 on success, -1 for errors, 0 for non-existent group
- */
-int
-vfio_get_group_no(const char *sysfs_base,
-		const char *dev_addr, int *iommu_group_no);
-
-/* open group fd or get an existing one */
-int
-vfio_get_group_fd(int iommu_group_no);
-
 int vfio_mp_sync_setup(void);
 
+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
 #define SOCKET_REQ_CONTAINER 0x100
 #define SOCKET_REQ_GROUP 0x200
 #define SOCKET_CLR_GROUP 0x300
@@ -157,6 +134,12 @@ int vfio_mp_sync_setup(void);
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
 
+struct vfio_mp_param {
+	int req;
+	int result;
+	int group_num;
+};
+
 #endif /* VFIO_PRESENT */
 
 #endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c152..9c202bb0 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,16 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
  */
 
+#include <unistd.h>
 #include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
 
+#include <rte_compat.h>
 #include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
 #include <rte_vfio.h>
+#include <rte_eal.h>
 
-#include "eal_filesystem.h"
 #include "eal_vfio.h"
-#include "eal_thread.h"
 
 /**
  * @file
@@ -37,358 +21,78 @@
 
 #ifdef VFIO_PRESENT
 
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
-		do {\
-			(chdr).cmsg_len = CMSGLEN;\
-			(chdr).cmsg_level = SOL_SOCKET;\
-			(chdr).cmsg_type = SCM_RIGHTS;\
-			memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
-		} while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
-			memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
-	const char *dir = "/var/run";
-	const char *home_dir = getenv("HOME");
-
-	if (getuid() != 0 && home_dir != NULL)
-		dir = home_dir;
-
-	/* use current prefix as file path */
-	snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
-			internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
-	struct msghdr hdr;
-	struct iovec iov;
-	int buf;
-	int ret;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = req;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct iovec iov;
-	int ret, req;
-
-	memset(&hdr, 0, sizeof(hdr));
-
-	buf = SOCKET_ERR;
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
 {
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
+	int fd = -1;
 	int ret;
+	struct rte_mp_msg reply;
+	struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+	const struct vfio_mp_param *m =
+		(const struct vfio_mp_param *)msg->param;
 
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	buf = SOCKET_OK;
-	FD_TO_CMSGHDR(fd, *chdr);
-
-	ret = sendmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-	return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
-	int buf;
-	struct msghdr hdr;
-	struct cmsghdr *chdr;
-	char chdr_buf[CMSGLEN];
-	struct iovec iov;
-	int ret, req, fd;
-
-	buf = SOCKET_ERR;
-
-	chdr = (struct cmsghdr *) chdr_buf;
-	memset(chdr, 0, sizeof(chdr_buf));
-	memset(&hdr, 0, sizeof(hdr));
-
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-	iov.iov_base = (char *) &buf;
-	iov.iov_len = sizeof(buf);
-	hdr.msg_control = chdr;
-	hdr.msg_controllen = CMSGLEN;
-
-	ret = recvmsg(socket, &hdr, 0);
-	if (ret < 0)
-		return -1;
-
-	req = buf;
-
-	if (req != SOCKET_OK)
-		return -1;
-
-	CMSGHDR_TO_FD(*chdr, fd);
-
-	return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-	int socket_fd;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+	if (msg->len_param != sizeof(*m)) {
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
-		return socket_fd;
-
-	/* if connect failed */
-	close(socket_fd);
-	return -1;
-}
-
+	memset(&reply, 0, sizeof(reply));
 
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
-	int ret, fd, vfio_data;
-
-	/* wait for requests on the socket */
-	for (;;) {
-		int conn_sock;
-		struct sockaddr_un addr;
-		socklen_t sockaddr_len = sizeof(addr);
-
-		/* this is a blocking call */
-		conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
-				&sockaddr_len);
-
-		/* just restart on error */
-		if (conn_sock == -1)
-			continue;
-
-		/* set socket to linger after close */
-		struct linger l;
-		l.l_onoff = 1;
-		l.l_linger = 60;
-
-		if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
-			RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
-					"on listen socket (%s)\n", strerror(errno));
-
-		ret = vfio_mp_sync_receive_request(conn_sock);
-
-		switch (ret) {
-		case SOCKET_REQ_CONTAINER:
-			fd = vfio_get_container_fd();
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			else
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			if (fd >= 0)
-				close(fd);
-			break;
-		case SOCKET_REQ_GROUP:
-			/* wait for group number */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			fd = vfio_get_group_fd(vfio_data);
-
-			if (fd < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+	switch (m->req) {
+	case SOCKET_REQ_GROUP:
+		r->req = SOCKET_REQ_GROUP;
+		r->group_num = m->group_num;
+		fd = rte_vfio_get_group_fd(m->group_num);
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else if (fd == 0)
 			/* if VFIO group exists but isn't bound to VFIO driver */
-			else if (fd == 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			r->result = SOCKET_NO_FD;
+		else {
 			/* if group exists and is bound to VFIO driver */
-			else {
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-				vfio_mp_sync_send_fd(conn_sock, fd);
-			}
-			break;
-		case SOCKET_CLR_GROUP:
-			/* wait for group fd */
-			vfio_data = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_data < 0) {
-				close(conn_sock);
-				continue;
-			}
-
-			ret = rte_vfio_clear_group(vfio_data);
-
-			if (ret < 0)
-				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
-			else
-				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
-			break;
-		default:
-			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
-			break;
+			r->result = SOCKET_OK;
+			reply.num_fds = 1;
+			reply.fds[0] = fd;
 		}
-		close(conn_sock);
-	}
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
-	int ret, socket_fd;
-	struct sockaddr_un addr;
-	socklen_t sockaddr_len;
-
-	/* set up a socket */
-	socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
-	if (socket_fd < 0) {
-		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
-		return -1;
-	}
-
-	get_socket_path(addr.sun_path, sizeof(addr.sun_path));
-	addr.sun_family = AF_UNIX;
-
-	sockaddr_len = sizeof(struct sockaddr_un);
-
-	unlink(addr.sun_path);
-
-	ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
-		close(socket_fd);
-		return -1;
-	}
-
-	ret = listen(socket_fd, 50);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
-		close(socket_fd);
+		break;
+	case SOCKET_CLR_GROUP:
+		r->req = SOCKET_CLR_GROUP;
+		r->group_num = m->group_num;
+		if (rte_vfio_clear_group(m->group_num) < 0)
+			r->result = SOCKET_NO_FD;
+		else
+			r->result = SOCKET_OK;
+		break;
+	case SOCKET_REQ_CONTAINER:
+		r->req = SOCKET_REQ_CONTAINER;
+		fd = rte_vfio_get_container_fd();
+		if (fd < 0)
+			r->result = SOCKET_ERR;
+		else {
+			r->result = SOCKET_OK;
+			reply.num_fds = 1;
+			reply.fds[0] = fd;
+		}
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
 		return -1;
 	}
 
-	/* save the socket in local configuration */
-	mp_socket_fd = socket_fd;
+	strcpy(reply.name, EAL_VFIO_MP);
+	reply.len_param = sizeof(*r);
 
-	return 0;
+	ret = rte_mp_reply(&reply, peer);
+	if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+		close(fd);
+	return ret;
 }
 
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
 int
 vfio_mp_sync_setup(void)
 {
-	int ret;
-	char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-	if (vfio_mp_sync_socket_setup() < 0) {
-		RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
-		return -1;
-	}
-
-	ret = pthread_create(&socket_thread, NULL,
-			vfio_mp_sync_thread, NULL);
-	if (ret) {
-		RTE_LOG(ERR, EAL,
-			"Failed to create thread for communication with secondary processes!\n");
-		close(mp_socket_fd);
-		return -1;
-	}
-
-	/* Set thread_name for aid in debugging. */
-	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
-	ret = rte_thread_setname(socket_thread, thread_name);
-	if (ret)
-		RTE_LOG(DEBUG, EAL,
-			"Failed to set thread name for secondary processes!\n");
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/meson.build b/lib/librte_eal/linuxapp/eal/meson.build
index 03974ff2..cce37712 100644
--- a/lib/librte_eal/linuxapp/eal/meson.build
+++ b/lib/librte_eal/linuxapp/eal/meson.build
@@ -7,9 +7,11 @@ install_subdir('include/exec-env', install_dir: get_option('includedir'))
 env_objs = []
 env_headers = []
 env_sources = files('eal_alarm.c',
+		'eal_cpuflags.c',
 		'eal_debug.c',
 		'eal_hugepage_info.c',
 		'eal_interrupts.c',
+		'eal_memalloc.c',
 		'eal_lcore.c',
 		'eal_log.c',
 		'eal_thread.c',
@@ -18,6 +20,7 @@ env_sources = files('eal_alarm.c',
 		'eal_vfio_mp_sync.c',
 		'eal.c',
 		'eal_memory.c',
+		'eal_dev.c',
 )
 
 if has_libnuma == 1
author	Christian Ehrhardt <christian.ehrhardt@canonical.com>	2018-06-01 09:09:08 +0200
committer	Christian Ehrhardt <christian.ehrhardt@canonical.com>	2018-06-01 09:12:07 +0200
commit	1bd9b61222f3a81ffe770fc00b70ded6e760c42b (patch)
tree	0bf7d996cf0664796687c1be6d22958fcf6a8096 /lib/librte_eal/linuxapp/eal
parent	bb4e158029645f37809fcf81a3acddd6fa11f88a (diff)