From 055c52583a2794da8ba1e85a48cce3832372b12f Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Wed, 8 Nov 2017 14:15:11 +0000 Subject: New upstream version 17.11-rc3 Change-Id: I6a5baa40612fe0c20f30b5fa773a6cbbac63a685 Signed-off-by: Luca Boccassi --- lib/librte_vhost/Makefile | 5 +- lib/librte_vhost/fd_man.c | 5 +- lib/librte_vhost/iotlb.c | 350 +++++++++++++++++++++++++++++++++++++++ lib/librte_vhost/iotlb.h | 76 +++++++++ lib/librte_vhost/rte_vhost.h | 6 +- lib/librte_vhost/socket.c | 37 ++++- lib/librte_vhost/vhost.c | 131 +++++++++++++-- lib/librte_vhost/vhost.h | 76 ++++++++- lib/librte_vhost/vhost_user.c | 370 ++++++++++++++++++++++++++++++++++++------ lib/librte_vhost/vhost_user.h | 25 ++- lib/librte_vhost/virtio_net.c | 339 +++++++++++++++++++++++++++++--------- 11 files changed, 1258 insertions(+), 162 deletions(-) create mode 100644 lib/librte_vhost/iotlb.c create mode 100644 lib/librte_vhost/iotlb.h (limited to 'lib/librte_vhost') diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 4a116fe3..be182798 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -45,10 +45,11 @@ LDLIBS += -lpthread ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev # all source are stored in SRCS-y -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c socket.c vhost.c vhost_user.c \ - virtio_net.c +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \ + vhost_user.c virtio_net.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h diff --git a/lib/librte_vhost/fd_man.c b/lib/librte_vhost/fd_man.c index 2ceacc9a..4c6fed41 100644 --- a/lib/librte_vhost/fd_man.c +++ b/lib/librte_vhost/fd_man.c @@ -222,6 +222,7 @@ fdset_event_dispatch(void *arg) int remove1, remove2; int need_shrink; struct fdset *pfdset = arg; + int val; if (pfdset == NULL) return NULL; @@ -239,7 +240,9 @@ fdset_event_dispatch(void *arg) numfds = pfdset->num; pthread_mutex_unlock(&pfdset->fd_mutex); - poll(pfdset->rwfds, numfds, 1000 /* millisecs */); + val = poll(pfdset->rwfds, numfds, 1000 /* millisecs */); + if (val < 0) + continue; need_shrink = 0; for (i = 0; i < numfds; i++) { diff --git a/lib/librte_vhost/iotlb.c b/lib/librte_vhost/iotlb.c new file mode 100644 index 00000000..b74cc6a7 --- /dev/null +++ b/lib/librte_vhost/iotlb.c @@ -0,0 +1,350 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2017 Red Hat, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include + +#include "iotlb.h" +#include "vhost.h" + +struct vhost_iotlb_entry { + TAILQ_ENTRY(vhost_iotlb_entry) next; + + uint64_t iova; + uint64_t uaddr; + uint64_t size; + uint8_t perm; +}; + +#define IOTLB_CACHE_SIZE 2048 + +static void +vhost_user_iotlb_pending_remove_all(struct vhost_virtqueue *vq) +{ + struct vhost_iotlb_entry *node, *temp_node; + + rte_rwlock_write_lock(&vq->iotlb_pending_lock); + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_pending_list, next, temp_node) { + TAILQ_REMOVE(&vq->iotlb_pending_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + } + + rte_rwlock_write_unlock(&vq->iotlb_pending_lock); +} + +bool +vhost_user_iotlb_pending_miss(struct vhost_virtqueue *vq, uint64_t iova, + uint8_t perm) +{ + struct vhost_iotlb_entry *node; + bool found = false; + + rte_rwlock_read_lock(&vq->iotlb_pending_lock); + + TAILQ_FOREACH(node, &vq->iotlb_pending_list, next) { + if ((node->iova == iova) && (node->perm == perm)) { + found = true; + break; + } + } + + rte_rwlock_read_unlock(&vq->iotlb_pending_lock); + + return found; +} + +void +vhost_user_iotlb_pending_insert(struct vhost_virtqueue *vq, + uint64_t iova, uint8_t perm) +{ + struct vhost_iotlb_entry *node; + int ret; + + ret = rte_mempool_get(vq->iotlb_pool, (void **)&node); + if (ret) { + RTE_LOG(INFO, VHOST_CONFIG, + "IOTLB pool empty, clear pending misses\n"); + vhost_user_iotlb_pending_remove_all(vq); + ret = rte_mempool_get(vq->iotlb_pool, (void **)&node); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, "IOTLB pool still empty, failure\n"); + return; + } + } + + node->iova = iova; + node->perm = perm; + + rte_rwlock_write_lock(&vq->iotlb_pending_lock); + + TAILQ_INSERT_TAIL(&vq->iotlb_pending_list, node, next); + + rte_rwlock_write_unlock(&vq->iotlb_pending_lock); +} + +static void +vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size, uint8_t perm) +{ + struct vhost_iotlb_entry *node, *temp_node; + + rte_rwlock_write_lock(&vq->iotlb_pending_lock); + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_pending_list, next, temp_node) { + if (node->iova < iova) + continue; + if (node->iova >= iova + size) + continue; + if ((node->perm & perm) != node->perm) + continue; + TAILQ_REMOVE(&vq->iotlb_pending_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + } + + rte_rwlock_write_unlock(&vq->iotlb_pending_lock); +} + +static void +vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq) +{ + struct vhost_iotlb_entry *node, *temp_node; + + rte_rwlock_write_lock(&vq->iotlb_lock); + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { + TAILQ_REMOVE(&vq->iotlb_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + } + + vq->iotlb_cache_nr = 0; + + rte_rwlock_write_unlock(&vq->iotlb_lock); +} + +static void +vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq) +{ + struct vhost_iotlb_entry *node, *temp_node; + int entry_idx; + + rte_rwlock_write_lock(&vq->iotlb_lock); + + 
entry_idx = rte_rand() % vq->iotlb_cache_nr; + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { + if (!entry_idx) { + TAILQ_REMOVE(&vq->iotlb_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + vq->iotlb_cache_nr--; + break; + } + entry_idx--; + } + + rte_rwlock_write_unlock(&vq->iotlb_lock); +} + +void +vhost_user_iotlb_cache_insert(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t uaddr, uint64_t size, uint8_t perm) +{ + struct vhost_iotlb_entry *node, *new_node; + int ret; + + ret = rte_mempool_get(vq->iotlb_pool, (void **)&new_node); + if (ret) { + RTE_LOG(DEBUG, VHOST_CONFIG, "IOTLB pool empty, evict one entry\n"); + vhost_user_iotlb_cache_random_evict(vq); + ret = rte_mempool_get(vq->iotlb_pool, (void **)&new_node); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, "IOTLB pool still empty, failure\n"); + return; + } + } + + new_node->iova = iova; + new_node->uaddr = uaddr; + new_node->size = size; + new_node->perm = perm; + + rte_rwlock_write_lock(&vq->iotlb_lock); + + TAILQ_FOREACH(node, &vq->iotlb_list, next) { + /* + * Entries must be invalidated before being updated. + * So if iova already in list, assume identical. + */ + if (node->iova == new_node->iova) { + rte_mempool_put(vq->iotlb_pool, new_node); + goto unlock; + } else if (node->iova > new_node->iova) { + TAILQ_INSERT_BEFORE(node, new_node, next); + vq->iotlb_cache_nr++; + goto unlock; + } + } + + TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next); + vq->iotlb_cache_nr++; + +unlock: + vhost_user_iotlb_pending_remove(vq, iova, size, perm); + + rte_rwlock_write_unlock(&vq->iotlb_lock); + +} + +void +vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size) +{ + struct vhost_iotlb_entry *node, *temp_node; + + if (unlikely(!size)) + return; + + rte_rwlock_write_lock(&vq->iotlb_lock); + + TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) { + /* Sorted list */ + if (unlikely(iova + size < node->iova)) + break; + + if (iova < node->iova + node->size) { + TAILQ_REMOVE(&vq->iotlb_list, node, next); + rte_mempool_put(vq->iotlb_pool, node); + vq->iotlb_cache_nr--; + } + } + + rte_rwlock_write_unlock(&vq->iotlb_lock); +} + +uint64_t +vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t *size, uint8_t perm) +{ + struct vhost_iotlb_entry *node; + uint64_t offset, vva = 0, mapped = 0; + + if (unlikely(!*size)) + goto out; + + TAILQ_FOREACH(node, &vq->iotlb_list, next) { + /* List sorted by iova */ + if (unlikely(iova < node->iova)) + break; + + if (iova >= node->iova + node->size) + continue; + + if (unlikely((perm & node->perm) != perm)) { + vva = 0; + break; + } + + offset = iova - node->iova; + if (!vva) + vva = node->uaddr + offset; + + mapped += node->size - offset; + iova = node->iova + node->size; + + if (mapped >= *size) + break; + } + +out: + /* Only part of the requested chunk is mapped */ + if (unlikely(mapped < *size)) + *size = mapped; + + return vva; +} + +int +vhost_user_iotlb_init(struct virtio_net *dev, int vq_index) +{ + char pool_name[RTE_MEMPOOL_NAMESIZE]; + struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; + int socket = 0; + + if (vq->iotlb_pool) { + /* + * The cache has already been initialized, + * just drop all cached and pending entries. 
+ */ + vhost_user_iotlb_cache_remove_all(vq); + vhost_user_iotlb_pending_remove_all(vq); + } + +#ifdef RTE_LIBRTE_VHOST_NUMA + if (get_mempolicy(&socket, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR) != 0) + socket = 0; +#endif + + rte_rwlock_init(&vq->iotlb_lock); + rte_rwlock_init(&vq->iotlb_pending_lock); + + TAILQ_INIT(&vq->iotlb_list); + TAILQ_INIT(&vq->iotlb_pending_list); + + snprintf(pool_name, sizeof(pool_name), "iotlb_cache_%d_%d", + dev->vid, vq_index); + + /* If already created, free it and recreate */ + vq->iotlb_pool = rte_mempool_lookup(pool_name); + if (vq->iotlb_pool) + rte_mempool_free(vq->iotlb_pool); + + vq->iotlb_pool = rte_mempool_create(pool_name, + IOTLB_CACHE_SIZE, sizeof(struct vhost_iotlb_entry), 0, + 0, 0, NULL, NULL, NULL, socket, + MEMPOOL_F_NO_CACHE_ALIGN | + MEMPOOL_F_SP_PUT | + MEMPOOL_F_SC_GET); + if (!vq->iotlb_pool) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to create IOTLB cache pool (%s)\n", + pool_name); + return -1; + } + + vq->iotlb_cache_nr = 0; + + return 0; +} + diff --git a/lib/librte_vhost/iotlb.h b/lib/librte_vhost/iotlb.h new file mode 100644 index 00000000..f1a050e4 --- /dev/null +++ b/lib/librte_vhost/iotlb.h @@ -0,0 +1,76 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2017 Red Hat, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _VHOST_IOTLB_H_ +#define _VHOST_IOTLB_H_ + +#include + +#include "vhost.h" + +static __rte_always_inline void +vhost_user_iotlb_rd_lock(struct vhost_virtqueue *vq) +{ + rte_rwlock_read_lock(&vq->iotlb_lock); +} + +static __rte_always_inline void +vhost_user_iotlb_rd_unlock(struct vhost_virtqueue *vq) +{ + rte_rwlock_read_unlock(&vq->iotlb_lock); +} + +static __rte_always_inline void +vhost_user_iotlb_wr_lock(struct vhost_virtqueue *vq) +{ + rte_rwlock_write_lock(&vq->iotlb_lock); +} + +static __rte_always_inline void +vhost_user_iotlb_wr_unlock(struct vhost_virtqueue *vq) +{ + rte_rwlock_write_unlock(&vq->iotlb_lock); +} + +void vhost_user_iotlb_cache_insert(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t uaddr, uint64_t size, + uint8_t perm); +void vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size); +uint64_t vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova, + uint64_t *size, uint8_t perm); +bool vhost_user_iotlb_pending_miss(struct vhost_virtqueue *vq, uint64_t iova, + uint8_t perm); +void vhost_user_iotlb_pending_insert(struct vhost_virtqueue *vq, uint64_t iova, + uint8_t perm); +int vhost_user_iotlb_init(struct virtio_net *dev, int vq_index); + +#endif /* _VHOST_IOTLB_H_ */ diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index 8c974eb1..f6536449 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -56,6 +56,7 @@ extern "C" { #define RTE_VHOST_USER_CLIENT (1ULL << 0) #define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) +#define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3) /** * Information relating to memory regions including offsets to @@ -107,7 +108,10 @@ struct vhost_device_ops { */ int (*features_changed)(int vid, uint64_t features); - void *reserved[4]; /**< Reserved for future extension */ + int (*new_connection)(int vid); + void (*destroy_connection)(int vid); + + void *reserved[2]; /**< Reserved for future extension */ }; /** diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c index 41aa3f9b..422da002 100644 --- a/lib/librte_vhost/socket.c +++ b/lib/librte_vhost/socket.c @@ -68,6 +68,7 @@ struct vhost_user_socket { bool is_server; bool reconnect; bool dequeue_zero_copy; + bool iommu_support; /* * The "supported_features" indicates the feature bits the @@ -217,9 +218,7 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) vid = vhost_new_device(); if (vid == -1) { - close(fd); - free(conn); - return; + goto err; } size = strnlen(vsocket->path, PATH_MAX); @@ -230,24 +229,40 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + if (vsocket->notify_ops->new_connection) { + ret = vsocket->notify_ops->new_connection(vid); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add vhost user connection with fd %d\n", + fd); + goto err; + } + } + conn->connfd = fd; conn->vsocket = vsocket; conn->vid = vid; ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, NULL, conn); if (ret < 0) { - conn->connfd = -1; - free(conn); - close(fd); RTE_LOG(ERR, VHOST_CONFIG, "failed to add fd %d into vhost server fdset\n", fd); - return; + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + goto err; } pthread_mutex_lock(&vsocket->conn_mutex); TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); pthread_mutex_unlock(&vsocket->conn_mutex); + return; + 
+err: + free(conn); + close(fd); } /* call back when there is new vhost-user connection from client */ @@ -277,6 +292,9 @@ vhost_user_read_cb(int connfd, void *dat, int *remove) *remove = 1; vhost_destroy_device(conn->vid); + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + pthread_mutex_lock(&vsocket->conn_mutex); TAILQ_REMOVE(&vsocket->conn_list, conn, next); pthread_mutex_unlock(&vsocket->conn_mutex); @@ -652,6 +670,11 @@ rte_vhost_driver_register(const char *path, uint64_t flags) vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) { + vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); + vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); + } + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); if (vsocket->reconnect && reconn_tid == 0) { diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index 0b6aa1cc..4f8b73a0 100644 --- a/lib/librte_vhost/vhost.c +++ b/lib/librte_vhost/vhost.c @@ -47,11 +47,49 @@ #include #include #include +#include +#include "iotlb.h" #include "vhost.h" +#include "vhost_user.h" struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; +/* Called with iotlb_lock read-locked */ +uint64_t +__vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size, uint8_t perm) +{ + uint64_t vva, tmp_size; + + if (unlikely(!size)) + return 0; + + tmp_size = size; + + vva = vhost_user_iotlb_cache_find(vq, iova, &tmp_size, perm); + if (tmp_size == size) + return vva; + + if (!vhost_user_iotlb_pending_miss(vq, iova + tmp_size, perm)) { + /* + * iotlb_lock is read-locked for a full burst, + * but it only protects the iotlb cache. + * In case of IOTLB miss, we might block on the socket, + * which could cause a deadlock with QEMU if an IOTLB update + * is being handled. We can safely unlock here to avoid it. 
+ */ + vhost_user_iotlb_rd_unlock(vq); + + vhost_user_iotlb_pending_insert(vq, iova + tmp_size, perm); + vhost_user_iotlb_miss(dev, iova + tmp_size, perm); + + vhost_user_iotlb_rd_lock(vq); + } + + return 0; +} + struct virtio_net * get_device(int vid) { @@ -102,40 +140,108 @@ free_device(struct virtio_net *dev) vq = dev->virtqueue[i]; rte_free(vq->shadow_used_ring); - + rte_free(vq->batch_copy_elems); + rte_mempool_free(vq->iotlb_pool); rte_free(vq); } rte_free(dev); } +int +vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint64_t size; + + if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + goto out; + + size = sizeof(struct vring_desc) * vq->size; + vq->desc = (struct vring_desc *)(uintptr_t)vhost_iova_to_vva(dev, vq, + vq->ring_addrs.desc_user_addr, + size, VHOST_ACCESS_RW); + if (!vq->desc) + return -1; + + size = sizeof(struct vring_avail); + size += sizeof(uint16_t) * vq->size; + vq->avail = (struct vring_avail *)(uintptr_t)vhost_iova_to_vva(dev, vq, + vq->ring_addrs.avail_user_addr, + size, VHOST_ACCESS_RW); + if (!vq->avail) + return -1; + + size = sizeof(struct vring_used); + size += sizeof(struct vring_used_elem) * vq->size; + vq->used = (struct vring_used *)(uintptr_t)vhost_iova_to_vva(dev, vq, + vq->ring_addrs.used_user_addr, + size, VHOST_ACCESS_RW); + if (!vq->used) + return -1; + +out: + vq->access_ok = 1; + + return 0; +} + +void +vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_wr_lock(vq); + + vq->access_ok = 0; + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_wr_unlock(vq); +} + static void -init_vring_queue(struct vhost_virtqueue *vq) +init_vring_queue(struct virtio_net *dev, uint32_t vring_idx) { + struct vhost_virtqueue *vq; + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed not init vring, out of bound (%d)\n", + vring_idx); + return; + } + + vq = dev->virtqueue[vring_idx]; + memset(vq, 0, sizeof(struct vhost_virtqueue)); vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + vhost_user_iotlb_init(dev, vring_idx); /* Backends are set to -1 indicating an inactive device. */ vq->backend = -1; - /* - * always set the vq to enabled; this is to keep compatibility - * with the old QEMU, whereas there is no SET_VRING_ENABLE message. 
- */ - vq->enabled = 1; - TAILQ_INIT(&vq->zmbuf_list); } static void -reset_vring_queue(struct vhost_virtqueue *vq) +reset_vring_queue(struct virtio_net *dev, uint32_t vring_idx) { + struct vhost_virtqueue *vq; int callfd; + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed not init vring, out of bound (%d)\n", + vring_idx); + return; + } + + vq = dev->virtqueue[vring_idx]; callfd = vq->callfd; - init_vring_queue(vq); + init_vring_queue(dev, vring_idx); vq->callfd = callfd; } @@ -152,7 +258,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) } dev->virtqueue[vring_idx] = vq; - init_vring_queue(vq); + init_vring_queue(dev, vring_idx); dev->nr_vring += 1; @@ -174,7 +280,7 @@ reset_device(struct virtio_net *dev) dev->flags = 0; for (i = 0; i < dev->nr_vring; i++) - reset_vring_queue(dev->virtqueue[i]); + reset_vring_queue(dev, i); } /* @@ -207,6 +313,7 @@ vhost_new_device(void) vhost_devices[i] = dev; dev->vid = i; + dev->slave_req_fd = -1; return i; } diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 6fe72aeb..1cc81c17 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -45,6 +45,7 @@ #include #include +#include #include "rte_vhost.h" @@ -81,6 +82,16 @@ struct zcopy_mbuf { }; TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); +/* + * Structure contains the info for each batched memory copy. + */ +struct batch_copy_elem { + void *dst; + void *src; + uint32_t len; + uint64_t log_addr; +}; + /** * Structure contains variables relevant to RX/TX virtqueues. */ @@ -102,6 +113,7 @@ struct vhost_virtqueue { /* Currently unused as polling mode is enabled */ int kickfd; int enabled; + int access_ok; /* Physical address of used ring, for logging */ uint64_t log_guest_addr; @@ -114,6 +126,17 @@ struct vhost_virtqueue { struct vring_used_elem *shadow_used_ring; uint16_t shadow_used_idx; + struct vhost_vring_addr ring_addrs; + + struct batch_copy_elem *batch_copy_elems; + uint16_t batch_copy_nb_elems; + + rte_rwlock_t iotlb_lock; + rte_rwlock_t iotlb_pending_lock; + struct rte_mempool *iotlb_pool; + TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list; + int iotlb_cache_nr; + TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list; } __rte_cache_aligned; /* Old kernels have no such macros defined */ @@ -132,6 +155,37 @@ struct vhost_virtqueue { #define VIRTIO_NET_F_MTU 3 #endif +/* Declare IOMMU related bits for older kernels */ +#ifndef VIRTIO_F_IOMMU_PLATFORM + +#define VIRTIO_F_IOMMU_PLATFORM 33 + +struct vhost_iotlb_msg { + __u64 iova; + __u64 size; + __u64 uaddr; +#define VHOST_ACCESS_RO 0x1 +#define VHOST_ACCESS_WO 0x2 +#define VHOST_ACCESS_RW 0x3 + __u8 perm; +#define VHOST_IOTLB_MISS 1 +#define VHOST_IOTLB_UPDATE 2 +#define VHOST_IOTLB_INVALIDATE 3 +#define VHOST_IOTLB_ACCESS_FAIL 4 + __u8 type; +}; + +#define VHOST_IOTLB_MSG 0x1 + +struct vhost_msg { + int type; + union { + struct vhost_iotlb_msg iotlb; + __u8 padding[64]; + }; +}; +#endif + /* * Define virtio 1.0 for older kernels */ @@ -157,7 +211,8 @@ struct vhost_virtqueue { (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ - (1ULL << VIRTIO_NET_F_MTU)) + (1ULL << VIRTIO_NET_F_MTU) | \ + (1ULL << VIRTIO_F_IOMMU_PLATFORM)) struct guest_page { @@ -196,6 +251,8 @@ struct virtio_net { uint32_t nr_guest_pages; uint32_t max_guest_pages; struct guest_page *guest_pages; + + int slave_req_fd; } __rte_cache_aligned; @@ -281,7 +338,7 @@ extern uint64_t VHOST_FEATURES; extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; /* 
Convert guest physical address to host physical address */ -static __rte_always_inline phys_addr_t +static __rte_always_inline rte_iova_t gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) { uint32_t i; @@ -321,4 +378,19 @@ struct vhost_device_ops const *vhost_driver_callback_get(const char *path); */ void vhost_backend_cleanup(struct virtio_net *dev); +uint64_t __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size, uint8_t perm); +int vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq); +void vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq); + +static __rte_always_inline uint64_t +vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t iova, uint64_t size, uint8_t perm) +{ + if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + return rte_vhost_gpa_to_vva(dev->mem, iova); + + return __vhost_iova_to_vva(dev, vq, iova, size, perm); +} + #endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c index ad2e8d38..f4c7ce46 100644 --- a/lib/librte_vhost/vhost_user.c +++ b/lib/librte_vhost/vhost_user.c @@ -48,6 +48,7 @@ #include #include +#include "iotlb.h" #include "vhost.h" #include "vhost_user.h" @@ -76,6 +77,8 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", + [VHOST_USER_SET_SLAVE_REQ_FD] = "VHOST_USER_SET_SLAVE_REQ_FD", + [VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG", }; static uint64_t @@ -122,6 +125,11 @@ vhost_backend_cleanup(struct virtio_net *dev) munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); dev->log_addr = 0; } + + if (dev->slave_req_fd >= 0) { + close(dev->slave_req_fd); + dev->slave_req_fd = -1; + } } /* @@ -230,6 +238,15 @@ vhost_user_set_vring_num(struct virtio_net *dev, return -1; } + vq->batch_copy_elems = rte_malloc(NULL, + vq->size * sizeof(struct batch_copy_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->batch_copy_elems) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for batching copy.\n"); + return -1; + } + return 0; } @@ -297,6 +314,9 @@ out: dev->virtqueue[index] = vq; vhost_devices[dev->vid] = dev; + if (old_vq != vq) + vhost_user_iotlb_init(dev, index); + return dev; } #else @@ -307,10 +327,7 @@ numa_realloc(struct virtio_net *dev, int index __rte_unused) } #endif -/* - * Converts QEMU virtual address to Vhost virtual address. This function is - * used to convert the ring addresses to our address space. - */ +/* Converts QEMU virtual address to Vhost virtual address. */ static uint64_t qva_to_vva(struct virtio_net *dev, uint64_t qva) { @@ -331,50 +348,69 @@ qva_to_vva(struct virtio_net *dev, uint64_t qva) return 0; } + /* - * The virtio device sends us the desc, used and avail ring addresses. - * This function then converts these to our address space. + * Converts ring address to Vhost virtual address. + * If IOMMU is enabled, the ring address is a guest IO virtual address, + * else it is a QEMU virtual address. 
*/ -static int -vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) +static uint64_t +ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t ra, uint64_t size) { - struct vhost_virtqueue *vq; + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { + uint64_t vva; - if (dev->mem == NULL) - return -1; + vva = vhost_user_iotlb_cache_find(vq, ra, + &size, VHOST_ACCESS_RW); + if (!vva) + vhost_user_iotlb_miss(dev, ra, VHOST_ACCESS_RW); - /* addr->index refers to the queue index. The txq 1, rxq is 0. */ - vq = dev->virtqueue[msg->payload.addr.index]; + return vva; + } + + return qva_to_vva(dev, ra); +} + +static struct virtio_net * +translate_ring_addresses(struct virtio_net *dev, int vq_index) +{ + struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; + struct vhost_vring_addr *addr = &vq->ring_addrs; /* The addresses are converted from QEMU virtual to Vhost virtual. */ - vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, - msg->payload.addr.desc_user_addr); + if (vq->desc && vq->avail && vq->used) + return dev; + + vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev, + vq, addr->desc_user_addr, sizeof(struct vring_desc)); if (vq->desc == 0) { - RTE_LOG(ERR, VHOST_CONFIG, + RTE_LOG(DEBUG, VHOST_CONFIG, "(%d) failed to find desc ring address.\n", dev->vid); - return -1; + return dev; } - dev = numa_realloc(dev, msg->payload.addr.index); - vq = dev->virtqueue[msg->payload.addr.index]; + dev = numa_realloc(dev, vq_index); + vq = dev->virtqueue[vq_index]; + addr = &vq->ring_addrs; - vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, - msg->payload.addr.avail_user_addr); + vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev, + vq, addr->avail_user_addr, sizeof(struct vring_avail)); if (vq->avail == 0) { - RTE_LOG(ERR, VHOST_CONFIG, + RTE_LOG(DEBUG, VHOST_CONFIG, "(%d) failed to find avail ring address.\n", dev->vid); - return -1; + return dev; } - vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, - msg->payload.addr.used_user_addr); + vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev, + vq, addr->used_user_addr, sizeof(struct vring_used)); if (vq->used == 0) { - RTE_LOG(ERR, VHOST_CONFIG, + RTE_LOG(DEBUG, VHOST_CONFIG, "(%d) failed to find used ring address.\n", dev->vid); - return -1; + return dev; } if (vq->last_used_idx != vq->used->idx) { @@ -386,7 +422,7 @@ vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) vq->last_avail_idx = vq->used->idx; } - vq->log_guest_addr = msg->payload.addr.log_guest_addr; + vq->log_guest_addr = addr->log_guest_addr; LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", dev->vid, vq->desc); @@ -397,6 +433,43 @@ vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", dev->vid, vq->log_guest_addr); + return dev; +} + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. + */ +static int +vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq; + struct vhost_vring_addr *addr = &msg->payload.addr; + struct virtio_net *dev = *pdev; + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. 
*/ + vq = dev->virtqueue[msg->payload.addr.index]; + + /* + * Rings addresses should not be interpreted as long as the ring is not + * started and enabled + */ + memcpy(&vq->ring_addrs, addr, sizeof(*addr)); + + vring_invalidate(dev, vq); + + if (vq->enabled && (dev->features & + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { + dev = translate_ring_addresses(dev, msg->payload.state.index); + if (!dev) + return -1; + + *pdev = dev; + } + return 0; } @@ -453,7 +526,7 @@ add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, uint64_t host_phys_addr; uint64_t size; - host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); + host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr); size = page_size - (guest_phys_addr & (page_size - 1)); size = RTE_MIN(size, reg_size); @@ -464,7 +537,7 @@ add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, while (reg_size > 0) { size = RTE_MIN(reg_size, page_size); - host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) + host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t) host_user_addr); add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); @@ -620,7 +693,7 @@ err_mmap: static int vq_is_ready(struct vhost_virtqueue *vq) { - return vq && vq->desc && + return vq && vq->desc && vq->avail && vq->used && vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; } @@ -668,10 +741,11 @@ vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) } static void -vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) +vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *pmsg) { struct vhost_vring_file file; struct vhost_virtqueue *vq; + struct virtio_net *dev = *pdev; file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) @@ -681,7 +755,23 @@ vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) RTE_LOG(INFO, VHOST_CONFIG, "vring kick idx:%d file:%d\n", file.index, file.fd); + /* Interpret ring addresses only when ring is started. */ + dev = translate_ring_addresses(dev, file.index); + if (!dev) + return; + + *pdev = dev; + vq = dev->virtqueue[file.index]; + + /* + * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated, + * the ring starts already enabled. Otherwise, it is enabled via + * the SET_VRING_ENABLE message. + */ + if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) + vq->enabled = 1; + if (vq->kickfd >= 0) close(vq->kickfd); vq->kickfd = file.fd; @@ -741,6 +831,9 @@ vhost_user_get_vring_base(struct virtio_net *dev, rte_free(vq->shadow_used_ring); vq->shadow_used_ring = NULL; + rte_free(vq->batch_copy_elems); + vq->batch_copy_elems = NULL; + return 0; } @@ -767,6 +860,27 @@ vhost_user_set_vring_enable(struct virtio_net *dev, return 0; } +static void +vhost_user_get_protocol_features(struct virtio_net *dev, + struct VhostUserMsg *msg) +{ + uint64_t features, protocol_features = VHOST_USER_PROTOCOL_FEATURES; + + rte_vhost_driver_get_features(dev->ifname, &features); + + /* + * REPLY_ACK protocol feature is only mandatory for now + * for IOMMU feature. If IOMMU is explicitly disabled by the + * application, disable also REPLY_ACK feature for older buggy + * Qemu versions (from v2.7.0 to v2.9.0). 
+ */ + if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK); + + msg->payload.u64 = protocol_features; + msg->size = sizeof(msg->payload.u64); +} + static void vhost_user_set_protocol_features(struct virtio_net *dev, uint64_t protocol_features) @@ -874,6 +988,116 @@ vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) return 0; } +static int +vhost_user_set_req_fd(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Invalid file descriptor for slave channel (%d)\n", + fd); + return -1; + } + + dev->slave_req_fd = fd; + + return 0; +} + +static int +is_vring_iotlb_update(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) +{ + struct vhost_vring_addr *ra; + uint64_t start, end; + + start = imsg->iova; + end = start + imsg->size; + + ra = &vq->ring_addrs; + if (ra->desc_user_addr >= start && ra->desc_user_addr < end) + return 1; + if (ra->avail_user_addr >= start && ra->avail_user_addr < end) + return 1; + if (ra->used_user_addr >= start && ra->used_user_addr < end) + return 1; + + return 0; +} + +static int +is_vring_iotlb_invalidate(struct vhost_virtqueue *vq, + struct vhost_iotlb_msg *imsg) +{ + uint64_t istart, iend, vstart, vend; + + istart = imsg->iova; + iend = istart + imsg->size - 1; + + vstart = (uintptr_t)vq->desc; + vend = vstart + sizeof(struct vring_desc) * vq->size - 1; + if (vstart <= iend && istart <= vend) + return 1; + + vstart = (uintptr_t)vq->avail; + vend = vstart + sizeof(struct vring_avail); + vend += sizeof(uint16_t) * vq->size - 1; + if (vstart <= iend && istart <= vend) + return 1; + + vstart = (uintptr_t)vq->used; + vend = vstart + sizeof(struct vring_used); + vend += sizeof(struct vring_used_elem) * vq->size - 1; + if (vstart <= iend && istart <= vend) + return 1; + + return 0; +} + +static int +vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg) +{ + struct virtio_net *dev = *pdev; + struct vhost_iotlb_msg *imsg = &msg->payload.iotlb; + uint16_t i; + uint64_t vva; + + switch (imsg->type) { + case VHOST_IOTLB_UPDATE: + vva = qva_to_vva(dev, imsg->uaddr); + if (!vva) + return -1; + + for (i = 0; i < dev->nr_vring; i++) { + struct vhost_virtqueue *vq = dev->virtqueue[i]; + + vhost_user_iotlb_cache_insert(vq, imsg->iova, vva, + imsg->size, imsg->perm); + + if (is_vring_iotlb_update(vq, imsg)) + *pdev = dev = translate_ring_addresses(dev, i); + } + break; + case VHOST_IOTLB_INVALIDATE: + for (i = 0; i < dev->nr_vring; i++) { + struct vhost_virtqueue *vq = dev->virtqueue[i]; + + vhost_user_iotlb_cache_remove(vq, imsg->iova, + imsg->size); + + if (is_vring_iotlb_invalidate(vq, imsg)) + vring_invalidate(dev, vq); + } + break; + default: + RTE_LOG(ERR, VHOST_CONFIG, "Invalid IOTLB message type (%d)\n", + imsg->type); + return -1; + } + + return 0; +} + /* return bytes# of read on success or negative val on failure. 
*/ static int read_vhost_message(int sockfd, struct VhostUserMsg *msg) @@ -907,8 +1131,16 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg) static int send_vhost_message(int sockfd, struct VhostUserMsg *msg) { - int ret; + if (!msg) + return 0; + + return send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); +} +static int +send_vhost_reply(int sockfd, struct VhostUserMsg *msg) +{ if (!msg) return 0; @@ -917,10 +1149,7 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg) msg->flags |= VHOST_USER_VERSION; msg->flags |= VHOST_USER_REPLY_MASK; - ret = send_fd_message(sockfd, (char *)msg, - VHOST_USER_HDR_SIZE + msg->size, NULL, 0); - - return ret; + return send_vhost_message(sockfd, msg); } /* @@ -931,7 +1160,7 @@ vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) { uint16_t vring_idx; - switch (msg->request) { + switch (msg->request.master) { case VHOST_USER_SET_VRING_KICK: case VHOST_USER_SET_VRING_CALL: case VHOST_USER_SET_VRING_ERR: @@ -983,7 +1212,7 @@ vhost_user_msg_handler(int vid, int fd) } ret = read_vhost_message(fd, &msg); - if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + if (ret <= 0 || msg.request.master >= VHOST_USER_MAX) { if (ret < 0) RTE_LOG(ERR, VHOST_CONFIG, "vhost read message failed\n"); @@ -998,8 +1227,12 @@ vhost_user_msg_handler(int vid, int fd) } ret = 0; - RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", - vhost_message_str[msg.request]); + if (msg.request.master != VHOST_USER_IOTLB_MSG) + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", + vhost_message_str[msg.request.master]); + else + RTE_LOG(DEBUG, VHOST_CONFIG, "read message %s\n", + vhost_message_str[msg.request.master]); ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); if (ret < 0) { @@ -1008,20 +1241,19 @@ vhost_user_msg_handler(int vid, int fd) return -1; } - switch (msg.request) { + switch (msg.request.master) { case VHOST_USER_GET_FEATURES: msg.payload.u64 = vhost_user_get_features(dev); msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); + send_vhost_reply(fd, &msg); break; case VHOST_USER_SET_FEATURES: vhost_user_set_features(dev, msg.payload.u64); break; case VHOST_USER_GET_PROTOCOL_FEATURES: - msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; - msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); + vhost_user_get_protocol_features(dev, &msg); + send_vhost_reply(fd, &msg); break; case VHOST_USER_SET_PROTOCOL_FEATURES: vhost_user_set_protocol_features(dev, msg.payload.u64); @@ -1043,7 +1275,7 @@ vhost_user_msg_handler(int vid, int fd) /* it needs a reply */ msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); + send_vhost_reply(fd, &msg); break; case VHOST_USER_SET_LOG_FD: close(msg.fds[0]); @@ -1054,7 +1286,7 @@ vhost_user_msg_handler(int vid, int fd) vhost_user_set_vring_num(dev, &msg); break; case VHOST_USER_SET_VRING_ADDR: - vhost_user_set_vring_addr(dev, &msg); + vhost_user_set_vring_addr(&dev, &msg); break; case VHOST_USER_SET_VRING_BASE: vhost_user_set_vring_base(dev, &msg); @@ -1063,11 +1295,11 @@ vhost_user_msg_handler(int vid, int fd) case VHOST_USER_GET_VRING_BASE: vhost_user_get_vring_base(dev, &msg); msg.size = sizeof(msg.payload.state); - send_vhost_message(fd, &msg); + send_vhost_reply(fd, &msg); break; case VHOST_USER_SET_VRING_KICK: - vhost_user_set_vring_kick(dev, &msg); + vhost_user_set_vring_kick(&dev, &msg); break; case VHOST_USER_SET_VRING_CALL: vhost_user_set_vring_call(dev, &msg); @@ -1082,7 +1314,7 @@ vhost_user_msg_handler(int vid, int fd) case 
VHOST_USER_GET_QUEUE_NUM: msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); + send_vhost_reply(fd, &msg); break; case VHOST_USER_SET_VRING_ENABLE: @@ -1096,6 +1328,14 @@ vhost_user_msg_handler(int vid, int fd) ret = vhost_user_net_set_mtu(dev, &msg); break; + case VHOST_USER_SET_SLAVE_REQ_FD: + ret = vhost_user_set_req_fd(dev, &msg); + break; + + case VHOST_USER_IOTLB_MSG: + ret = vhost_user_iotlb_msg(&dev, &msg); + break; + default: ret = -1; break; @@ -1105,7 +1345,7 @@ vhost_user_msg_handler(int vid, int fd) if (msg.flags & VHOST_USER_NEED_REPLY) { msg.payload.u64 = !!ret; msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); + send_vhost_reply(fd, &msg); } if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { @@ -1124,3 +1364,29 @@ vhost_user_msg_handler(int vid, int fd) return 0; } + +int +vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) +{ + int ret; + struct VhostUserMsg msg = { + .request.slave = VHOST_USER_SLAVE_IOTLB_MSG, + .flags = VHOST_USER_VERSION, + .size = sizeof(msg.payload.iotlb), + .payload.iotlb = { + .iova = iova, + .perm = perm, + .type = VHOST_IOTLB_MISS, + }, + }; + + ret = send_vhost_message(dev->slave_req_fd, &msg); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to send IOTLB miss message (%d)\n", + ret); + return ret; + } + + return 0; +} diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h index 35ebd719..76d9fe2f 100644 --- a/lib/librte_vhost/vhost_user.h +++ b/lib/librte_vhost/vhost_user.h @@ -48,16 +48,14 @@ #define VHOST_USER_PROTOCOL_F_RARP 2 #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 #define VHOST_USER_PROTOCOL_F_NET_MTU 4 +#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 -/* - * disable REPLY_ACK feature to workaround the buggy QEMU implementation. - * Proved buggy QEMU includes v2.7 - v2.9. 
- */ #define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \ - (0ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ - (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU)) + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ + (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \ + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) typedef enum VhostUserRequest { VHOST_USER_NONE = 0, @@ -81,9 +79,17 @@ typedef enum VhostUserRequest { VHOST_USER_SET_VRING_ENABLE = 18, VHOST_USER_SEND_RARP = 19, VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_SLAVE_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, VHOST_USER_MAX } VhostUserRequest; +typedef enum VhostUserSlaveRequest { + VHOST_USER_SLAVE_NONE = 0, + VHOST_USER_SLAVE_IOTLB_MSG = 1, + VHOST_USER_SLAVE_MAX +} VhostUserSlaveRequest; + typedef struct VhostUserMemoryRegion { uint64_t guest_phys_addr; uint64_t memory_size; @@ -103,7 +109,10 @@ typedef struct VhostUserLog { } VhostUserLog; typedef struct VhostUserMsg { - VhostUserRequest request; + union { + VhostUserRequest master; + VhostUserSlaveRequest slave; + } request; #define VHOST_USER_VERSION_MASK 0x3 #define VHOST_USER_REPLY_MASK (0x1 << 2) @@ -118,6 +127,7 @@ typedef struct VhostUserMsg { struct vhost_vring_addr addr; VhostUserMemory memory; VhostUserLog log; + struct vhost_iotlb_msg iotlb; } payload; int fds[VHOST_MEMORY_MAX_NREGIONS]; } __attribute((packed)) VhostUserMsg; @@ -130,6 +140,7 @@ typedef struct VhostUserMsg { /* vhost_user.c */ int vhost_user_msg_handler(int vid, int fd); +int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm); /* socket.c */ int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index a5f0eeba..6fee16e5 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -45,10 +45,13 @@ #include #include +#include "iotlb.h" #include "vhost.h" #define MAX_PKT_BURST 32 +#define MAX_BATCH_LEN 256 + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -105,6 +108,31 @@ update_shadow_used_ring(struct vhost_virtqueue *vq, vq->shadow_used_ring[i].len = len; } +static inline void +do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + struct batch_copy_elem *elem = vq->batch_copy_elems; + uint16_t count = vq->batch_copy_nb_elems; + int i; + + for (i = 0; i < count; i++) { + rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); + vhost_log_write(dev, elem[i].log_addr, elem[i].len); + PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0); + } +} + +static inline void +do_data_copy_dequeue(struct vhost_virtqueue *vq) +{ + struct batch_copy_elem *elem = vq->batch_copy_elems; + uint16_t count = vq->batch_copy_nb_elems; + int i; + + for (i = 0; i < count; i++) + rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -168,8 +196,9 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) } static __rte_always_inline int -copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs, - struct rte_mbuf *m, uint16_t desc_idx, uint32_t size) +copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct vring_desc *descs, struct rte_mbuf *m, + uint16_t desc_idx, uint32_t size) { uint32_t desc_avail, desc_offset; uint32_t mbuf_avail, mbuf_offset; @@ 
-178,16 +207,22 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs, uint64_t desc_addr; /* A counter to avoid desc dead loop chain */ uint16_t nr_desc = 1; + struct batch_copy_elem *batch_copy = vq->batch_copy_elems; + uint16_t copy_nb = vq->batch_copy_nb_elems; + int error = 0; desc = &descs[desc_idx]; - desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr); + desc_addr = vhost_iova_to_vva(dev, vq, desc->addr, + desc->len, VHOST_ACCESS_RW); /* * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid * performance issue with some versions of gcc (4.8.4 and 5.3.0) which * otherwise stores offset on the stack instead of in a register. */ - if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) - return -1; + if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) { + error = -1; + goto out; + } rte_prefetch0((void *)(uintptr_t)desc_addr); @@ -213,27 +248,45 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs, if (desc_avail == 0) { if ((desc->flags & VRING_DESC_F_NEXT) == 0) { /* Room in vring buffer is not enough */ - return -1; + error = -1; + goto out; + } + if (unlikely(desc->next >= size || ++nr_desc > size)) { + error = -1; + goto out; } - if (unlikely(desc->next >= size || ++nr_desc > size)) - return -1; desc = &descs[desc->next]; - desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr); - if (unlikely(!desc_addr)) - return -1; + desc_addr = vhost_iova_to_vva(dev, vq, desc->addr, + desc->len, + VHOST_ACCESS_RW); + if (unlikely(!desc_addr)) { + error = -1; + goto out; + } desc_offset = 0; desc_avail = desc->len; } cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), - cpy_len); - vhost_log_write(dev, desc->addr + desc_offset, cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - cpy_len, 0); + if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) { + rte_memcpy((void *)((uintptr_t)(desc_addr + + desc_offset)), + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), + cpy_len); + vhost_log_write(dev, desc->addr + desc_offset, cpy_len); + PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + cpy_len, 0); + } else { + batch_copy[copy_nb].dst = + (void *)((uintptr_t)(desc_addr + desc_offset)); + batch_copy[copy_nb].src = + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset); + batch_copy[copy_nb].log_addr = desc->addr + desc_offset; + batch_copy[copy_nb].len = cpy_len; + copy_nb++; + } mbuf_avail -= cpy_len; mbuf_offset += cpy_len; @@ -241,7 +294,10 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs, desc_offset += cpy_len; } - return 0; +out: + vq->batch_copy_nb_elems = copy_nb; + + return error; } /** @@ -273,17 +329,29 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, if (unlikely(vq->enabled == 0)) return 0; + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) { + if (unlikely(vring_translate(dev, vq) < 0)) { + count = 0; + goto out; + } + } + avail_idx = *((volatile uint16_t *)&vq->avail->idx); start_idx = vq->last_used_idx; free_entries = avail_idx - start_idx; count = RTE_MIN(count, free_entries); count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); if (count == 0) - return 0; + goto out; LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", dev->vid, start_idx, start_idx + count); + vq->batch_copy_nb_elems = 0; + /* Retrieve all of the desc indexes first to avoid caching issues. 
*/ rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); for (i = 0; i < count; i++) { @@ -304,8 +372,10 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) { descs = (struct vring_desc *)(uintptr_t) - rte_vhost_gpa_to_vva(dev->mem, - vq->desc[desc_idx].addr); + vhost_iova_to_vva(dev, + vq, vq->desc[desc_idx].addr, + vq->desc[desc_idx].len, + VHOST_ACCESS_RO); if (unlikely(!descs)) { count = i; break; @@ -318,19 +388,18 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, sz = vq->size; } - err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz); + err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz); if (unlikely(err)) { - used_idx = (start_idx + i) & (vq->size - 1); - vq->used->ring[used_idx].len = dev->vhost_hlen; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); + count = i; + break; } if (i + 1 < count) rte_prefetch0(&vq->desc[desc_indexes[i+1]]); } + do_data_copy_enqueue(dev, vq); + rte_smp_wmb(); *(volatile uint16_t *)&vq->used->idx += count; @@ -346,6 +415,10 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) && (vq->callfd >= 0)) eventfd_write(vq->callfd, (eventfd_t)1); +out: + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_unlock(vq); + return count; } @@ -364,7 +437,9 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) { descs = (struct vring_desc *)(uintptr_t) - rte_vhost_gpa_to_vva(dev->mem, vq->desc[idx].addr); + vhost_iova_to_vva(dev, vq, vq->desc[idx].addr, + vq->desc[idx].len, + VHOST_ACCESS_RO); if (unlikely(!descs)) return -1; @@ -439,8 +514,9 @@ reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, } static __rte_always_inline int -copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, - struct buf_vector *buf_vec, uint16_t num_buffers) +copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *m, struct buf_vector *buf_vec, + uint16_t num_buffers) { uint32_t vec_idx = 0; uint64_t desc_addr; @@ -449,13 +525,22 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, uint32_t cpy_len; uint64_t hdr_addr, hdr_phys_addr; struct rte_mbuf *hdr_mbuf; + struct batch_copy_elem *batch_copy = vq->batch_copy_elems; + uint16_t copy_nb = vq->batch_copy_nb_elems; + int error = 0; - if (unlikely(m == NULL)) - return -1; + if (unlikely(m == NULL)) { + error = -1; + goto out; + } - desc_addr = rte_vhost_gpa_to_vva(dev->mem, buf_vec[vec_idx].buf_addr); - if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) - return -1; + desc_addr = vhost_iova_to_vva(dev, vq, buf_vec[vec_idx].buf_addr, + buf_vec[vec_idx].buf_len, + VHOST_ACCESS_RW); + if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) { + error = -1; + goto out; + } hdr_mbuf = m; hdr_addr = desc_addr; @@ -474,10 +559,15 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, /* done with current desc buf, get the next one */ if (desc_avail == 0) { vec_idx++; - desc_addr = rte_vhost_gpa_to_vva(dev->mem, - buf_vec[vec_idx].buf_addr); - if (unlikely(!desc_addr)) - return -1; + desc_addr = + vhost_iova_to_vva(dev, vq, + buf_vec[vec_idx].buf_addr, + buf_vec[vec_idx].buf_len, + VHOST_ACCESS_RW); + if (unlikely(!desc_addr)) { + error = -1; + goto out; + } /* Prefetch buffer address. 
*/ rte_prefetch0((void *)(uintptr_t)desc_addr); @@ -509,13 +599,27 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, } cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), - cpy_len); - vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset, - cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - cpy_len, 0); + + if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) { + rte_memcpy((void *)((uintptr_t)(desc_addr + + desc_offset)), + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), + cpy_len); + vhost_log_write(dev, + buf_vec[vec_idx].buf_addr + desc_offset, + cpy_len); + PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + cpy_len, 0); + } else { + batch_copy[copy_nb].dst = + (void *)((uintptr_t)(desc_addr + desc_offset)); + batch_copy[copy_nb].src = + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset); + batch_copy[copy_nb].log_addr = + buf_vec[vec_idx].buf_addr + desc_offset; + batch_copy[copy_nb].len = cpy_len; + copy_nb++; + } mbuf_avail -= cpy_len; mbuf_offset += cpy_len; @@ -523,7 +627,10 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, desc_offset += cpy_len; } - return 0; +out: + vq->batch_copy_nb_elems = copy_nb; + + return error; } static __rte_always_inline uint32_t @@ -547,9 +654,18 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, if (unlikely(vq->enabled == 0)) return 0; + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) + if (unlikely(vring_translate(dev, vq) < 0)) + goto out; + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); if (count == 0) - return 0; + goto out; + + vq->batch_copy_nb_elems = 0; rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); @@ -572,7 +688,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, dev->vid, vq->last_avail_idx, vq->last_avail_idx + num_buffers); - if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx], + if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx], buf_vec, num_buffers) < 0) { vq->shadow_used_idx -= num_buffers; break; @@ -581,6 +697,8 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, vq->last_avail_idx += num_buffers; } + do_data_copy_enqueue(dev, vq); + if (likely(vq->shadow_used_idx)) { flush_shadow_used_ring(dev, vq); @@ -593,6 +711,10 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, eventfd_write(vq->callfd, (eventfd_t)1); } +out: + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_unlock(vq); + return pkt_idx; } @@ -766,8 +888,9 @@ put_zmbuf(struct zcopy_mbuf *zmbuf) } static __rte_always_inline int -copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, - uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx, +copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct vring_desc *descs, uint16_t max_desc, + struct rte_mbuf *m, uint16_t desc_idx, struct rte_mempool *mbuf_pool) { struct vring_desc *desc; @@ -779,15 +902,25 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, struct virtio_net_hdr *hdr = NULL; /* A counter to avoid desc dead loop chain */ uint32_t nr_desc = 1; + struct batch_copy_elem *batch_copy = vq->batch_copy_elems; + uint16_t copy_nb = vq->batch_copy_nb_elems; + int error = 0; desc = &descs[desc_idx]; if (unlikely((desc->len < dev->vhost_hlen)) || - (desc->flags & VRING_DESC_F_INDIRECT)) - 
return -1; + (desc->flags & VRING_DESC_F_INDIRECT)) { + error = -1; + goto out; + } - desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr); - if (unlikely(!desc_addr)) - return -1; + desc_addr = vhost_iova_to_vva(dev, + vq, desc->addr, + desc->len, + VHOST_ACCESS_RO); + if (unlikely(!desc_addr)) { + error = -1; + goto out; + } if (virtio_net_with_host_offload(dev)) { hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr); @@ -802,12 +935,19 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, if (likely((desc->len == dev->vhost_hlen) && (desc->flags & VRING_DESC_F_NEXT) != 0)) { desc = &descs[desc->next]; - if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) - return -1; + if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) { + error = -1; + goto out; + } - desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr); - if (unlikely(!desc_addr)) - return -1; + desc_addr = vhost_iova_to_vva(dev, + vq, desc->addr, + desc->len, + VHOST_ACCESS_RO); + if (unlikely(!desc_addr)) { + error = -1; + goto out; + } desc_offset = 0; desc_avail = desc->len; @@ -838,7 +978,7 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, cur->data_len = cpy_len; cur->data_off = 0; cur->buf_addr = (void *)(uintptr_t)desc_addr; - cur->buf_physaddr = hpa; + cur->buf_iova = hpa; /* * In zero copy mode, one mbuf can only reference data @@ -846,10 +986,24 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, */ mbuf_avail = cpy_len; } else { - rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, - mbuf_offset), - (void *)((uintptr_t)(desc_addr + desc_offset)), - cpy_len); + if (likely(cpy_len > MAX_BATCH_LEN || + copy_nb >= vq->size || + (hdr && cur == m))) { + rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, + mbuf_offset), + (void *)((uintptr_t)(desc_addr + + desc_offset)), + cpy_len); + } else { + batch_copy[copy_nb].dst = + rte_pktmbuf_mtod_offset(cur, void *, + mbuf_offset); + batch_copy[copy_nb].src = + (void *)((uintptr_t)(desc_addr + + desc_offset)); + batch_copy[copy_nb].len = cpy_len; + copy_nb++; + } } mbuf_avail -= cpy_len; @@ -863,15 +1017,24 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, break; if (unlikely(desc->next >= max_desc || - ++nr_desc > max_desc)) - return -1; + ++nr_desc > max_desc)) { + error = -1; + goto out; + } desc = &descs[desc->next]; - if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) - return -1; + if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) { + error = -1; + goto out; + } - desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr); - if (unlikely(!desc_addr)) - return -1; + desc_addr = vhost_iova_to_vva(dev, + vq, desc->addr, + desc->len, + VHOST_ACCESS_RO); + if (unlikely(!desc_addr)) { + error = -1; + goto out; + } rte_prefetch0((void *)(uintptr_t)desc_addr); @@ -890,7 +1053,8 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, if (unlikely(cur == NULL)) { RTE_LOG(ERR, VHOST_DATA, "Failed to " "allocate memory for mbuf.\n"); - return -1; + error = -1; + goto out; } if (unlikely(dev->dequeue_zero_copy)) rte_mbuf_refcnt_update(cur, 1); @@ -912,7 +1076,10 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, if (hdr) vhost_dequeue_offload(hdr, m); - return 0; +out: + vq->batch_copy_nb_elems = copy_nb; + + return error; } static __rte_always_inline void @@ -1016,6 +1183,15 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, if (unlikely(vq->enabled == 0)) return 0; + vq->batch_copy_nb_elems = 0; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + 
vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) + if (unlikely(vring_translate(dev, vq) < 0)) + goto out; + if (unlikely(dev->dequeue_zero_copy)) { struct zcopy_mbuf *zmbuf, *next; int nr_updated = 0; @@ -1115,8 +1291,10 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) { desc = (struct vring_desc *)(uintptr_t) - rte_vhost_gpa_to_vva(dev->mem, - vq->desc[desc_indexes[i]].addr); + vhost_iova_to_vva(dev, vq, + vq->desc[desc_indexes[i]].addr, + sizeof(*desc), + VHOST_ACCESS_RO); if (unlikely(!desc)) break; @@ -1136,7 +1314,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, break; } - err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool); + err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx, + mbuf_pool); if (unlikely(err)) { rte_pktmbuf_free(pkts[i]); break; @@ -1168,11 +1347,15 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, vq->last_avail_idx += i; if (likely(dev->dequeue_zero_copy == 0)) { + do_data_copy_dequeue(vq); vq->last_used_idx += i; update_used_idx(dev, vq, i); } out: + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_unlock(vq); + if (unlikely(rarp_mbuf != NULL)) { /* * Inject it to the head of "pkts" array, so that switch's mac -- cgit 1.2.3-korg
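
Usage note (not part of the patch): this release extends the public vhost-user API. rte_vhost.h gains the RTE_VHOST_USER_IOMMU_SUPPORT registration flag plus new_connection()/destroy_connection() callbacks in struct vhost_device_ops, and socket.c strips VIRTIO_F_IOMMU_PLATFORM from the advertised features unless that flag is passed. The sketch below shows how an application built against 17.11-rc3 might opt in; the socket path and callback bodies are illustrative assumptions, only the flag, structure and function names come from the library headers.

#include <stdio.h>
#include <stdint.h>
#include <rte_vhost.h>

/* Illustrative socket path. */
#define VHOST_SOCK_PATH "/tmp/vhost-user.sock"

/* Hypothetical callbacks; only the signatures come from rte_vhost.h. */
static int
new_connection(int vid)
{
	printf("vhost-user connection opened, vid %d\n", vid);
	return 0;	/* a negative value makes the library drop the connection */
}

static void
destroy_connection(int vid)
{
	printf("vhost-user connection closed, vid %d\n", vid);
}

static int
new_device(int vid)
{
	printf("virtio device %d ready\n", vid);
	return 0;
}

static void
destroy_device(int vid)
{
	printf("virtio device %d removed\n", vid);
}

static const struct vhost_device_ops ops = {
	.new_device         = new_device,
	.destroy_device     = destroy_device,
	.new_connection     = new_connection,
	.destroy_connection = destroy_connection,
};

int
setup_vhost_socket(void)
{
	/* Without RTE_VHOST_USER_IOMMU_SUPPORT the library clears
	 * VIRTIO_F_IOMMU_PLATFORM from the supported feature bits. */
	uint64_t flags = RTE_VHOST_USER_IOMMU_SUPPORT;

	if (rte_vhost_driver_register(VHOST_SOCK_PATH, flags) < 0)
		return -1;
	if (rte_vhost_driver_callback_register(VHOST_SOCK_PATH, &ops) < 0)
		return -1;

	return rte_vhost_driver_start(VHOST_SOCK_PATH);
}

In the default server mode QEMU connects to this socket; new_connection() fires on each connect, while new_device() still fires only once the rings are set up and translated.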
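
Translation path, for reference: when VIRTIO_F_IOMMU_PLATFORM is not negotiated, descriptor addresses are guest physical addresses and vhost_iova_to_vva() falls back to rte_vhost_gpa_to_vva(); when it is negotiated, they are I/O virtual addresses resolved through the per-virtqueue IOTLB cache fed by VHOST_IOTLB_UPDATE messages, and a miss is reported back to QEMU over the slave channel (VHOST_USER_SLAVE_IOTLB_MSG). The standalone model below is a simplified sketch, not the library code: it mirrors how vhost_user_iotlb_cache_find() walks the iova-sorted list, accumulates contiguous entries, and shrinks the requested size when only part of the range is mapped.

#include <stdint.h>
#include <stdio.h>

/* Self-contained model of one IOTLB cache entry; the real code keeps
 * these in a sorted TAILQ protected by iotlb_lock. */
struct tlb_entry {
	uint64_t iova;
	uint64_t uaddr;
	uint64_t size;
};

/* Return the host virtual address for 'iova'; '*size' is shrunk if
 * only part of the requested range is mapped. */
static uint64_t
tlb_find(const struct tlb_entry *tlb, int nr, uint64_t iova, uint64_t *size)
{
	uint64_t offset, vva = 0, mapped = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (iova < tlb[i].iova)
			break;			/* list sorted by iova: no later match */
		if (iova >= tlb[i].iova + tlb[i].size)
			continue;		/* requested address is past this mapping */

		offset = iova - tlb[i].iova;
		if (!vva)
			vva = tlb[i].uaddr + offset;

		mapped += tlb[i].size - offset;
		iova = tlb[i].iova + tlb[i].size;
		if (mapped >= *size)
			break;
	}

	if (mapped < *size)
		*size = mapped;	/* caller must re-translate the remainder */
	return vva;
}

int
main(void)
{
	/* Two contiguous guest IOVA ranges backed by one host mapping. */
	const struct tlb_entry tlb[] = {
		{ 0x1000, 0x7f0000001000, 0x1000 },
		{ 0x2000, 0x7f0000002000, 0x1000 },
	};
	uint64_t len = 0x1800;
	uint64_t vva = tlb_find(tlb, 2, 0x1800, &len);

	printf("vva=0x%llx mapped=0x%llx\n",
	       (unsigned long long)vva, (unsigned long long)len);
	return 0;
}

A lookup that finds nothing, or not enough, is what triggers the vhost_user_iotlb_pending_insert() + vhost_user_iotlb_miss() sequence in __vhost_iova_to_vva(), with the read lock dropped around the miss to avoid deadlocking against an IOTLB update.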
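
The virtio_net.c changes also introduce batched copies: copies of at most MAX_BATCH_LEN (256) bytes are queued in the per-virtqueue batch_copy_elems array and executed in one pass by do_data_copy_enqueue()/do_data_copy_dequeue() just before the used index is published. A minimal sketch of that defer-and-flush pattern follows; the names and the fixed batch capacity are hypothetical, only the threshold and the overall scheme come from the patch.

#include <stdint.h>
#include <string.h>

#define MAX_BATCH_LEN 256	/* same threshold as the patch */

/* Simplified stand-in for struct batch_copy_elem (no log_addr here). */
struct copy_elem {
	void *dst;
	const void *src;
	uint32_t len;
};

struct copy_batch {
	struct copy_elem elem[64];	/* illustrative fixed capacity */
	uint16_t nb;
};

/* Small copies are deferred; large ones, or a full batch, are done
 * immediately, mirroring the copy_mbuf_to_desc() logic. */
static void
queue_copy(struct copy_batch *b, void *dst, const void *src, uint32_t len)
{
	if (len > MAX_BATCH_LEN || b->nb >= 64) {
		memcpy(dst, src, len);
		return;
	}
	b->elem[b->nb].dst = dst;
	b->elem[b->nb].src = src;
	b->elem[b->nb].len = len;
	b->nb++;
}

/* Flush once per burst, like do_data_copy_enqueue()/_dequeue(). */
static void
flush_copies(struct copy_batch *b)
{
	uint16_t i;

	for (i = 0; i < b->nb; i++)
		memcpy(b->elem[i].dst, b->elem[i].src, b->elem[i].len);
	b->nb = 0;
}

int
main(void)
{
	static char src[512] = "payload", dst_small[64], dst_big[512];
	struct copy_batch b = { .nb = 0 };

	queue_copy(&b, dst_small, src, 8);	/* deferred */
	queue_copy(&b, dst_big, src, 512);	/* copied immediately */
	flush_copies(&b);			/* performs the deferred copy */
	return dst_small[0] == 'p' ? 0 : 1;
}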