15 files changed, 6318 insertions, 7239 deletions
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index c045bd79..f1f47c28 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -1,7 +1,7 @@
 #   BSD LICENSE
 #
-#   Copyright 2012-2015 6WIND S.A.
-#   Copyright 2012 Mellanox.
+#   Copyright 2012 6WIND S.A.
+#   Copyright 2012 Mellanox
 #
 #   Redistribution and use in source and binary forms, with or without
 #   modification, are permitted provided that the following conditions
@@ -36,7 +36,14 @@ LIB = librte_pmd_mlx4.a
 
 # Sources.
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_ethdev.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_txq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c
 
 # Basic CFLAGS.
 CFLAGS += -O3
@@ -47,7 +54,10 @@ CFLAGS += -D_BSD_SOURCE
 CFLAGS += -D_DEFAULT_SOURCE
 CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
-LDLIBS += -libverbs
+LDLIBS += -libverbs -lmlx4
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_pci
 
 # A few warnings cannot be avoided in external headers.
 CFLAGS += -Wno-error=cast-qual
@@ -68,22 +78,10 @@ else
 CFLAGS += -DNDEBUG -UPEDANTIC
 endif
 
-ifdef CONFIG_RTE_LIBRTE_MLX4_SGE_WR_N
-CFLAGS += -DMLX4_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX4_SGE_WR_N)
-endif
-
-ifdef CONFIG_RTE_LIBRTE_MLX4_MAX_INLINE
-CFLAGS += -DMLX4_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX4_MAX_INLINE)
-endif
-
 ifdef CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE
 CFLAGS += -DMLX4_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE)
 endif
 
-ifdef CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS
-CFLAGS += -DMLX4_PMD_SOFT_COUNTERS=$(CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS)
-endif
-
 ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DEBUG_BROKEN_VERBS),y)
 CFLAGS += -DMLX4_PMD_DEBUG_BROKEN_VERBS
 endif
@@ -103,23 +101,7 @@ mlx4_autoconf.h.new: FORCE
 
 mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
-	$Q sh -- '$<' '$@' \
-		RSS_SUPPORT \
-		infiniband/verbs.h \
-		enum IBV_EXP_DEVICE_UD_RSS $(AUTOCONF_OUTPUT)
-	$Q sh -- '$<' '$@' \
-		INLINE_RECV \
-		infiniband/verbs.h \
-		enum IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ $(AUTOCONF_OUTPUT)
-	$Q sh -- '$<' '$@' \
-		HAVE_EXP_QUERY_DEVICE \
-		infiniband/verbs.h \
-		type 'struct ibv_exp_device_attr' $(AUTOCONF_OUTPUT)
-	$Q sh -- '$<' '$@' \
-		HAVE_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK \
-		infiniband/verbs.h \
-		enum IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK \
-		$(AUTOCONF_OUTPUT)
+	$Q : > '$@'
 
 # Create mlx4_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 055de49a..f9e4f9d7 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -1,8 +1,8 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright 2012-2017 6WIND S.A.
- *   Copyright 2012-2017 Mellanox.
+ *   Copyright 2012 6WIND S.A.
+ *   Copyright 2012 Mellanox
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -31,95 +31,52 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/*
- * Known limitations:
- * - RSS hash key and options cannot be modified.
- * - Hardware counters aren't implemented.
+/**
+ * @file
+ * mlx4 driver initialization.
  */
 
-/* System headers. */
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdint.h>
-#include <inttypes.h>
 #include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <limits.h>
-#include <assert.h>
-#include <arpa/inet.h>
-#include <net/if.h>
-#include <dirent.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <linux/ethtool.h>
-#include <linux/sockios.h>
-#include <fcntl.h>
 
-#include <rte_ether.h>
-#include <rte_ethdev.h>
-#include <rte_ethdev_pci.h>
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
 #include <rte_dev.h>
-#include <rte_mbuf.h>
 #include <rte_errno.h>
-#include <rte_mempool.h>
-#include <rte_prefetch.h>
-#include <rte_malloc.h>
-#include <rte_spinlock.h>
-#include <rte_atomic.h>
-#include <rte_version.h>
-#include <rte_log.h>
-#include <rte_alarm.h>
-#include <rte_memory.h>
+#include <rte_ethdev.h>
+#include <rte_ethdev_pci.h>
+#include <rte_ether.h>
 #include <rte_flow.h>
-#include <rte_kvargs.h>
 #include <rte_interrupts.h>
+#include <rte_kvargs.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
 
-/* Generated configuration header. */
-#include "mlx4_autoconf.h"
-
-/* PMD headers. */
 #include "mlx4.h"
 #include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
 
-/* Convenience macros for accessing mbuf fields. */
-#define NEXT(m) ((m)->next)
-#define DATA_LEN(m) ((m)->data_len)
-#define PKT_LEN(m) ((m)->pkt_len)
-#define DATA_OFF(m) ((m)->data_off)
-#define SET_DATA_OFF(m, o) ((m)->data_off = (o))
-#define NB_SEGS(m) ((m)->nb_segs)
-#define PORT(m) ((m)->port)
-
-/* Work Request ID data type (64 bit). */
-typedef union {
-	struct {
-		uint32_t id;
-		uint16_t offset;
-	} data;
-	uint64_t raw;
-} wr_id_t;
-
-#define WR_ID(o) (((wr_id_t *)&(o))->data)
-
-/* Transpose flags. Useful to convert IBV to DPDK flags. */
-#define TRANSPOSE(val, from, to) \
-	(((from) >= (to)) ? \
-	 (((val) & (from)) / ((from) / (to))) : \
-	 (((val) & (from)) * ((to) / (from))))
-
-/* Local storage for secondary process data. */
-struct mlx4_secondary_data {
-	struct rte_eth_dev_data data; /* Local device data. */
-	struct priv *primary_priv; /* Private structure from primary. */
-	struct rte_eth_dev_data *shared_dev_data; /* Shared device data. */
-	rte_spinlock_t lock; /* Port configuration lock. */
-} mlx4_secondary_data[RTE_MAX_ETHPORTS];
-
+/** Configuration structure for device arguments. */
 struct mlx4_conf {
-	uint8_t active_ports;
+	struct {
+		uint32_t present; /**< Bit-field for existing ports. */
+		uint32_t enabled; /**< Bit-field for user-enabled ports. */
+	} ports;
 };
 
 /* Available parameters list. */
@@ -128,593 +85,6 @@ const char *pmd_mlx4_init_params[] = {
 	NULL,
 };
 
-static int
-mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
-
-static int
-mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
-
-static int
-priv_rx_intr_vec_enable(struct priv *priv);
-
-static void
-priv_rx_intr_vec_disable(struct priv *priv);
-
-/**
- * Check if running as a secondary process.
- *
- * @return
- *   Nonzero if running as a secondary process.
- */
-static inline int
-mlx4_is_secondary(void)
-{
-	return rte_eal_process_type() != RTE_PROC_PRIMARY;
-}
-
-/**
- * Return private structure associated with an Ethernet device.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- *
- * @return
- *   Pointer to private structure.
- */
-static struct priv *
-mlx4_get_priv(struct rte_eth_dev *dev)
-{
-	struct mlx4_secondary_data *sd;
-
-	if (!mlx4_is_secondary())
-		return dev->data->dev_private;
-	sd = &mlx4_secondary_data[dev->data->port_id];
-	return sd->data.dev_private;
-}
-
-/**
- * Lock private structure to protect it from concurrent access in the
- * control path.
- *
- * @param priv
- *   Pointer to private structure.
- */
-void priv_lock(struct priv *priv)
-{
-	rte_spinlock_lock(&priv->lock);
-}
-
-/**
- * Unlock private structure.
- *
- * @param priv
- *   Pointer to private structure.
- */
-void priv_unlock(struct priv *priv)
-{
-	rte_spinlock_unlock(&priv->lock);
-}
-
-/* Allocate a buffer on the stack and fill it with a printf format string. */
-#define MKSTR(name, ...) \
-	char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \
-	\
-	snprintf(name, sizeof(name), __VA_ARGS__)
-
-/**
- * Get interface name from private structure.
- *
- * @param[in] priv
- *   Pointer to private structure.
- * @param[out] ifname
- *   Interface name output buffer.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
-{
-	DIR *dir;
-	struct dirent *dent;
-	unsigned int dev_type = 0;
-	unsigned int dev_port_prev = ~0u;
-	char match[IF_NAMESIZE] = "";
-
-	{
-		MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);
-
-		dir = opendir(path);
-		if (dir == NULL)
-			return -1;
-	}
-	while ((dent = readdir(dir)) != NULL) {
-		char *name = dent->d_name;
-		FILE *file;
-		unsigned int dev_port;
-		int r;
-
-		if ((name[0] == '.') &&
-		    ((name[1] == '\0') ||
-		     ((name[1] == '.') && (name[2] == '\0'))))
-			continue;
-
-		MKSTR(path, "%s/device/net/%s/%s",
-		      priv->ctx->device->ibdev_path, name,
-		      (dev_type ? "dev_id" : "dev_port"));
-
-		file = fopen(path, "rb");
-		if (file == NULL) {
-			if (errno != ENOENT)
-				continue;
-			/*
-			 * Switch to dev_id when dev_port does not exist as
-			 * is the case with Linux kernel versions < 3.15.
-			 */
-try_dev_id:
-			match[0] = '\0';
-			if (dev_type)
-				break;
-			dev_type = 1;
-			dev_port_prev = ~0u;
-			rewinddir(dir);
-			continue;
-		}
-		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
-		fclose(file);
-		if (r != 1)
-			continue;
-		/*
-		 * Switch to dev_id when dev_port returns the same value for
-		 * all ports. May happen when using a MOFED release older than
-		 * 3.0 with a Linux kernel >= 3.15.
-		 */
-		if (dev_port == dev_port_prev)
-			goto try_dev_id;
-		dev_port_prev = dev_port;
-		if (dev_port == (priv->port - 1u))
-			snprintf(match, sizeof(match), "%s", name);
-	}
-	closedir(dir);
-	if (match[0] == '\0')
-		return -1;
-	strncpy(*ifname, match, sizeof(*ifname));
-	return 0;
-}
-
-/**
- * Read from sysfs entry.
- *
- * @param[in] priv
- *   Pointer to private structure.
- * @param[in] entry
- *   Entry name relative to sysfs path.
- * @param[out] buf
- *   Data output buffer.
- * @param size
- *   Buffer size.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_sysfs_read(const struct priv *priv, const char *entry,
-		char *buf, size_t size)
-{
-	char ifname[IF_NAMESIZE];
-	FILE *file;
-	int ret;
-	int err;
-
-	if (priv_get_ifname(priv, &ifname))
-		return -1;
-
-	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
-	      ifname, entry);
-
-	file = fopen(path, "rb");
-	if (file == NULL)
-		return -1;
-	ret = fread(buf, 1, size, file);
-	err = errno;
-	if (((size_t)ret < size) && (ferror(file)))
-		ret = -1;
-	else
-		ret = size;
-	fclose(file);
-	errno = err;
-	return ret;
-}
-
-/**
- * Write to sysfs entry.
- *
- * @param[in] priv
- *   Pointer to private structure.
- * @param[in] entry
- *   Entry name relative to sysfs path.
- * @param[in] buf
- *   Data buffer.
- * @param size
- *   Buffer size.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_sysfs_write(const struct priv *priv, const char *entry,
-		 char *buf, size_t size)
-{
-	char ifname[IF_NAMESIZE];
-	FILE *file;
-	int ret;
-	int err;
-
-	if (priv_get_ifname(priv, &ifname))
-		return -1;
-
-	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
-	      ifname, entry);
-
-	file = fopen(path, "wb");
-	if (file == NULL)
-		return -1;
-	ret = fwrite(buf, 1, size, file);
-	err = errno;
-	if (((size_t)ret < size) || (ferror(file)))
-		ret = -1;
-	else
-		ret = size;
-	fclose(file);
-	errno = err;
-	return ret;
-}
-
-/**
- * Get unsigned long sysfs property.
- *
- * @param priv
- *   Pointer to private structure.
- * @param[in] name
- *   Entry name relative to sysfs path.
- * @param[out] value
- *   Value output buffer.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
-{
-	int ret;
-	unsigned long value_ret;
-	char value_str[32];
-
-	ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
-	if (ret == -1) {
-		DEBUG("cannot read %s value from sysfs: %s",
-		      name, strerror(errno));
-		return -1;
-	}
-	value_str[ret] = '\0';
-	errno = 0;
-	value_ret = strtoul(value_str, NULL, 0);
-	if (errno) {
-		DEBUG("invalid %s value `%s': %s", name, value_str,
-		      strerror(errno));
-		return -1;
-	}
-	*value = value_ret;
-	return 0;
-}
-
-/**
- * Set unsigned long sysfs property.
- *
- * @param priv
- *   Pointer to private structure.
- * @param[in] name
- *   Entry name relative to sysfs path.
- * @param value
- *   Value to set.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
-{
-	int ret;
-	MKSTR(value_str, "%lu", value);
-
-	ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
-	if (ret == -1) {
-		DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
-		      name, value_str, value, strerror(errno));
-		return -1;
-	}
-	return 0;
-}
-
-/**
- * Perform ifreq ioctl() on associated Ethernet device.
- *
- * @param[in] priv
- *   Pointer to private structure.
- * @param req
- *   Request number to pass to ioctl().
- * @param[out] ifr
- *   Interface request structure output buffer.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
-{
-	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-	int ret = -1;
-
-	if (sock == -1)
-		return ret;
-	if (priv_get_ifname(priv, &ifr->ifr_name) == 0)
-		ret = ioctl(sock, req, ifr);
-	close(sock);
-	return ret;
-}
-
-/**
- * Get device MTU.
- *
- * @param priv
- *   Pointer to private structure.
- * @param[out] mtu
- *   MTU value output buffer.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_get_mtu(struct priv *priv, uint16_t *mtu)
-{
-	unsigned long ulong_mtu;
-
-	if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1)
-		return -1;
-	*mtu = ulong_mtu;
-	return 0;
-}
-
-/**
- * Set device MTU.
- *
- * @param priv
- *   Pointer to private structure.
- * @param mtu
- *   MTU value to set.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_set_mtu(struct priv *priv, uint16_t mtu)
-{
-	uint16_t new_mtu;
-
-	if (priv_set_sysfs_ulong(priv, "mtu", mtu) ||
-	    priv_get_mtu(priv, &new_mtu))
-		return -1;
-	if (new_mtu == mtu)
-		return 0;
-	errno = EINVAL;
-	return -1;
-}
-
-/**
- * Set device flags.
- *
- * @param priv
- *   Pointer to private structure.
- * @param keep
- *   Bitmask for flags that must remain untouched.
- * @param flags
- *   Bitmask for flags to modify.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
-{
-	unsigned long tmp;
-
-	if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1)
-		return -1;
-	tmp &= keep;
-	tmp |= (flags & (~keep));
-	return priv_set_sysfs_ulong(priv, "flags", tmp);
-}
-
-/* Device configuration. */
-
-static int
-txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
-	  unsigned int socket, const struct rte_eth_txconf *conf);
-
-static void
-txq_cleanup(struct txq *txq);
-
-static int
-rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
-	  unsigned int socket, int inactive,
-	  const struct rte_eth_rxconf *conf,
-	  struct rte_mempool *mp, int children_n,
-	  struct rxq *rxq_parent);
-
-static void
-rxq_cleanup(struct rxq *rxq);
-
-/**
- * Create RSS parent queue.
- *
- * The new parent is inserted in front of the list in the private structure.
- *
- * @param priv
- *   Pointer to private structure.
- * @param queues
- *   Queues indices array, if NULL use all Rx queues.
- * @param children_n
- *   The number of entries in queues[].
- *
- * @return
- *   Pointer to a parent rxq structure, NULL on failure.
- */
-struct rxq *
-priv_parent_create(struct priv *priv,
-		   uint16_t queues[],
-		   uint16_t children_n)
-{
-	int ret;
-	uint16_t i;
-	struct rxq *parent;
-
-	parent = rte_zmalloc("parent queue",
-			     sizeof(*parent),
-			     RTE_CACHE_LINE_SIZE);
-	if (!parent) {
-		ERROR("cannot allocate memory for RSS parent queue");
-		return NULL;
-	}
-	ret = rxq_setup(priv->dev, parent, 0, 0, 0,
-			NULL, NULL, children_n, NULL);
-	if (ret) {
-		rte_free(parent);
-		return NULL;
-	}
-	parent->rss.queues_n = children_n;
-	if (queues) {
-		for (i = 0; i < children_n; ++i)
-			parent->rss.queues[i] = queues[i];
-	} else {
-		/* the default RSS ring case */
-		assert(priv->rxqs_n == children_n);
-		for (i = 0; i < priv->rxqs_n; ++i)
-			parent->rss.queues[i] = i;
-	}
-	LIST_INSERT_HEAD(&priv->parents, parent, next);
-	return parent;
-}
-
-/**
- * Clean up RX queue parent structure.
- *
- * @param parent
- *   RX queue parent structure.
- */
-void
-rxq_parent_cleanup(struct rxq *parent)
-{
-	LIST_REMOVE(parent, next);
-	rxq_cleanup(parent);
-	rte_free(parent);
-}
-
-/**
- * Clean up parent structures from the parent list.
- *
- * @param priv
- *   Pointer to private structure.
- */
-static void
-priv_parent_list_cleanup(struct priv *priv)
-{
-	while (!LIST_EMPTY(&priv->parents))
-		rxq_parent_cleanup(LIST_FIRST(&priv->parents));
-}
-
-/**
- * Ethernet device configuration.
- *
- * Prepare the driver for a given number of TX and RX queues.
- * Allocate parent RSS queue when several RX queues are requested.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-dev_configure(struct rte_eth_dev *dev)
-{
-	struct priv *priv = dev->data->dev_private;
-	unsigned int rxqs_n = dev->data->nb_rx_queues;
-	unsigned int txqs_n = dev->data->nb_tx_queues;
-	unsigned int tmp;
-
-	priv->rxqs = (void *)dev->data->rx_queues;
-	priv->txqs = (void *)dev->data->tx_queues;
-	if (txqs_n != priv->txqs_n) {
-		INFO("%p: TX queues number update: %u -> %u",
-		     (void *)dev, priv->txqs_n, txqs_n);
-		priv->txqs_n = txqs_n;
-	}
-	if (rxqs_n == priv->rxqs_n)
-		return 0;
-	if (!rte_is_power_of_2(rxqs_n) && !priv->isolated) {
-		unsigned n_active;
-
-		n_active = rte_align32pow2(rxqs_n + 1) >> 1;
-		WARN("%p: number of RX queues must be a power"
-			" of 2: %u queues among %u will be active",
-			(void *)dev, n_active, rxqs_n);
-	}
-
-	INFO("%p: RX queues number update: %u -> %u",
-	     (void *)dev, priv->rxqs_n, rxqs_n);
-	/* If RSS is enabled, disable it first. */
-	if (priv->rss) {
-		unsigned int i;
-
-		/* Only if there are no remaining child RX queues. */
-		for (i = 0; (i != priv->rxqs_n); ++i)
-			if ((*priv->rxqs)[i] != NULL)
-				return EINVAL;
-		priv_parent_list_cleanup(priv);
-		priv->rss = 0;
-		priv->rxqs_n = 0;
-	}
-	if (rxqs_n <= 1) {
-		/* Nothing else to do. */
-		priv->rxqs_n = rxqs_n;
-		return 0;
-	}
-	/* Allocate a new RSS parent queue if supported by hardware. */
-	if (!priv->hw_rss) {
-		ERROR("%p: only a single RX queue can be configured when"
-		      " hardware doesn't support RSS",
-		      (void *)dev);
-		return EINVAL;
-	}
-	/* Fail if hardware doesn't support that many RSS queues. */
-	if (rxqs_n >= priv->max_rss_tbl_sz) {
-		ERROR("%p: only %u RX queues can be configured for RSS",
-		      (void *)dev, priv->max_rss_tbl_sz);
-		return EINVAL;
-	}
-	priv->rss = 1;
-	tmp = priv->rxqs_n;
-	priv->rxqs_n = rxqs_n;
-	if (priv->isolated)
-		return 0;
-	if (priv_parent_create(priv, NULL, priv->rxqs_n))
-		return 0;
-	/* Failure, rollback. */
-	priv->rss = 0;
-	priv->rxqs_n = tmp;
-	return ENOMEM;
-}
-
 /**
  * DPDK callback for Ethernet device configuration.
  *
@@ -722,3490 +92,78 @@ dev_configure(struct rte_eth_dev *dev)
  *   Pointer to Ethernet device structure.
  *
  * @return
- *   0 on success, negative errno value on failure.
+ *   0 on success, negative errno value otherwise and rte_errno is set.
  */
 static int
 mlx4_dev_configure(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
+	struct rte_flow_error error;
 	int ret;
 
-	if (mlx4_is_secondary())
-		return -E_RTE_SECONDARY;
-	priv_lock(priv);
-	ret = dev_configure(dev);
-	assert(ret >= 0);
-	priv_unlock(priv);
-	return -ret;
-}
-
-static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t);
-static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
-
-/**
- * Configure secondary process queues from a private data pointer (primary
- * or secondary) and update burst callbacks. Can take place only once.
- *
- * All queues must have been previously created by the primary process to
- * avoid undefined behavior.
- *
- * @param priv
- *   Private data pointer from either primary or secondary process.
- *
- * @return
- *   Private data pointer from secondary process, NULL in case of error.
- */
-static struct priv *
-mlx4_secondary_data_setup(struct priv *priv)
-{
-	unsigned int port_id = 0;
-	struct mlx4_secondary_data *sd;
-	void **tx_queues;
-	void **rx_queues;
-	unsigned int nb_tx_queues;
-	unsigned int nb_rx_queues;
-	unsigned int i;
-
-	/* priv must be valid at this point. */
-	assert(priv != NULL);
-	/* priv->dev must also be valid but may point to local memory from
-	 * another process, possibly with the same address and must not
-	 * be dereferenced yet. */
-	assert(priv->dev != NULL);
-	/* Determine port ID by finding out where priv comes from. */
-	while (1) {
-		sd = &mlx4_secondary_data[port_id];
-		rte_spinlock_lock(&sd->lock);
-		/* Primary process? */
-		if (sd->primary_priv == priv)
-			break;
-		/* Secondary process? */
-		if (sd->data.dev_private == priv)
-			break;
-		rte_spinlock_unlock(&sd->lock);
-		if (++port_id == RTE_DIM(mlx4_secondary_data))
-			port_id = 0;
-	}
-	/* Switch to secondary private structure. If private data has already
-	 * been updated by another thread, there is nothing else to do. */
-	priv = sd->data.dev_private;
-	if (priv->dev->data == &sd->data)
-		goto end;
-	/* Sanity checks. Secondary private structure is supposed to point
-	 * to local eth_dev, itself still pointing to the shared device data
-	 * structure allocated by the primary process. */
-	assert(sd->shared_dev_data != &sd->data);
-	assert(sd->data.nb_tx_queues == 0);
-	assert(sd->data.tx_queues == NULL);
-	assert(sd->data.nb_rx_queues == 0);
-	assert(sd->data.rx_queues == NULL);
-	assert(priv != sd->primary_priv);
-	assert(priv->dev->data == sd->shared_dev_data);
-	assert(priv->txqs_n == 0);
-	assert(priv->txqs == NULL);
-	assert(priv->rxqs_n == 0);
-	assert(priv->rxqs == NULL);
-	nb_tx_queues = sd->shared_dev_data->nb_tx_queues;
-	nb_rx_queues = sd->shared_dev_data->nb_rx_queues;
-	/* Allocate local storage for queues. */
-	tx_queues = rte_zmalloc("secondary ethdev->tx_queues",
-				sizeof(sd->data.tx_queues[0]) * nb_tx_queues,
-				RTE_CACHE_LINE_SIZE);
-	rx_queues = rte_zmalloc("secondary ethdev->rx_queues",
-				sizeof(sd->data.rx_queues[0]) * nb_rx_queues,
-				RTE_CACHE_LINE_SIZE);
-	if (tx_queues == NULL || rx_queues == NULL)
-		goto error;
-	/* Lock to prevent control operations during setup. */
-	priv_lock(priv);
-	/* TX queues. */
-	for (i = 0; i != nb_tx_queues; ++i) {
-		struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
-		struct txq *txq;
-
-		if (primary_txq == NULL)
-			continue;
-		txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0,
-					primary_txq->socket);
-		if (txq != NULL) {
-			if (txq_setup(priv->dev,
-				      txq,
-				      primary_txq->elts_n * MLX4_PMD_SGE_WR_N,
-				      primary_txq->socket,
-				      NULL) == 0) {
-				txq->stats.idx = primary_txq->stats.idx;
-				tx_queues[i] = txq;
-				continue;
-			}
-			rte_free(txq);
-		}
-		while (i) {
-			txq = tx_queues[--i];
-			txq_cleanup(txq);
-			rte_free(txq);
-		}
-		goto error;
-	}
-	/* RX queues. */
-	for (i = 0; i != nb_rx_queues; ++i) {
-		struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i];
-
-		if (primary_rxq == NULL)
-			continue;
-		/* Not supported yet. */
-		rx_queues[i] = NULL;
-	}
-	/* Update everything. */
-	priv->txqs = (void *)tx_queues;
-	priv->txqs_n = nb_tx_queues;
-	priv->rxqs = (void *)rx_queues;
-	priv->rxqs_n = nb_rx_queues;
-	sd->data.rx_queues = rx_queues;
-	sd->data.tx_queues = tx_queues;
-	sd->data.nb_rx_queues = nb_rx_queues;
-	sd->data.nb_tx_queues = nb_tx_queues;
-	sd->data.dev_link = sd->shared_dev_data->dev_link;
-	sd->data.mtu = sd->shared_dev_data->mtu;
-	memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state,
-	       sizeof(sd->data.rx_queue_state));
-	memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state,
-	       sizeof(sd->data.tx_queue_state));
-	sd->data.dev_flags = sd->shared_dev_data->dev_flags;
-	/* Use local data from now on. */
-	rte_mb();
-	priv->dev->data = &sd->data;
-	rte_mb();
-	priv->dev->tx_pkt_burst = mlx4_tx_burst;
-	priv->dev->rx_pkt_burst = removed_rx_burst;
-	priv_unlock(priv);
-end:
-	/* More sanity checks. */
-	assert(priv->dev->tx_pkt_burst == mlx4_tx_burst);
-	assert(priv->dev->rx_pkt_burst == removed_rx_burst);
-	assert(priv->dev->data == &sd->data);
-	rte_spinlock_unlock(&sd->lock);
-	return priv;
-error:
-	priv_unlock(priv);
-	rte_free(tx_queues);
-	rte_free(rx_queues);
-	rte_spinlock_unlock(&sd->lock);
-	return NULL;
-}
-
-/* TX queues handling. */
-
-/**
- * Allocate TX queue elements.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param elts_n
- *   Number of elements to allocate.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-txq_alloc_elts(struct txq *txq, unsigned int elts_n)
-{
-	unsigned int i;
-	struct txq_elt (*elts)[elts_n] =
-		rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
-	linear_t (*elts_linear)[elts_n] =
-		rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0,
-				  txq->socket);
-	struct ibv_mr *mr_linear = NULL;
-	int ret = 0;
-
-	if ((elts == NULL) || (elts_linear == NULL)) {
-		ERROR("%p: can't allocate packets array", (void *)txq);
-		ret = ENOMEM;
-		goto error;
-	}
-	mr_linear =
-		ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear),
-			   IBV_ACCESS_LOCAL_WRITE);
-	if (mr_linear == NULL) {
-		ERROR("%p: unable to configure MR, ibv_reg_mr() failed",
-		      (void *)txq);
-		ret = EINVAL;
-		goto error;
-	}
-	for (i = 0; (i != elts_n); ++i) {
-		struct txq_elt *elt = &(*elts)[i];
-
-		elt->buf = NULL;
-	}
-	DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n);
-	txq->elts_n = elts_n;
-	txq->elts = elts;
-	txq->elts_head = 0;
-	txq->elts_tail = 0;
-	txq->elts_comp = 0;
-	/* Request send completion every MLX4_PMD_TX_PER_COMP_REQ packets or
-	 * at least 4 times per ring. */
-	txq->elts_comp_cd_init =
-		((MLX4_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
-		 MLX4_PMD_TX_PER_COMP_REQ : (elts_n / 4));
-	txq->elts_comp_cd = txq->elts_comp_cd_init;
-	txq->elts_linear = elts_linear;
-	txq->mr_linear = mr_linear;
-	assert(ret == 0);
-	return 0;
-error:
-	if (mr_linear != NULL)
-		claim_zero(ibv_dereg_mr(mr_linear));
-
-	rte_free(elts_linear);
-	rte_free(elts);
-
-	DEBUG("%p: failed, freed everything", (void *)txq);
-	assert(ret > 0);
-	return ret;
-}
-
-/**
- * Free TX queue elements.
- *
- * @param txq
- *   Pointer to TX queue structure.
- */
-static void
-txq_free_elts(struct txq *txq)
-{
-	unsigned int elts_n = txq->elts_n;
-	unsigned int elts_head = txq->elts_head;
-	unsigned int elts_tail = txq->elts_tail;
-	struct txq_elt (*elts)[elts_n] = txq->elts;
-	linear_t (*elts_linear)[elts_n] = txq->elts_linear;
-	struct ibv_mr *mr_linear = txq->mr_linear;
-
-	DEBUG("%p: freeing WRs", (void *)txq);
-	txq->elts_n = 0;
-	txq->elts_head = 0;
-	txq->elts_tail = 0;
-	txq->elts_comp = 0;
-	txq->elts_comp_cd = 0;
-	txq->elts_comp_cd_init = 0;
-	txq->elts = NULL;
-	txq->elts_linear = NULL;
-	txq->mr_linear = NULL;
-	if (mr_linear != NULL)
-		claim_zero(ibv_dereg_mr(mr_linear));
-
-	rte_free(elts_linear);
-	if (elts == NULL)
-		return;
-	while (elts_tail != elts_head) {
-		struct txq_elt *elt = &(*elts)[elts_tail];
-
-		assert(elt->buf != NULL);
-		rte_pktmbuf_free(elt->buf);
-#ifndef NDEBUG
-		/* Poisoning. */
-		memset(elt, 0x77, sizeof(*elt));
-#endif
-		if (++elts_tail == elts_n)
-			elts_tail = 0;
-	}
-	rte_free(elts);
-}
-
-
-/**
- * Clean up a TX queue.
- *
- * Destroy objects, free allocated memory and reset the structure for reuse.
- *
- * @param txq
- *   Pointer to TX queue structure.
- */
-static void
-txq_cleanup(struct txq *txq)
-{
-	struct ibv_exp_release_intf_params params;
-	size_t i;
-
-	DEBUG("cleaning up %p", (void *)txq);
-	txq_free_elts(txq);
-	if (txq->if_qp != NULL) {
-		assert(txq->priv != NULL);
-		assert(txq->priv->ctx != NULL);
-		assert(txq->qp != NULL);
-		params = (struct ibv_exp_release_intf_params){
-			.comp_mask = 0,
-		};
-		claim_zero(ibv_exp_release_intf(txq->priv->ctx,
-						txq->if_qp,
-						&params));
-	}
-	if (txq->if_cq != NULL) {
-		assert(txq->priv != NULL);
-		assert(txq->priv->ctx != NULL);
-		assert(txq->cq != NULL);
-		params = (struct ibv_exp_release_intf_params){
-			.comp_mask = 0,
-		};
-		claim_zero(ibv_exp_release_intf(txq->priv->ctx,
-						txq->if_cq,
-						&params));
-	}
-	if (txq->qp != NULL)
-		claim_zero(ibv_destroy_qp(txq->qp));
-	if (txq->cq != NULL)
-		claim_zero(ibv_destroy_cq(txq->cq));
-	if (txq->rd != NULL) {
-		struct ibv_exp_destroy_res_domain_attr attr = {
-			.comp_mask = 0,
-		};
-
-		assert(txq->priv != NULL);
-		assert(txq->priv->ctx != NULL);
-		claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx,
-						      txq->rd,
-						      &attr));
-	}
-	for (i = 0; (i != elemof(txq->mp2mr)); ++i) {
-		if (txq->mp2mr[i].mp == NULL)
-			break;
-		assert(txq->mp2mr[i].mr != NULL);
-		claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
-	}
-	memset(txq, 0, sizeof(*txq));
-}
-
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx4_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
- *
- * @param txq
- *   Pointer to TX queue structure.
- *
- * @return
- *   0 on success, -1 on failure.
- */
-static int
-txq_complete(struct txq *txq)
-{
-	unsigned int elts_comp = txq->elts_comp;
-	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
-	int wcs_n;
-
-	if (unlikely(elts_comp == 0))
-		return 0;
-#ifdef DEBUG_SEND
-	DEBUG("%p: processing %u work requests completions",
-	      (void *)txq, elts_comp);
-#endif
-	wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
-	if (unlikely(wcs_n == 0))
-		return 0;
-	if (unlikely(wcs_n < 0)) {
-		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
-		      (void *)txq, wcs_n);
-		return -1;
-	}
-	elts_comp -= wcs_n;
-	assert(elts_comp <= txq->elts_comp);
-	/*
-	 * Assume WC status is successful as nothing can be done about it
-	 * anyway.
-	 */
-	elts_tail += wcs_n * txq->elts_comp_cd_init;
-	if (elts_tail >= elts_n)
-		elts_tail -= elts_n;
-	txq->elts_tail = elts_tail;
-	txq->elts_comp = elts_comp;
-	return 0;
-}
-
-/* State accumulated while walking the memory chunks of a mempool. */
-struct mlx4_check_mempool_data {
-	int ret; /* 0 while chunks are contiguous, -1 otherwise. */
-	char *start; /* Lowest address of the area seen so far. */
-	char *end; /* Address following the last byte of that area. */
-};
-
-/*
- * Per-chunk callback used by mlx4_check_mempool(): grow the tracked area
- * when the chunk is adjacent to it, flag an error otherwise.
- */
-static void mlx4_check_mempool_cb(struct rte_mempool *mp,
-	void *opaque, struct rte_mempool_memhdr *memhdr,
-	unsigned mem_idx)
-{
-	struct mlx4_check_mempool_data *area = opaque;
-	char *chunk = memhdr->addr;
-
-	(void)mp;
-	(void)mem_idx;
-
-	/* A previous chunk already broke contiguity, ignore the rest. */
-	if (area->ret != 0)
-		return;
-	if (area->start == NULL && area->end == NULL) {
-		/* First chunk: it defines the initial area. */
-		area->start = chunk;
-		area->end = chunk + memhdr->len;
-	} else if (area->end == chunk) {
-		/* Chunk immediately follows the area. */
-		area->end += memhdr->len;
-	} else if (area->start == chunk + memhdr->len) {
-		/* Chunk immediately precedes the area. */
-		area->start -= memhdr->len;
-	} else {
-		/* Error, mempool is not virtually contiguous. */
-		area->ret = -1;
-	}
-}
-
-/**
- * Tell whether a mempool is usable, i.e. whether its memory chunks form a
- * single virtually contiguous area.
- *
- * @param[in] mp
- *   Pointer to memory pool.
- * @param[out] start
- *   Receives the start address of the mempool virtual memory area.
- * @param[out] end
- *   Receives the end address of the mempool virtual memory area.
- *
- * @return
- *   0 on success (mempool is virtually contiguous), -1 on error.
- */
-static int mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start,
-	uintptr_t *end)
-{
-	struct mlx4_check_mempool_data area = {
-		.ret = 0,
-		.start = NULL,
-		.end = NULL,
-	};
-
-	rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &area);
-	/* Both bounds are reported even when the check failed. */
-	*start = (uintptr_t)area.start;
-	*end = (uintptr_t)area.end;
-	return area.ret;
-}
-
-/* For best performance, this function should not be inlined. */
-static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
-	__rte_noinline;
-
-/**
- * Register mempool as a memory region.
- *
- * The mempool must be virtually contiguous. When one of its bounds falls
- * inside a known memory segment, that bound is rounded to the segment's
- * hugepage size before registration.
- *
- * @param pd
- *   Pointer to protection domain.
- * @param mp
- *   Pointer to memory pool.
- *
- * @return
- *   Memory region pointer, NULL in case of error.
- */
-static struct ibv_mr *
-mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	uintptr_t start;
-	uintptr_t end;
-	unsigned int i;
-
-	if (mlx4_check_mempool(mp, &start, &end) != 0) {
-		ERROR("mempool %p: not virtually contiguous",
-			(void *)mp);
-		return NULL;
-	}
-
-	DEBUG("mempool %p area start=%p end=%p size=%zu",
-	      (void *)mp, (void *)start, (void *)end,
-	      (size_t)(end - start));
-	/* Round start and end to page boundary if found in memory segments. */
-	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
-		uintptr_t addr = (uintptr_t)ms[i].addr;
-		size_t len = ms[i].len;
-		/*
-		 * Keep the alignment as wide as the addresses being rounded:
-		 * a hugepage size that does not fit in an unsigned int would
-		 * otherwise be truncated and yield bogus MR bounds.
-		 * NOTE(review): assumes hugepage_sz is a 64-bit field;
-		 * confirm against struct rte_memseg.
-		 */
-		uintptr_t align = ms[i].hugepage_sz;
-
-		if ((start > addr) && (start < addr + len))
-			start = RTE_ALIGN_FLOOR(start, align);
-		if ((end > addr) && (end < addr + len))
-			end = RTE_ALIGN_CEIL(end, align);
-	}
-	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
-	      (void *)mp, (void *)start, (void *)end,
-	      (size_t)(end - start));
-	return ibv_reg_mr(pd,
-			  (void *)start,
-			  end - start,
-			  IBV_ACCESS_LOCAL_WRITE);
-}
-
-/**
- * Return the Memory Pool (MP) holding the data of a mbuf. For an indirect
- * mbuf, this is the pool of the mbuf it was cloned from.
- *
- * @param buf
- *   Pointer to mbuf.
- *
- * @return
- *   Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-txq_mb2mp(struct rte_mbuf *buf)
-{
-	struct rte_mbuf *owner = buf;
-
-	/* Data of an indirect mbuf belongs to the mbuf it is attached to. */
-	if (unlikely(RTE_MBUF_INDIRECT(buf)))
-		owner = rte_mbuf_from_indirect(buf);
-	return owner->pool;
-}
-
-/**
- * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * Entries are stored contiguously from index 0; the first entry whose mp
- * field is NULL marks the end of the used part of the table.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] mp
- *   Memory Pool for which a Memory Region lkey must be returned.
- *
- * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
- */
-static uint32_t
-txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
-	unsigned int i;
-	struct ibv_mr *mr;
-
-	for (i = 0; (i != elemof(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
-	/* Add a new entry, register MR first. */
-	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
-	      (void *)txq, mp->name, (void *)mp);
-	mr = mlx4_mp2mr(txq->priv->pd, mp);
-	if (unlikely(mr == NULL)) {
-		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
-		      (void *)txq);
-		return (uint32_t)-1;
-	}
-	if (unlikely(i == elemof(txq->mp2mr))) {
-		/* Table is full, remove oldest entry. */
-		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
-		      (void *)txq);
-		/* The new entry goes in the last slot, freed by the shift. */
-		--i;
-		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
-		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-	}
-	/* Store the new entry. */
-	txq->mp2mr[i].mp = mp;
-	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
-	/* The "0x" prefix calls for a hexadecimal conversion. */
-	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-	return txq->mp2mr[i].lkey;
-}
-
-/* Verdict shared with txq_mp2mr_mbuf_check(); ret is set to -1 on failure. */
-struct txq_mp2mr_mbuf_check_data {
-	int ret;
-};
-
-/**
- * rte_mempool_obj_iter() callback verifying that a mempool object looks
- * like a mbuf.
- *
- * @param[in] mp
- *   The mempool pointer
- * @param[in] arg
- *   Context data (struct txq_mp2mr_mbuf_check_data). Receives the
- *   verdict; it is only ever written on failure.
- * @param[in] obj
- *   Object address.
- * @param index
- *   Object index, unused.
- */
-static void
-txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
-	uint32_t index __rte_unused)
-{
-	struct txq_mp2mr_mbuf_check_data *verdict = arg;
-	const struct rte_mbuf *candidate = obj;
-
-	/* A mbuf structure must fit in a mempool element. */
-	if (mp->elt_size < sizeof(*candidate)) {
-		verdict->ret = -1;
-		return;
-	}
-	/* A mbuf must point back to the mempool it comes from. */
-	if (candidate->pool != mp)
-		verdict->ret = -1;
-}
-
-/**
- * rte_mempool_walk() iterator pre-registering existing mempools in the
- * MP to MR cache of a TX queue.
- *
- * A mempool is skipped when rte_mempool_obj_iter() reports no object or
- * when txq_mp2mr_mbuf_check() rejected one of the objects it was given.
- *
- * @param[in] mp
- *   Memory Pool to register.
- * @param *arg
- *   Pointer to TX queue structure.
- */
-static void
-txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
-	struct txq_mp2mr_mbuf_check_data check = { .ret = 0 };
-	struct txq *txq = arg;
-
-	/* Nothing was iterated over, leave this mempool alone. */
-	if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &check) == 0)
-		return;
-	/* Contents do not look like mbufs. */
-	if (check.ret == -1)
-		return;
-	txq_mp2mr(txq, mp);
-}
-
-#if MLX4_PMD_SGE_WR_N > 1
-
-/**
- * Flatten a chain of mbuf segments into one linear buffer.
- *
- * @param[out] linear
- *   Linear output buffer.
- * @param[in] buf
- *   Scattered input buffer.
- *
- * @return
- *   Number of bytes copied to the output buffer or 0 if not large enough.
- */
-static unsigned int
-linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
-{
-	unsigned int copied = 0;
-
-	do {
-		const unsigned int seg_len = DATA_LEN(buf);
-		const unsigned int total = copied + seg_len;
-
-		/* Give up as soon as a segment would not fit. */
-		if (unlikely(total > sizeof(*linear)))
-			return 0;
-		memcpy(&(*linear)[copied],
-		       rte_pktmbuf_mtod(buf, uint8_t *),
-		       seg_len);
-		copied = total;
-		buf = NEXT(buf);
-	} while (buf != NULL);
-	return copied;
-}
-
-/**
- * Handle scattered buffers for mlx4_tx_burst().
- *
- * Each of the first segments gets its own SGE. When the packet has more
- * segments than the SGE array can hold, the last SGE is backed by the
- * linear buffer associated with elts_head and receives a copy of all the
- * remaining segments.
- *
- * @param txq
- *   TX queue structure.
- * @param segs
- *   Number of segments in buf.
- * @param elt
- *   TX queue element to fill.
- * @param[in] buf
- *   Buffer to process.
- * @param elts_head
- *   Index of the linear buffer to use if necessary (normally txq->elts_head).
- * @param[out] sges
- *   Array filled with SGEs on success.
- *
- * @return
- *   A structure containing the processed packet size in bytes and the
- *   number of SGEs. Both fields are set to (unsigned int)-1 in case of
- *   failure.
- */
-static struct tx_burst_sg_ret {
-	unsigned int length;
-	unsigned int num;
-}
-tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
-	    struct rte_mbuf *buf, unsigned int elts_head,
-	    struct ibv_sge (*sges)[MLX4_PMD_SGE_WR_N])
-{
-	unsigned int sent_size = 0;
-	unsigned int j;
-	int linearize = 0;
-
-	/* When there are too many segments, extra segments are
-	 * linearized in the last SGE. */
-	if (unlikely(segs > elemof(*sges))) {
-		segs = (elemof(*sges) - 1);
-		linearize = 1;
-	}
-	/* Update element. The head of the chain is recorded before buf is
-	 * advanced by the loop below. */
-	elt->buf = buf;
-	/* Register segments as SGEs. */
-	for (j = 0; (j != segs); ++j) {
-		struct ibv_sge *sge = &(*sges)[j];
-		uint32_t lkey;
-
-		/* Retrieve Memory Region key for this memory pool. */
-		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-		if (unlikely(lkey == (uint32_t)-1)) {
-			/* MR does not exist. */
-			DEBUG("%p: unable to get MP <-> MR association",
-			      (void *)txq);
-			/* Clean up TX element. */
-			elt->buf = NULL;
-			goto stop;
-		}
-		/* Update SGE. */
-		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		/* Packet data is only prefetched when priv->vf is set. */
-		if (txq->priv->vf)
-			rte_prefetch0((volatile void *)
-				      (uintptr_t)sge->addr);
-		sge->length = DATA_LEN(buf);
-		sge->lkey = lkey;
-		sent_size += sge->length;
-		buf = NEXT(buf);
-	}
-	/* If buf is not NULL here and is not going to be linearized,
-	 * nb_segs is not valid. */
-	assert(j == segs);
-	assert((buf == NULL) || (linearize));
-	/* Linearize extra segments. */
-	if (linearize) {
-		struct ibv_sge *sge = &(*sges)[segs];
-		linear_t *linear = &(*txq->elts_linear)[elts_head];
-		unsigned int size = linearize_mbuf(linear, buf);
-
-		assert(segs == (elemof(*sges) - 1));
-		if (size == 0) {
-			/* Invalid packet. */
-			DEBUG("%p: packet too large to be linearized.",
-			      (void *)txq);
-			/* Clean up TX element. */
-			elt->buf = NULL;
-			goto stop;
-		}
-		/* If MLX4_PMD_SGE_WR_N is 1, free mbuf immediately. */
-		/* NOTE(review): this function is only compiled when
-		 * MLX4_PMD_SGE_WR_N > 1, so this branch looks unreachable
-		 * provided elemof(*sges) equals MLX4_PMD_SGE_WR_N; confirm
-		 * before removing it. */
-		if (elemof(*sges) == 1) {
-			do {
-				struct rte_mbuf *next = NEXT(buf);
-
-				rte_pktmbuf_free_seg(buf);
-				buf = next;
-			} while (buf != NULL);
-			elt->buf = NULL;
-		}
-		/* Update SGE. The linear copy is sent using the lkey of
-		 * txq->mr_linear. */
-		sge->addr = (uintptr_t)&(*linear)[0];
-		sge->length = size;
-		sge->lkey = txq->mr_linear->lkey;
-		sent_size += size;
-		/* Include last segment. */
-		segs++;
-	}
-	return (struct tx_burst_sg_ret){
-		.length = sent_size,
-		.num = segs,
-	};
-stop:
-	/* Failure: both fields wrap to (unsigned int)-1. */
-	return (struct tx_burst_sg_ret){
-		.length = -1,
-		.num = -1,
-	};
-}
-
-#endif /* MLX4_PMD_SGE_WR_N > 1 */
-
-/**
- * DPDK callback for TX.
- *
- * Completions are reaped first, then as many packets as the ring can hold
- * are queued (one ring entry always stays free) and the QP doorbell is
- * rung once for the whole burst. Processing stops at the first packet that
- * cannot be queued.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct txq *txq = (struct txq *)dpdk_txq;
-	unsigned int elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp_cd = txq->elts_comp_cd;
-	unsigned int elts_comp = 0;
-	unsigned int i;
-	unsigned int max;
-	int err;
-
-	assert(elts_comp_cd != 0);
-	/* Return value is ignored: a failed poll only delays reclaiming. */
-	txq_complete(txq);
-	/* Number of free ring entries; the subtraction relies on unsigned
-	 * wrap-around when head is behind tail. */
-	max = (elts_n - (elts_head - txq->elts_tail));
-	if (max > elts_n)
-		max -= elts_n;
-	assert(max >= 1);
-	assert(max <= elts_n);
-	/* Always leave one free entry in the ring. */
-	--max;
-	if (max == 0)
-		return 0;
-	if (max > pkts_n)
-		max = pkts_n;
-	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf = pkts[i];
-		unsigned int elts_head_next =
-			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
-		struct txq_elt *elt = &(*txq->elts)[elts_head];
-		unsigned int segs = NB_SEGS(buf);
-#ifdef MLX4_PMD_SOFT_COUNTERS
-		unsigned int sent_size = 0;
-#endif
-		uint32_t send_flags = 0;
-
-		/* Clean up old buffer. */
-		if (likely(elt->buf != NULL)) {
-			struct rte_mbuf *tmp = elt->buf;
-
-#ifndef NDEBUG
-			/* Poisoning. */
-			memset(elt, 0x66, sizeof(*elt));
-#endif
-			/* Faster than rte_pktmbuf_free(). */
-			do {
-				struct rte_mbuf *next = NEXT(tmp);
-
-				rte_pktmbuf_free_seg(tmp);
-				tmp = next;
-			} while (tmp != NULL);
-		}
-		/* Request TX completion. Only the send on which the
-		 * countdown expires is signaled. */
-		if (unlikely(--elts_comp_cd == 0)) {
-			elts_comp_cd = txq->elts_comp_cd_init;
-			++elts_comp;
-			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
-		}
-		/* Enable HW checksum offload when any checksum flag is set. */
-		if (buf->ol_flags &
-		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
-			/* HW does not support checksum offloads at arbitrary
-			 * offsets but automatically recognizes the packet
-			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
-			 * tunnels are currently supported. */
-			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
-				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
-		}
-		if (likely(segs == 1)) {
-			uintptr_t addr;
-			uint32_t length;
-			uint32_t lkey;
-
-			/* Retrieve buffer information. */
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			length = DATA_LEN(buf);
-			/* Retrieve Memory Region key for this memory pool. */
-			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			if (unlikely(lkey == (uint32_t)-1)) {
-				/* MR does not exist. */
-				DEBUG("%p: unable to get MP <-> MR"
-				      " association", (void *)txq);
-				/* Clean up TX element. */
-				elt->buf = NULL;
-				goto stop;
-			}
-			/* Update element. */
-			/* NOTE(review): if the send below fails, elt->buf
-			 * keeps pointing to a packet that is not reported as
-			 * sent; the next burst would then free it from this
-			 * same ring entry. Confirm whether it should be reset
-			 * on that path. */
-			elt->buf = buf;
-			if (txq->priv->vf)
-				rte_prefetch0((volatile void *)
-					      (uintptr_t)addr);
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-			/* Put packet into send queue. */
-#if MLX4_PMD_MAX_INLINE > 0
-			/* Small enough packets are sent inline, without
-			 * using their lkey. */
-			if (length <= txq->max_inline)
-				err = txq->if_qp->send_pending_inline
-					(txq->qp,
-					 (void *)addr,
-					 length,
-					 send_flags);
-			else
-#endif
-				err = txq->if_qp->send_pending
-					(txq->qp,
-					 addr,
-					 length,
-					 lkey,
-					 send_flags);
-			if (unlikely(err))
-				goto stop;
-#ifdef MLX4_PMD_SOFT_COUNTERS
-			sent_size += length;
-#endif
-		} else {
-#if MLX4_PMD_SGE_WR_N > 1
-			struct ibv_sge sges[MLX4_PMD_SGE_WR_N];
-			struct tx_burst_sg_ret ret;
-
-			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
-					  &sges);
-			/* tx_burst_sg() reports failure through a length of
-			 * (unsigned int)-1. */
-			if (ret.length == (unsigned int)-1)
-				goto stop;
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-			/* Put SG list into send queue. */
-			err = txq->if_qp->send_pending_sg_list
-				(txq->qp,
-				 sges,
-				 ret.num,
-				 send_flags);
-			if (unlikely(err))
-				goto stop;
-#ifdef MLX4_PMD_SOFT_COUNTERS
-			sent_size += ret.length;
-#endif
-#else /* MLX4_PMD_SGE_WR_N > 1 */
-			DEBUG("%p: TX scattered buffers support not"
-			      " compiled in", (void *)txq);
-			goto stop;
-#endif /* MLX4_PMD_SGE_WR_N > 1 */
-		}
-		/* Packet is queued, move on to the next ring entry. */
-		elts_head = elts_head_next;
-#ifdef MLX4_PMD_SOFT_COUNTERS
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += sent_size;
-#endif
-	}
-stop:
-	/* Take a shortcut if nothing must be sent. Queue state is left
-	 * untouched in that case. */
-	if (unlikely(i == 0))
-		return 0;
-#ifdef MLX4_PMD_SOFT_COUNTERS
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-#endif
-	/* Ring QP doorbell. */
-	err = txq->if_qp->send_flush(txq->qp);
-	if (unlikely(err)) {
-		/* A nonzero value is not supposed to be returned.
-		 * Nothing can be done about it. */
-		DEBUG("%p: send_flush() failed with error %d",
-		      (void *)txq, err);
-	}
-	/* Publish the local copies of the ring state. */
-	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
-	txq->elts_comp_cd = elts_comp_cd;
-	return i;
-}
-
-/**
- * DPDK callback for TX in secondary processes.
- *
- * Sets up secondary process data through mlx4_secondary_data_setup() if
- * necessary, translates the queue pointer into the queue with the same
- * index in the resulting private structure, then hands the burst over to
- * the TX burst callback currently installed on the device.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-mlx4_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
-			      uint16_t pkts_n)
-{
-	struct txq *txq = dpdk_txq;
-	struct priv *priv = mlx4_secondary_data_setup(txq->priv);
-	struct priv *primary_priv;
-	unsigned int i;
-
-	if (priv == NULL)
-		return 0;
-	primary_priv =
-		mlx4_secondary_data[priv->dev->data->port_id].primary_priv;
-	/* The queue may be known by either private structure. */
-	for (i = 0; i != priv->txqs_n; ++i) {
-		if (((*primary_priv->txqs)[i] != txq) &&
-		    ((*priv->txqs)[i] != txq))
-			continue;
-		/* Always transmit through the local queue of that index. */
-		return priv->dev->tx_pkt_burst((*priv->txqs)[i], pkts,
-					       pkts_n);
-	}
-	/* Unknown queue. */
-	return 0;
-}
-
-/**
- * Configure a TX queue.
- *
- * All resources are first created in a local template; the caller's queue
- * is only cleaned up and overwritten once every step succeeded, so a
- * failure leaves it untouched.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param txq
- *   Pointer to TX queue structure.
- * @param desc
- *   Number of descriptors to configure in queue.
- * @param socket
- *   NUMA socket on which memory must be allocated.
- * @param[in] conf
- *   Thresholds parameters.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
-	  unsigned int socket, const struct rte_eth_txconf *conf)
-{
-	struct priv *priv = mlx4_get_priv(dev);
-	struct txq tmpl = {
-		.priv = priv,
-		.socket = socket
-	};
-	/* Only one of these is needed at any given time. */
-	union {
-		struct ibv_exp_query_intf_params params;
-		struct ibv_exp_qp_init_attr init;
-		struct ibv_exp_res_domain_init_attr rd;
-		struct ibv_exp_cq_init_attr cq;
-		struct ibv_exp_qp_attr mod;
-	} attr;
-	enum ibv_exp_query_intf_status status;
-	int ret = 0;
-
-	(void)conf; /* Thresholds configuration (ignored). */
-	if (priv == NULL)
-		return EINVAL;
-	if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) {
-		ERROR("%p: invalid number of TX descriptors (must be a"
-		      " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N);
-		return EINVAL;
-	}
-	desc /= MLX4_PMD_SGE_WR_N;
-	/* MRs will be registered in mp2mr[] later. */
-	attr.rd = (struct ibv_exp_res_domain_init_attr){
-		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
-			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
-		.thread_model = IBV_EXP_THREAD_SINGLE,
-		.msg_model = IBV_EXP_MSG_HIGH_BW,
-	};
-	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
-	if (tmpl.rd == NULL) {
-		ret = ENOMEM;
-		ERROR("%p: RD creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	attr.cq = (struct ibv_exp_cq_init_attr){
-		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
-		.res_domain = tmpl.rd,
-	};
-	tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
-	if (tmpl.cq == NULL) {
-		ret = ENOMEM;
-		ERROR("%p: CQ creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	DEBUG("priv->device_attr.max_qp_wr is %d",
-	      priv->device_attr.max_qp_wr);
-	DEBUG("priv->device_attr.max_sge is %d",
-	      priv->device_attr.max_sge);
-	attr.init = (struct ibv_exp_qp_init_attr){
-		/* CQ to be associated with the send queue. */
-		.send_cq = tmpl.cq,
-		/* CQ to be associated with the receive queue. */
-		.recv_cq = tmpl.cq,
-		.cap = {
-			/* Max number of outstanding WRs. */
-			.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
-					priv->device_attr.max_qp_wr :
-					desc),
-			/* Max number of scatter/gather elements in a WR. */
-			.max_send_sge = ((priv->device_attr.max_sge <
-					  MLX4_PMD_SGE_WR_N) ?
-					 priv->device_attr.max_sge :
-					 MLX4_PMD_SGE_WR_N),
-#if MLX4_PMD_MAX_INLINE > 0
-			.max_inline_data = MLX4_PMD_MAX_INLINE,
-#endif
-		},
-		.qp_type = IBV_QPT_RAW_PACKET,
-		/* Do *NOT* enable this, completions events are managed per
-		 * TX burst. */
-		.sq_sig_all = 0,
-		.pd = priv->pd,
-		.res_domain = tmpl.rd,
-		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
-			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
-	};
-	tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
-	if (tmpl.qp == NULL) {
-		ret = (errno ? errno : EINVAL);
-		ERROR("%p: QP creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-#if MLX4_PMD_MAX_INLINE > 0
-	/* ibv_create_qp() updates this value. */
-	tmpl.max_inline = attr.init.cap.max_inline_data;
-#endif
-	attr.mod = (struct ibv_exp_qp_attr){
-		/* Move the QP to this state. */
-		.qp_state = IBV_QPS_INIT,
-		/* Primary port number. */
-		.port_num = priv->port
-	};
-	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
-				(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
-	if (ret) {
-		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	ret = txq_alloc_elts(&tmpl, desc);
-	if (ret) {
-		ERROR("%p: TXQ allocation failed: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	attr.mod = (struct ibv_exp_qp_attr){
-		.qp_state = IBV_QPS_RTR
-	};
-	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
-	if (ret) {
-		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	attr.mod.qp_state = IBV_QPS_RTS;
-	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
-	if (ret) {
-		ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	attr.params = (struct ibv_exp_query_intf_params){
-		.intf_scope = IBV_EXP_INTF_GLOBAL,
-		.intf = IBV_EXP_INTF_CQ,
-		.obj = tmpl.cq,
-	};
-	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
-	if (tmpl.if_cq == NULL) {
-		/* ret is still 0 here, the failure must be reported
-		 * explicitly or the caller would see a success. */
-		ret = EINVAL;
-		ERROR("%p: CQ interface family query failed with status %d",
-		      (void *)dev, status);
-		goto error;
-	}
-	attr.params = (struct ibv_exp_query_intf_params){
-		.intf_scope = IBV_EXP_INTF_GLOBAL,
-		.intf = IBV_EXP_INTF_QP_BURST,
-		.obj = tmpl.qp,
-#ifdef HAVE_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK
-		/* MC loopback must be disabled when not using a VF. */
-		.family_flags =
-			(!priv->vf ?
-			 IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK :
-			 0),
-#endif
-	};
-	tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
-	if (tmpl.if_qp == NULL) {
-		/* Same as above, ret does not reflect this failure yet. */
-		ret = EINVAL;
-		ERROR("%p: QP interface family query failed with status %d",
-		      (void *)dev, status);
-		goto error;
-	}
-	/* Clean up txq in case we're reinitializing it. */
-	DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
-	txq_cleanup(txq);
-	*txq = tmpl;
-	DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
-	/* Pre-register known mempools. */
-	rte_mempool_walk(txq_mp2mr_iter, txq);
-	assert(ret == 0);
-	return 0;
-error:
-	/* Release whatever the template acquired so far. */
-	txq_cleanup(&tmpl);
-	assert(ret > 0);
-	return ret;
-}
-
-/**
- * DPDK callback to configure a TX queue.
- *
- * An already allocated queue is reused after being cleaned up, unless the
- * device is started. Otherwise a new queue is allocated on the requested
- * socket. On success, the queue is stored in priv->txqs[] and the device
- * TX burst callback is set to mlx4_tx_burst().
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param idx
- *   TX queue index.
- * @param desc
- *   Number of descriptors to configure in queue.
- * @param socket
- *   NUMA socket on which memory must be allocated.
- * @param[in] conf
- *   Thresholds parameters.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
-		    unsigned int socket, const struct rte_eth_txconf *conf)
-{
-	struct priv *priv = dev->data->dev_private;
-	struct txq *txq;
-	int ret;
-
-	if (mlx4_is_secondary())
-		return -E_RTE_SECONDARY;
-	priv_lock(priv);
-	DEBUG("%p: configuring queue %u for %u descriptors",
-	      (void *)dev, idx, desc);
-	if (idx >= priv->txqs_n) {
-		ERROR("%p: queue index out of range (%u >= %u)",
-		      (void *)dev, idx, priv->txqs_n);
-		priv_unlock(priv);
-		return -EOVERFLOW;
-	}
-	/* Only look the queue up once idx is known to be in range, and
-	 * while holding the lock. */
-	txq = (*priv->txqs)[idx];
-	if (txq != NULL) {
-		DEBUG("%p: reusing already allocated queue index %u (%p)",
-		      (void *)dev, idx, (void *)txq);
-		if (priv->started) {
-			priv_unlock(priv);
-			return -EEXIST;
-		}
-		/* Unlist the queue while it is being reconfigured. */
-		(*priv->txqs)[idx] = NULL;
-		txq_cleanup(txq);
-	} else {
-		txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket);
-		if (txq == NULL) {
-			ERROR("%p: unable to allocate queue index %u",
-			      (void *)dev, idx);
-			priv_unlock(priv);
-			return -ENOMEM;
-		}
-	}
-	ret = txq_setup(dev, txq, desc, socket, conf);
-	if (ret)
-		rte_free(txq);
-	else {
-		txq->stats.idx = idx;
-		DEBUG("%p: adding TX queue %p to list",
-		      (void *)dev, (void *)txq);
-		(*priv->txqs)[idx] = txq;
-		/* Update send callback. */
-		dev->tx_pkt_burst = mlx4_tx_burst;
-	}
-	priv_unlock(priv);
-	/* txq_setup() returns a positive errno value. */
-	return -ret;
-}
-
-/**
- * DPDK callback to release a TX queue.
- *
- * Does nothing in secondary processes or when no queue is provided.
- * Otherwise the queue is removed from priv->txqs[] if listed there, then
- * cleaned up and freed.
- *
- * @param dpdk_txq
- *   Generic TX queue pointer.
- */
-static void
-mlx4_tx_queue_release(void *dpdk_txq)
-{
-	struct txq *txq = dpdk_txq;
-	struct priv *priv;
-	unsigned int slot = 0;
-
-	if (mlx4_is_secondary() || (txq == NULL))
-		return;
-	priv = txq->priv;
-	priv_lock(priv);
-	/* Look for the slot referencing this queue, if any. */
-	while ((slot != priv->txqs_n) && ((*priv->txqs)[slot] != txq))
-		++slot;
-	if (slot != priv->txqs_n) {
-		DEBUG("%p: removing TX queue %p from list",
-		      (void *)priv->dev, (void *)txq);
-		(*priv->txqs)[slot] = NULL;
-	}
-	txq_cleanup(txq);
-	rte_free(txq);
-	priv_unlock(priv);
-}
-
-/* RX queues handling. */
-
-/**
- * Allocate RX queue elements with scattered packets support.
- *
- * Every element holds one work request (WR) whose scatter/gather list
- * references one mbuf per SGE. WRs are chained together through their
- * next pointers, in array order.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param elts_n
- *   Number of elements to allocate.
- * @param[in] pool
- *   If not NULL, fetch buffers from this array instead of allocating them
- *   with rte_pktmbuf_alloc().
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
-		  struct rte_mbuf **pool)
-{
-	unsigned int i;
-	struct rxq_elt_sp (*elts)[elts_n] =
-		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
-				  rxq->socket);
-	int ret = 0;
-
-	if (elts == NULL) {
-		ERROR("%p: can't allocate packets array", (void *)rxq);
-		ret = ENOMEM;
-		goto error;
-	}
-	/* For each WR (packet). */
-	for (i = 0; (i != elts_n); ++i) {
-		unsigned int j;
-		struct rxq_elt_sp *elt = &(*elts)[i];
-		struct ibv_recv_wr *wr = &elt->wr;
-		struct ibv_sge (*sges)[(elemof(elt->sges))] = &elt->sges;
-
-		/* These two arrays must have the same size. */
-		assert(elemof(elt->sges) == elemof(elt->bufs));
-		/* Configure WR. */
-		wr->wr_id = i;
-		/* For the last WR this points past the array; it is reset
-		 * to NULL once the loop is over. */
-		wr->next = &(*elts)[(i + 1)].wr;
-		wr->sg_list = &(*sges)[0];
-		wr->num_sge = elemof(*sges);
-		/* For each SGE (segment). */
-		for (j = 0; (j != elemof(elt->bufs)); ++j) {
-			struct ibv_sge *sge = &(*sges)[j];
-			struct rte_mbuf *buf;
-
-			if (pool != NULL) {
-				buf = *(pool++);
-				assert(buf != NULL);
-				rte_pktmbuf_reset(buf);
-			} else
-				buf = rte_pktmbuf_alloc(rxq->mp);
-			if (buf == NULL) {
-				assert(pool == NULL);
-				ERROR("%p: empty mbuf pool", (void *)rxq);
-				ret = ENOMEM;
-				goto error;
-			}
-			elt->bufs[j] = buf;
-			/* Headroom is reserved by rte_pktmbuf_alloc(). */
-			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
-			/* Buffer is supposed to be empty. */
-			assert(rte_pktmbuf_data_len(buf) == 0);
-			assert(rte_pktmbuf_pkt_len(buf) == 0);
-			/* sge->addr must be able to store a pointer. */
-			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
-			if (j == 0) {
-				/* The first SGE keeps its headroom. */
-				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
-				sge->length = (buf->buf_len -
-					       RTE_PKTMBUF_HEADROOM);
-			} else {
-				/* Subsequent SGEs lose theirs. */
-				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
-				SET_DATA_OFF(buf, 0);
-				sge->addr = (uintptr_t)buf->buf_addr;
-				sge->length = buf->buf_len;
-			}
-			sge->lkey = rxq->mr->lkey;
-			/* Redundant check for tailroom. */
-			assert(sge->length == rte_pktmbuf_tailroom(buf));
-		}
-	}
-	/* The last WR pointer must be NULL. */
-	/* NOTE(review): (i - 1) wraps around if elts_n is 0; presumably
-	 * callers never request an empty queue, confirm. */
-	(*elts)[(i - 1)].wr.next = NULL;
-	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
-	      (void *)rxq, elts_n, (elts_n * elemof((*elts)[0].sges)));
-	rxq->elts_n = elts_n;
-	rxq->elts_head = 0;
-	rxq->elts.sp = elts;
-	assert(ret == 0);
-	return 0;
-error:
-	if (elts != NULL) {
-		assert(pool == NULL);
-		/* Free every buffer stored so far; entries that were never
-		 * filled are expected to still be NULL (presumably zeroed
-		 * by rte_calloc_socket(), confirm). */
-		for (i = 0; (i != elemof(*elts)); ++i) {
-			unsigned int j;
-			struct rxq_elt_sp *elt = &(*elts)[i];
-
-			for (j = 0; (j != elemof(elt->bufs)); ++j) {
-				struct rte_mbuf *buf = elt->bufs[j];
-
-				if (buf != NULL)
-					rte_pktmbuf_free_seg(buf);
-			}
-		}
-		rte_free(elts);
-	}
-	DEBUG("%p: failed, freed everything", (void *)rxq);
-	assert(ret > 0);
-	return ret;
-}
-
-/**
- * Release RX queue elements allocated with scattered packets support.
- *
- * The queue is detached from its elements first, then every mbuf still
- * referenced by an element is freed along with the elements array.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- */
-static void
-rxq_free_elts_sp(struct rxq *rxq)
-{
-	unsigned int elts_n = rxq->elts_n;
-	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
-	unsigned int wr_idx;
-	unsigned int seg_idx;
-
-	DEBUG("%p: freeing WRs", (void *)rxq);
-	rxq->elts_n = 0;
-	rxq->elts.sp = NULL;
-	if (elts == NULL)
-		return;
-	for (wr_idx = 0; wr_idx != elemof(*elts); ++wr_idx) {
-		struct rxq_elt_sp *elt = &(*elts)[wr_idx];
-
-		for (seg_idx = 0; seg_idx != elemof(elt->bufs); ++seg_idx) {
-			struct rte_mbuf *seg = elt->bufs[seg_idx];
-
-			/* Unused slots hold no buffer. */
-			if (seg == NULL)
-				continue;
-			rte_pktmbuf_free_seg(seg);
-		}
-	}
-	rte_free(elts);
-}
-
-/**
- * Allocate RX queue elements.
- *
- * Every element holds one single-segment work request (WR). mbuf pointers
- * are not stored in elements: they are recovered from the SGE address and
- * the offset kept in the WR ID.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param elts_n
- *   Number of elements to allocate.
- * @param[in] pool
- *   If not NULL, fetch buffers from this array instead of allocating them
- *   with rte_pktmbuf_alloc().
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
-{
-	unsigned int i;
-	struct rxq_elt (*elts)[elts_n] =
-		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
-				  rxq->socket);
-	int ret = 0;
-
-	if (elts == NULL) {
-		ERROR("%p: can't allocate packets array", (void *)rxq);
-		ret = ENOMEM;
-		goto error;
-	}
-	/* For each WR (packet). */
-	for (i = 0; (i != elts_n); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct ibv_recv_wr *wr = &elt->wr;
-		struct ibv_sge *sge = &(*elts)[i].sge;
-		struct rte_mbuf *buf;
-
-		if (pool != NULL) {
-			buf = *(pool++);
-			assert(buf != NULL);
-			rte_pktmbuf_reset(buf);
-		} else
-			buf = rte_pktmbuf_alloc(rxq->mp);
-		if (buf == NULL) {
-			assert(pool == NULL);
-			ERROR("%p: empty mbuf pool", (void *)rxq);
-			ret = ENOMEM;
-			goto error;
-		}
-		/* Configure WR. Work request ID contains its own index in
-		 * the elts array and the offset between SGE buffer header and
-		 * its data. */
-		WR_ID(wr->wr_id).id = i;
-		WR_ID(wr->wr_id).offset =
-			(((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) -
-			 (uintptr_t)buf);
-		/* For the last WR this points past the array; it is reset
-		 * to NULL once the loop is over. */
-		wr->next = &(*elts)[(i + 1)].wr;
-		wr->sg_list = sge;
-		wr->num_sge = 1;
-		/* Headroom is reserved by rte_pktmbuf_alloc(). */
-		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
-		/* Buffer is supposed to be empty. */
-		assert(rte_pktmbuf_data_len(buf) == 0);
-		assert(rte_pktmbuf_pkt_len(buf) == 0);
-		/* sge->addr must be able to store a pointer. */
-		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
-		/* SGE keeps its headroom. */
-		sge->addr = (uintptr_t)
-			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
-		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
-		sge->lkey = rxq->mr->lkey;
-		/* Redundant check for tailroom. */
-		assert(sge->length == rte_pktmbuf_tailroom(buf));
-		/* Make sure elts index and SGE mbuf pointer can be deduced
-		 * from WR ID. */
-		if ((WR_ID(wr->wr_id).id != i) ||
-		    ((void *)((uintptr_t)sge->addr -
-			WR_ID(wr->wr_id).offset) != buf)) {
-			ERROR("%p: cannot store index and offset in WR ID",
-			      (void *)rxq);
-			/* A null address tells the cleanup code below that
-			 * this element holds no buffer. */
-			sge->addr = 0;
-			rte_pktmbuf_free(buf);
-			ret = EOVERFLOW;
-			goto error;
-		}
-	}
-	/* The last WR pointer must be NULL. */
-	/* NOTE(review): (i - 1) wraps around if elts_n is 0; presumably
-	 * callers never request an empty queue, confirm. */
-	(*elts)[(i - 1)].wr.next = NULL;
-	DEBUG("%p: allocated and configured %u single-segment WRs",
-	      (void *)rxq, elts_n);
-	rxq->elts_n = elts_n;
-	rxq->elts_head = 0;
-	rxq->elts.no_sp = elts;
-	assert(ret == 0);
-	return 0;
-error:
-	if (elts != NULL) {
-		assert(pool == NULL);
-		for (i = 0; (i != elemof(*elts)); ++i) {
-			struct rxq_elt *elt = &(*elts)[i];
-			struct rte_mbuf *buf;
-
-			/* Element was never given a buffer. */
-			if (elt->sge.addr == 0)
-				continue;
-			assert(WR_ID(elt->wr.wr_id).id == i);
-			/* Recover the mbuf address from the SGE address. */
-			buf = (void *)((uintptr_t)elt->sge.addr -
-				WR_ID(elt->wr.wr_id).offset);
-			rte_pktmbuf_free_seg(buf);
-		}
-		rte_free(elts);
-	}
-	DEBUG("%p: failed, freed everything", (void *)rxq);
-	assert(ret > 0);
-	return ret;
-}
-
-/**
- * Release RX queue elements.
- *
- * The queue is detached from its elements first, then the mbuf behind
- * every element with a non-null SGE address is freed along with the
- * elements array.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- */
-static void
-rxq_free_elts(struct rxq *rxq)
-{
-	unsigned int elts_n = rxq->elts_n;
-	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
-	unsigned int idx;
-
-	DEBUG("%p: freeing WRs", (void *)rxq);
-	rxq->elts_n = 0;
-	rxq->elts.no_sp = NULL;
-	if (elts == NULL)
-		return;
-	for (idx = 0; idx != elemof(*elts); ++idx) {
-		struct rxq_elt *elt = &(*elts)[idx];
-
-		if (elt->sge.addr != 0) {
-			struct rte_mbuf *mbuf;
-
-			assert(WR_ID(elt->wr.wr_id).id == idx);
-			/* The mbuf sits "offset" bytes before its data. */
-			mbuf = (void *)((uintptr_t)elt->sge.addr -
-				WR_ID(elt->wr.wr_id).offset);
-			rte_pktmbuf_free_seg(mbuf);
-		}
-	}
-	rte_free(elts);
-}
-
-/**
- * Destroy the flow steering rule recorded for a MAC/VLAN slot of a queue.
- *
- * The slot must currently hold a rule; it is reset to NULL.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param mac_index
- *   MAC address index.
- * @param vlan_index
- *   VLAN index.
- */
-static void
-rxq_del_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index)
-{
-#ifndef NDEBUG
-	/* Only referenced by the debug trace below. */
-	struct priv *priv = rxq->priv;
-	const uint8_t (*mac)[ETHER_ADDR_LEN] =
-		(const uint8_t (*)[ETHER_ADDR_LEN])
-		priv->mac[mac_index].addr_bytes;
-#endif
-	struct ibv_flow *flow;
-
-	assert(rxq->mac_flow[mac_index][vlan_index] != NULL);
-	DEBUG("%p: removing MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u"
-	      " (VLAN ID %" PRIu16 ")",
-	      (void *)rxq,
-	      (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5],
-	      mac_index, priv->vlan_filter[vlan_index].id);
-	/* Empty the slot first, then release the rule it was holding. */
-	flow = rxq->mac_flow[mac_index][vlan_index];
-	rxq->mac_flow[mac_index][vlan_index] = NULL;
-	claim_zero(ibv_destroy_flow(flow));
-}
-
-/**
- * Remove every flow steering rule a RX queue holds for a MAC address.
- *
- * Does nothing when the MAC index is not marked as configured in the queue.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param mac_index
- *   MAC address index.
- */
-static void
-rxq_mac_addr_del(struct rxq *rxq, unsigned int mac_index)
-{
-	struct priv *priv = rxq->priv;
-	unsigned int removed = 0;
-	unsigned int i;
-
-	assert(mac_index < elemof(priv->mac));
-	if (BITFIELD_ISSET(rxq->mac_configured, mac_index)) {
-		/* One rule per enabled VLAN filter. */
-		for (i = 0; i < elemof(priv->vlan_filter); ++i) {
-			if (priv->vlan_filter[i].enabled) {
-				rxq_del_flow(rxq, mac_index, i);
-				++removed;
-			}
-		}
-		/* No VLAN filter enabled: the rule is kept in slot 0. */
-		if (removed == 0)
-			rxq_del_flow(rxq, mac_index, 0);
-		BITFIELD_RESET(rxq->mac_configured, mac_index);
-	}
-}
-
-/**
- * Walk all MAC address slots and unregister each of them from a RX queue.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- */
-static void
-rxq_mac_addrs_del(struct rxq *rxq)
-{
-	const unsigned int slots = elemof(rxq->priv->mac);
-	unsigned int mac_index = 0;
-
-	while (mac_index < slots) {
-		rxq_mac_addr_del(rxq, mac_index);
-		++mac_index;
-	}
-}
-
-static int rxq_promiscuous_enable(struct rxq *);
-static void rxq_promiscuous_disable(struct rxq *);
-
-/**
- * Create one flow steering rule matching a MAC address and, optionally, a
- * VLAN ID, then record it in the queue.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param mac_index
- *   MAC address index to register.
- * @param vlan_index
- *   VLAN index. Use -1 for a flow without VLAN; such a flow is recorded in
- *   VLAN slot 0.
- *
- * @return
- *   0 on success, errno value on failure (EINVAL if errno was left unset).
- */
-static int
-rxq_add_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index)
-{
-	struct priv *priv = rxq->priv;
-	const uint8_t (*mac)[ETHER_ADDR_LEN] =
-		(const uint8_t (*)[ETHER_ADDR_LEN])
-		priv->mac[mac_index].addr_bytes;
-	/* Attribute and its single specification, kept on the stack. */
-	struct __attribute__((packed)) {
-		struct ibv_flow_attr attr;
-		struct ibv_flow_spec_eth spec;
-	} data;
-	struct ibv_flow_attr *attr = &data.attr;
-	struct ibv_flow_spec_eth *spec = &data.spec;
-	struct ibv_flow *flow;
-	unsigned int slot;
-
-	assert(mac_index < elemof(priv->mac));
-	assert((vlan_index < elemof(priv->vlan_filter)) || (vlan_index == -1u));
-	/*
-	 * libibverbs expects the specification to directly follow the
-	 * attribute, without any padding in between.
-	 */
-	assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec);
-	*attr = (struct ibv_flow_attr){
-		.type = IBV_FLOW_ATTR_NORMAL,
-		.priority = 3,
-		.num_of_specs = 1,
-		.port = priv->port,
-		.flags = 0
-	};
-	/* Start from a zeroed specification and fill in what is matched. */
-	*spec = (struct ibv_flow_spec_eth){
-		.type = IBV_FLOW_SPEC_ETH,
-		.size = sizeof(*spec),
-	};
-	memcpy(spec->val.dst_mac, *mac, sizeof(spec->val.dst_mac));
-	memset(spec->mask.dst_mac, 0xff, sizeof(spec->mask.dst_mac));
-	if (vlan_index != -1u) {
-		spec->val.vlan_tag = htons(priv->vlan_filter[vlan_index].id);
-		spec->mask.vlan_tag = htons(0xfff);
-	}
-	DEBUG("%p: adding MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u"
-	      " (VLAN %s %" PRIu16 ")",
-	      (void *)rxq,
-	      (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5],
-	      mac_index,
-	      ((vlan_index != -1u) ? "ID" : "index"),
-	      ((vlan_index != -1u) ? priv->vlan_filter[vlan_index].id : -1u));
-	/* Cleared beforehand so a stale errno value is never reported. */
-	errno = 0;
-	flow = ibv_create_flow(rxq->qp, attr);
-	if (flow == NULL) {
-		/* It's not clear whether errno is always set in this case. */
-		ERROR("%p: flow configuration failed, errno=%d: %s",
-		      (void *)rxq, errno,
-		      (errno ? strerror(errno) : "Unknown error"));
-		return (errno ? errno : EINVAL);
-	}
-	/* Flows created without VLAN are recorded in slot 0. */
-	slot = ((vlan_index == -1u) ? 0 : vlan_index);
-	assert(rxq->mac_flow[mac_index][slot] == NULL);
-	rxq->mac_flow[mac_index][slot] = flow;
-	return 0;
-}
-
-/**
- * Register a MAC address in a RX queue.
- *
- * One flow steering rule is created per enabled VLAN filter, or a single
- * rule without VLAN when no filter is enabled. Rules created before a
- * failure are removed again.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param mac_index
- *   MAC address index to register.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-rxq_mac_addr_add(struct rxq *rxq, unsigned int mac_index)
-{
-	struct priv *priv = rxq->priv;
-	unsigned int created = 0;
-	unsigned int i;
-	int ret;
-
-	assert(mac_index < elemof(priv->mac));
-	/* Drop existing rules first if this index is already registered. */
-	if (BITFIELD_ISSET(rxq->mac_configured, mac_index))
-		rxq_mac_addr_del(rxq, mac_index);
-	for (i = 0; i < elemof(priv->vlan_filter); ++i) {
-		if (!priv->vlan_filter[i].enabled)
-			continue;
-		ret = rxq_add_flow(rxq, mac_index, i);
-		if (ret)
-			goto rollback;
-		++created;
-	}
-	/* No VLAN filter enabled: use a single rule without VLAN. */
-	if (created == 0) {
-		ret = rxq_add_flow(rxq, mac_index, -1);
-		if (ret)
-			return ret;
-	}
-	BITFIELD_SET(rxq->mac_configured, mac_index);
-	return 0;
-rollback:
-	/* Remove rules created for the lower, enabled VLAN indexes. */
-	while (i != 0) {
-		--i;
-		if (priv->vlan_filter[i].enabled)
-			rxq_del_flow(rxq, mac_index, i);
-	}
-	assert(ret > 0);
-	return ret;
-}
-
-/**
- * Register in a RX queue every MAC address configured at device level.
- *
- * On failure, lower MAC indexes are unregistered from the queue again.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-rxq_mac_addrs_add(struct rxq *rxq)
-{
-	struct priv *priv = rxq->priv;
-	unsigned int i;
-	int ret;
-
-	for (i = 0; i < elemof(priv->mac); ++i) {
-		/* Only addresses configured at device level are mirrored. */
-		if (BITFIELD_ISSET(priv->mac_configured, i)) {
-			ret = rxq_mac_addr_add(rxq, i);
-			if (ret)
-				goto rollback;
-		}
-	}
-	return 0;
-rollback:
-	while (i != 0) {
-		--i;
-		rxq_mac_addr_del(rxq, i);
-	}
-	assert(ret > 0);
-	return ret;
-}
-
-/**
- * Unregister a MAC address.
- *
- * In RSS mode, the MAC address is unregistered from the parent queue,
- * otherwise it is unregistered from each queue directly. Queue slots that
- * are NULL are skipped, as priv_mac_addr_add() does when registering.
- *
- * Does nothing when the MAC index is not marked as configured.
- *
- * @param priv
- *   Pointer to private structure.
- * @param mac_index
- *   MAC address index.
- */
-static void
-priv_mac_addr_del(struct priv *priv, unsigned int mac_index)
-{
-	unsigned int i;
-
-	assert(!priv->isolated);
-	assert(mac_index < elemof(priv->mac));
-	if (!BITFIELD_ISSET(priv->mac_configured, mac_index))
-		return;
-	if (priv->rss) {
-		rxq_mac_addr_del(LIST_FIRST(&priv->parents), mac_index);
-		goto end;
-	}
-	for (i = 0; (i != priv->dev->data->nb_rx_queues); ++i) {
-		/*
-		 * rxq_mac_addr_del() dereferences the queue pointer, so
-		 * empty slots must not be handed to it.
-		 */
-		if ((*priv->rxqs)[i] == NULL)
-			continue;
-		rxq_mac_addr_del((*priv->rxqs)[i], mac_index);
-	}
-end:
-	BITFIELD_RESET(priv->mac_configured, mac_index);
-}
-
-/**
- * Register a MAC address.
- *
- * The address is stored at the requested index after making sure no other
- * configured index already holds it. When the device is started, it is then
- * registered in the parent queue (RSS mode) or in each existing queue.
- *
- * @param priv
- *   Pointer to private structure.
- * @param mac_index
- *   MAC address index to use.
- * @param mac
- *   MAC address to register.
- *
- * @return
- *   0 on success, errno value on failure (EADDRINUSE when the address is
- *   already configured at another index).
- */
-static int
-priv_mac_addr_add(struct priv *priv, unsigned int mac_index,
-		  const uint8_t (*mac)[ETHER_ADDR_LEN])
-{
-	unsigned int i;
-	int ret;
-
-	assert(mac_index < elemof(priv->mac));
-	/* The target index is ignored, it is about to be overwritten. */
-	for (i = 0; i < elemof(priv->mac); ++i) {
-		if ((i != mac_index) &&
-		    BITFIELD_ISSET(priv->mac_configured, i) &&
-		    !memcmp(priv->mac[i].addr_bytes, *mac, sizeof(*mac)))
-			return EADDRINUSE;
-	}
-	if (BITFIELD_ISSET(priv->mac_configured, mac_index))
-		priv_mac_addr_del(priv, mac_index);
-	priv->mac[mac_index] = (struct ether_addr){
-		{
-			(*mac)[0], (*mac)[1], (*mac)[2],
-			(*mac)[3], (*mac)[4], (*mac)[5]
-		}
-	};
-	if (!priv->started) {
-		/* Storing the address is all that is needed for now. */
-#ifndef NDEBUG
-		/* Verify that all queues have this index disabled. */
-		for (i = 0; (i != priv->rxqs_n); ++i) {
-			if ((*priv->rxqs)[i] == NULL)
-				continue;
-			assert(!BITFIELD_ISSET
-			       ((*priv->rxqs)[i]->mac_configured, mac_index));
-		}
-#endif
-	} else if (priv->rss) {
-		/* RSS mode: only the parent queue carries the rules. */
-		ret = rxq_mac_addr_add(LIST_FIRST(&priv->parents), mac_index);
-		if (ret)
-			return ret;
-	} else {
-		for (i = 0; i != priv->rxqs_n; ++i) {
-			struct rxq *rxq = (*priv->rxqs)[i];
-
-			if (rxq == NULL)
-				continue;
-			ret = rxq_mac_addr_add(rxq, mac_index);
-			if (!ret)
-				continue;
-			/* Failure, undo the queues handled so far. */
-			while (i != 0) {
-				rxq = (*priv->rxqs)[--i];
-				if (rxq != NULL)
-					rxq_mac_addr_del(rxq, mac_index);
-			}
-			return ret;
-		}
-	}
-	BITFIELD_SET(priv->mac_configured, mac_index);
-	return 0;
-}
-
-/**
- * Enable allmulti mode in a RX queue.
- *
- * Creates a IBV_FLOW_ATTR_MC_DEFAULT flow on the queue's QP and records it.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- *
- * @return
- *   0 on success, errno value on failure (EBUSY when already enabled,
- *   EINVAL if flow creation fails without setting errno).
- */
-static int
-rxq_allmulticast_enable(struct rxq *rxq)
-{
-	struct ibv_flow_attr attr = {
-		.type = IBV_FLOW_ATTR_MC_DEFAULT,
-		.num_of_specs = 0,
-		.port = rxq->priv->port,
-		.flags = 0
-	};
-	struct ibv_flow *flow;
-
-	DEBUG("%p: enabling allmulticast mode", (void *)rxq);
-	if (rxq->allmulti_flow != NULL)
-		return EBUSY;
-	errno = 0;
-	flow = ibv_create_flow(rxq->qp, &attr);
-	if (flow != NULL) {
-		rxq->allmulti_flow = flow;
-		DEBUG("%p: allmulticast mode enabled", (void *)rxq);
-		return 0;
-	}
-	/* It's not clear whether errno is always set in this case. */
-	ERROR("%p: flow configuration failed, errno=%d: %s",
-	      (void *)rxq, errno,
-	      (errno ? strerror(errno) : "Unknown error"));
-	return (errno ? errno : EINVAL);
-}
-
-/**
- * Disable allmulti mode in a RX queue.
- *
- * Destroys the recorded allmulti flow, if any, and clears it.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- */
-static void
-rxq_allmulticast_disable(struct rxq *rxq)
-{
-	DEBUG("%p: disabling allmulticast mode", (void *)rxq);
-	if (rxq->allmulti_flow != NULL) {
-		claim_zero(ibv_destroy_flow(rxq->allmulti_flow));
-		rxq->allmulti_flow = NULL;
-		DEBUG("%p: allmulticast mode disabled", (void *)rxq);
-	}
-}
-
-/**
- * Enable promiscuous mode in a RX queue.
- *
- * Creates a IBV_FLOW_ATTR_ALL_DEFAULT flow on the queue's QP and records
- * it. Returns success without doing anything when priv->vf is set.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- *
- * @return
- *   0 on success, errno value on failure (EBUSY when already enabled,
- *   EINVAL if flow creation fails without setting errno).
- */
-static int
-rxq_promiscuous_enable(struct rxq *rxq)
-{
-	struct ibv_flow_attr attr = {
-		.type = IBV_FLOW_ATTR_ALL_DEFAULT,
-		.num_of_specs = 0,
-		.port = rxq->priv->port,
-		.flags = 0
-	};
-	struct ibv_flow *flow;
-
-	if (rxq->priv->vf)
-		return 0;
-	DEBUG("%p: enabling promiscuous mode", (void *)rxq);
-	if (rxq->promisc_flow != NULL)
-		return EBUSY;
-	errno = 0;
-	flow = ibv_create_flow(rxq->qp, &attr);
-	if (flow != NULL) {
-		rxq->promisc_flow = flow;
-		DEBUG("%p: promiscuous mode enabled", (void *)rxq);
-		return 0;
-	}
-	/* It's not clear whether errno is always set in this case. */
-	ERROR("%p: flow configuration failed, errno=%d: %s",
-	      (void *)rxq, errno,
-	      (errno ? strerror(errno) : "Unknown error"));
-	return (errno ? errno : EINVAL);
-}
-
-/**
- * Disable promiscuous mode in a RX queue.
- *
- * Destroys the recorded promiscuous flow, if any, and clears it. Does
- * nothing when priv->vf is set.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- */
-static void
-rxq_promiscuous_disable(struct rxq *rxq)
-{
-	if (rxq->priv->vf)
-		return;
-	DEBUG("%p: disabling promiscuous mode", (void *)rxq);
-	if (rxq->promisc_flow != NULL) {
-		claim_zero(ibv_destroy_flow(rxq->promisc_flow));
-		rxq->promisc_flow = NULL;
-		DEBUG("%p: promiscuous mode disabled", (void *)rxq);
-	}
-}
-
-/**
- * Clean up a RX queue.
- *
- * Frees queue elements, releases the QP and CQ interfaces, removes attached
- * flows (unless the device is in isolated mode), destroys the QP, CQ,
- * completion channel and resource domain, deregisters the memory region and
- * finally zeroes the structure so it can be reused.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- */
-static void
-rxq_cleanup(struct rxq *rxq)
-{
-	struct ibv_exp_release_intf_params params;
-
-	DEBUG("cleaning up %p", (void *)rxq);
-	if (!rxq->sp)
-		rxq_free_elts(rxq);
-	else
-		rxq_free_elts_sp(rxq);
-	/* Interfaces go first, before the objects they were queried from. */
-	if (rxq->if_qp != NULL) {
-		assert(rxq->priv != NULL);
-		assert(rxq->priv->ctx != NULL);
-		assert(rxq->qp != NULL);
-		params = (struct ibv_exp_release_intf_params){
-			.comp_mask = 0,
-		};
-		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
-						rxq->if_qp,
-						&params));
-	}
-	if (rxq->if_cq != NULL) {
-		assert(rxq->priv != NULL);
-		assert(rxq->priv->ctx != NULL);
-		assert(rxq->cq != NULL);
-		params = (struct ibv_exp_release_intf_params){
-			.comp_mask = 0,
-		};
-		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
-						rxq->if_cq,
-						&params));
-	}
-	if (rxq->qp != NULL) {
-		/* Flows attached to the QP are removed before the QP. */
-		if (!rxq->priv->isolated) {
-			rxq_promiscuous_disable(rxq);
-			rxq_allmulticast_disable(rxq);
-			rxq_mac_addrs_del(rxq);
-		}
-		claim_zero(ibv_destroy_qp(rxq->qp));
-	}
-	if (rxq->cq != NULL)
-		claim_zero(ibv_destroy_cq(rxq->cq));
-	if (rxq->channel != NULL)
-		claim_zero(ibv_destroy_comp_channel(rxq->channel));
-	if (rxq->rd != NULL) {
-		struct ibv_exp_destroy_res_domain_attr attr = {
-			.comp_mask = 0,
-		};
-
-		assert(rxq->priv != NULL);
-		assert(rxq->priv->ctx != NULL);
-		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
-						      rxq->rd,
-						      &attr));
-	}
-	if (rxq->mr != NULL)
-		claim_zero(ibv_dereg_mr(rxq->mr));
-	/* Leave a blank structure behind. */
-	memset(rxq, 0, sizeof(*rxq));
-}
-
-/**
- * Translate RX completion flags to packet type.
- *
- * For tunnel packets, outer IPv4/IPv6 flags map to the L3 packet types and
- * the plain IPv4/IPv6 flags map to the inner L3 packet types. Otherwise the
- * plain IPv4/IPv6 flags map to the L3 packet types.
- *
- * @param flags
- *   RX completion flags returned by poll_length_flags().
- *
- * @note: fix mlx4_dev_supported_ptypes_get() if any change here.
- *
- * @return
- *   Packet type for struct rte_mbuf.
- */
-static inline uint32_t
-rxq_cq_to_pkt_type(uint32_t flags)
-{
-	uint32_t pkt_type;
-
-	if (!(flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)) {
-		pkt_type = TRANSPOSE(flags,
-				     IBV_EXP_CQ_RX_IPV4_PACKET,
-				     RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
-		pkt_type |= TRANSPOSE(flags,
-				      IBV_EXP_CQ_RX_IPV6_PACKET,
-				      RTE_PTYPE_L3_IPV6_EXT_UNKNOWN);
-		return pkt_type;
-	}
-	/* Tunnel packet: report both outer and inner L3 types. */
-	pkt_type = TRANSPOSE(flags,
-			     IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
-			     RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
-	pkt_type |= TRANSPOSE(flags,
-			      IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
-			      RTE_PTYPE_L3_IPV6_EXT_UNKNOWN);
-	pkt_type |= TRANSPOSE(flags,
-			      IBV_EXP_CQ_RX_IPV4_PACKET,
-			      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN);
-	pkt_type |= TRANSPOSE(flags,
-			      IBV_EXP_CQ_RX_IPV6_PACKET,
-			      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
-	return pkt_type;
-}
-
-/**
- * Translate RX completion flags to offload flags.
- *
- * Checksum results are only reported when the matching offload is enabled
- * on the queue (csum, and csum_l2tun for tunnel packets).
- *
- * @param[in] rxq
- *   Pointer to RX queue structure.
- * @param flags
- *   RX completion flags returned by poll_length_flags().
- *
- * @return
- *   Offload flags (ol_flags) for struct rte_mbuf.
- */
-static inline uint32_t
-rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
-{
-	uint32_t ol_flags = 0;
-
-	if (rxq->csum) {
-		ol_flags |= TRANSPOSE(flags,
-				      IBV_EXP_CQ_RX_IP_CSUM_OK,
-				      PKT_RX_IP_CKSUM_GOOD);
-		ol_flags |= TRANSPOSE(flags,
-				      IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
-				      PKT_RX_L4_CKSUM_GOOD);
-	}
-	/* Outer checksums only matter for tunnel packets. */
-	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) {
-		ol_flags |= TRANSPOSE(flags,
-				      IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
-				      PKT_RX_IP_CKSUM_GOOD);
-		ol_flags |= TRANSPOSE(flags,
-				      IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
-				      PKT_RX_L4_CKSUM_GOOD);
-	}
-	return ol_flags;
-}
-
-static uint16_t
-mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
-
-/**
- * DPDK callback for RX with scattered packets support.
- *
- * Polls the completion queue for up to pkts_n packets. For each completed
- * WR, every spent segment is swapped for a newly allocated mbuf and the
- * spent segments are chained into the packet handed back to the caller.
- * WRs consumed during the call are linked together and reposted with a
- * single ibv_post_recv(); failure to repost aborts the process.
- *
- * Queues that are not in scattered mode are handed over to mlx4_rx_burst().
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_recv_wr head;
-	struct ibv_recv_wr **next = &head.next;
-	struct ibv_recv_wr *bad_wr;
-	unsigned int i;
-	unsigned int pkts_ret = 0;
-	int ret;
-
-	if (unlikely(!rxq->sp))
-		return mlx4_rx_burst(dpdk_rxq, pkts, pkts_n);
-	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
-		return 0;
-	for (i = 0; (i != pkts_n); ++i) {
-		struct rxq_elt_sp *elt = &(*elts)[elts_head];
-		struct ibv_recv_wr *wr = &elt->wr;
-		uint64_t wr_id = wr->wr_id;
-		unsigned int len;
-		unsigned int pkt_buf_len;
-		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
-		struct rte_mbuf **pkt_buf_next = &pkt_buf;
-		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
-		unsigned int j = 0;
-		uint32_t flags;
-
-		/* Sanity checks. */
-#ifdef NDEBUG
-		(void)wr_id;
-#endif
-		assert(wr_id < rxq->elts_n);
-		assert(wr->sg_list == elt->sges);
-		assert(wr->num_sge == elemof(elt->sges));
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
-						    &flags);
-		if (unlikely(ret < 0)) {
-			struct ibv_wc wc;
-			int wcs_n;
-
-			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
-			      (void *)rxq, ret);
-			/* ibv_poll_cq() must be used in case of failure. */
-			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
-			if (unlikely(wcs_n == 0))
-				break;
-			if (unlikely(wcs_n < 0)) {
-				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
-				      (void *)rxq, wcs_n);
-				break;
-			}
-			assert(wcs_n == 1);
-			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
-				/* Whatever, just repost the offending WR. */
-				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
-				      " completion status (%d): %s",
-				      (void *)rxq, wc.wr_id, wc.status,
-				      ibv_wc_status_str(wc.status));
-#ifdef MLX4_PMD_SOFT_COUNTERS
-				/* Increment dropped packets counter. */
-				++rxq->stats.idropped;
-#endif
-				/* Link completed WRs together for repost. */
-				*next = wr;
-				next = &wr->next;
-				goto repost;
-			}
-			ret = wc.byte_len;
-		}
-		if (ret == 0)
-			break;
-		len = ret;
-		pkt_buf_len = len;
-		/* Link completed WRs together for repost. */
-		*next = wr;
-		next = &wr->next;
-		/*
-		 * Replace spent segments with new ones, concatenate and
-		 * return them as pkt_buf.
-		 */
-		while (1) {
-			struct ibv_sge *sge = &elt->sges[j];
-			struct rte_mbuf *seg = elt->bufs[j];
-			struct rte_mbuf *rep;
-			unsigned int seg_tailroom;
-
-			/*
-			 * Fetch initial bytes of packet descriptor into a
-			 * cacheline while allocating rep.
-			 */
-			rte_prefetch0(seg);
-			rep = rte_mbuf_raw_alloc(rxq->mp);
-			if (unlikely(rep == NULL)) {
-				/*
-				 * Unable to allocate a replacement mbuf,
-				 * repost WR. Segments already chained for
-				 * this packet are freed below; their slots
-				 * in elt->bufs[] already hold replacements.
-				 */
-				DEBUG("rxq=%p, wr_id=%" PRIu64 ":"
-				      " can't allocate a new mbuf",
-				      (void *)rxq, wr_id);
-				if (pkt_buf != NULL) {
-					*pkt_buf_next = NULL;
-					rte_pktmbuf_free(pkt_buf);
-				}
-				/* Increase out of memory counters. */
-				++rxq->stats.rx_nombuf;
-				++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-				goto repost;
-			}
-#ifndef NDEBUG
-			/* Poison user-modifiable fields in rep. */
-			NEXT(rep) = (void *)((uintptr_t)-1);
-			SET_DATA_OFF(rep, 0xdead);
-			DATA_LEN(rep) = 0xd00d;
-			PKT_LEN(rep) = 0xdeadd00d;
-			NB_SEGS(rep) = 0x2a;
-			PORT(rep) = 0x2a;
-			rep->ol_flags = -1;
-			/*
-			 * Clear special flags in mbuf to avoid
-			 * crashing while freeing.
-			 */
-			rep->ol_flags &=
-				~(uint64_t)(IND_ATTACHED_MBUF |
-					    CTRL_MBUF_FLAG);
-#endif
-			assert(rep->buf_len == seg->buf_len);
-			/* Reconfigure sge to use rep instead of seg. */
-			assert(sge->lkey == rxq->mr->lkey);
-			sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
-			elt->bufs[j] = rep;
-			++j;
-			/* Update pkt_buf if it's the first segment, or link
-			 * seg to the previous one and update pkt_buf_next. */
-			*pkt_buf_next = seg;
-			pkt_buf_next = &NEXT(seg);
-			/* Update seg information. */
-			seg_tailroom = (seg->buf_len - seg_headroom);
-			assert(sge->length == seg_tailroom);
-			SET_DATA_OFF(seg, seg_headroom);
-			if (likely(len <= seg_tailroom)) {
-				/* Last segment. */
-				DATA_LEN(seg) = len;
-				PKT_LEN(seg) = len;
-				/* Sanity check. */
-				assert(rte_pktmbuf_headroom(seg) ==
-				       seg_headroom);
-				assert(rte_pktmbuf_tailroom(seg) ==
-				       (seg_tailroom - len));
-				break;
-			}
-			DATA_LEN(seg) = seg_tailroom;
-			PKT_LEN(seg) = seg_tailroom;
-			/* Sanity check. */
-			assert(rte_pktmbuf_headroom(seg) == seg_headroom);
-			assert(rte_pktmbuf_tailroom(seg) == 0);
-			/* Fix len and clear headroom for next segments. */
-			len -= seg_tailroom;
-			seg_headroom = 0;
-		}
-		/* Update head and tail segments. */
-		*pkt_buf_next = NULL;
-		assert(pkt_buf != NULL);
-		assert(j != 0);
-		NB_SEGS(pkt_buf) = j;
-		PORT(pkt_buf) = rxq->port_id;
-		PKT_LEN(pkt_buf) = pkt_buf_len;
-		pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
-		pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-
-		/* Return packet. */
-		*(pkts++) = pkt_buf;
-		++pkts_ret;
-#ifdef MLX4_PMD_SOFT_COUNTERS
-		/* Increase bytes counter. */
-		rxq->stats.ibytes += pkt_buf_len;
-#endif
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
-	}
-	if (unlikely(i == 0))
-		return 0;
-	*next = NULL;
-	/* Repost WRs. */
-#ifdef DEBUG_RECV
-	DEBUG("%p: reposting %d WRs", (void *)rxq, i);
-#endif
-	ret = ibv_post_recv(rxq->qp, head.next, &bad_wr);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: ibv_post_recv(): failed for WR %p: %s",
-		      (void *)rxq->priv,
-		      (void *)bad_wr,
-		      strerror(ret));
-		abort();
-	}
-	rxq->elts_head = elts_head;
-#ifdef MLX4_PMD_SOFT_COUNTERS
-	/* Increase packets counter. */
-	rxq->stats.ipackets += pkts_ret;
-#endif
-	return pkts_ret;
-}
-
-/**
- * DPDK callback for RX.
- *
- * The following function is the same as mlx4_rx_burst_sp(), except it doesn't
- * manage scattered packets. Improves performance when MRU is lower than the
- * size of the first segment.
- *
- * Each received mbuf is recovered from its SGE address and the offset kept
- * in the WR ID, then replaced in the ring by a newly allocated mbuf. SGEs of
- * all processed elements are gathered in a stack array of pkts_n entries and
- * reposted through recv_burst(); failure to repost aborts the process.
- * Queues in scattered mode are handed over to mlx4_rx_burst_sp().
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_sge sges[pkts_n];
-	unsigned int i;
-	unsigned int pkts_ret = 0;
-	int ret;
-
-	if (unlikely(rxq->sp))
-		return mlx4_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
-	for (i = 0; (i != pkts_n); ++i) {
-		struct rxq_elt *elt = &(*elts)[elts_head];
-		struct ibv_recv_wr *wr = &elt->wr;
-		uint64_t wr_id = wr->wr_id;
-		unsigned int len;
-		struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
-			WR_ID(wr_id).offset);
-		struct rte_mbuf *rep;
-		uint32_t flags;
-
-		/* Sanity checks. */
-		assert(WR_ID(wr_id).id < rxq->elts_n);
-		assert(wr->sg_list == &elt->sge);
-		assert(wr->num_sge == 1);
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		/*
-		 * Fetch initial bytes of packet descriptor into a
-		 * cacheline while allocating rep.
-		 */
-		rte_mbuf_prefetch_part1(seg);
-		rte_mbuf_prefetch_part2(seg);
-		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
-						    &flags);
-		if (unlikely(ret < 0)) {
-			struct ibv_wc wc;
-			int wcs_n;
-
-			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
-			      (void *)rxq, ret);
-			/* ibv_poll_cq() must be used in case of failure. */
-			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
-			if (unlikely(wcs_n == 0))
-				break;
-			if (unlikely(wcs_n < 0)) {
-				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
-				      (void *)rxq, wcs_n);
-				break;
-			}
-			assert(wcs_n == 1);
-			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
-				/* Whatever, just repost the offending WR. */
-				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
-				      " completion status (%d): %s",
-				      (void *)rxq, wc.wr_id, wc.status,
-				      ibv_wc_status_str(wc.status));
-#ifdef MLX4_PMD_SOFT_COUNTERS
-				/* Increment dropped packets counter. */
-				++rxq->stats.idropped;
-#endif
-				/* Add SGE to array for repost. */
-				sges[i] = elt->sge;
-				goto repost;
-			}
-			ret = wc.byte_len;
-		}
-		if (ret == 0)
-			break;
-		len = ret;
-		rep = rte_mbuf_raw_alloc(rxq->mp);
-		if (unlikely(rep == NULL)) {
-			/*
-			 * Unable to allocate a replacement mbuf,
-			 * repost WR. The element keeps its current mbuf
-			 * and no packet is returned for this completion.
-			 */
-			DEBUG("rxq=%p, wr_id=%" PRIu32 ":"
-			      " can't allocate a new mbuf",
-			      (void *)rxq, WR_ID(wr_id).id);
-			/* Increase out of memory counters. */
-			++rxq->stats.rx_nombuf;
-			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-			/* Add SGE to array for repost. */
-			sges[i] = elt->sge;
-			goto repost;
-		}
-
-		/* Reconfigure sge to use rep instead of seg, and refresh the
-		 * offset kept in WR ID so that rep can later be recovered
-		 * from the SGE address. */
-		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-		assert(elt->sge.lkey == rxq->mr->lkey);
-		WR_ID(wr->wr_id).offset =
-			(((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) -
-			 (uintptr_t)rep);
-		assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);
-
-		/* Add SGE to array for repost. */
-		sges[i] = elt->sge;
-
-		/* Update seg information. */
-		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
-		NB_SEGS(seg) = 1;
-		PORT(seg) = rxq->port_id;
-		NEXT(seg) = NULL;
-		PKT_LEN(seg) = len;
-		DATA_LEN(seg) = len;
-		seg->packet_type = rxq_cq_to_pkt_type(flags);
-		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-
-		/* Return packet. */
-		*(pkts++) = seg;
-		++pkts_ret;
-#ifdef MLX4_PMD_SOFT_COUNTERS
-		/* Increase bytes counter. */
-		rxq->stats.ibytes += len;
-#endif
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
-	}
-	if (unlikely(i == 0))
-		return 0;
-	/* Repost WRs. */
-#ifdef DEBUG_RECV
-	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
-#endif
-	ret = rxq->if_qp->recv_burst(rxq->qp, sges, i);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: recv_burst(): failed (ret=%d)",
-		      (void *)rxq->priv,
-		      ret);
-		abort();
-	}
-	rxq->elts_head = elts_head;
-#ifdef MLX4_PMD_SOFT_COUNTERS
-	/* Increase packets counter. */
-	rxq->stats.ipackets += pkts_ret;
-#endif
-	return pkts_ret;
-}
-
-/**
- * DPDK callback for RX in secondary processes.
- *
- * Runs mlx4_secondary_data_setup() on the queue's private structure, maps
- * the given queue to the queue of same index in the resulting private
- * structure, then forwards the call to the device's RX burst callback.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n), 0 when setup fails
- *   or the queue cannot be found.
- */
-static uint16_t
-mlx4_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
-			      uint16_t pkts_n)
-{
-	struct rxq *rxq = dpdk_rxq;
-	struct priv *priv = mlx4_secondary_data_setup(rxq->priv);
-	struct priv *primary_priv;
-	unsigned int index;
-
-	if (priv == NULL)
-		return 0;
-	primary_priv =
-		mlx4_secondary_data[priv->dev->data->port_id].primary_priv;
-	/* The queue may be known by either private structure. */
-	for (index = 0; index != priv->rxqs_n; ++index) {
-		if (((*primary_priv->rxqs)[index] != rxq) &&
-		    ((*priv->rxqs)[index] != rxq))
-			continue;
-		/* Always forward the queue taken from priv. */
-		return priv->dev->rx_pkt_burst((*priv->rxqs)[index],
-					       pkts, pkts_n);
-	}
-	return 0;
-}
-
-/**
- * Allocate a Queue Pair.
- * Optionally setup inline receive if supported.
- *
- * @param priv
- *   Pointer to private structure.
- * @param cq
- *   Completion queue to associate with QP.
- * @param desc
- *   Number of descriptors in QP (hint only).
- * @param rd
- *   Resource domain passed to QP creation.
- *
- * @return
- *   QP pointer or NULL in case of error.
- */
-static struct ibv_qp *
-rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
-	     struct ibv_exp_res_domain *rd)
-{
-	struct ibv_exp_qp_init_attr attr = {
-		.qp_type = IBV_QPT_RAW_PACKET,
-		/* The same CQ serves both send and receive queues. */
-		.send_cq = cq,
-		.recv_cq = cq,
-		.pd = priv->pd,
-		.res_domain = rd,
-		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
-			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
-	};
-
-	/* Outstanding WRs: requested amount capped by the device limit. */
-	attr.cap.max_recv_wr = desc;
-	if (priv->device_attr.max_qp_wr < desc)
-		attr.cap.max_recv_wr = priv->device_attr.max_qp_wr;
-	/* SGEs per WR: MLX4_PMD_SGE_WR_N capped by the device limit. */
-	attr.cap.max_recv_sge = MLX4_PMD_SGE_WR_N;
-	if (priv->device_attr.max_sge < MLX4_PMD_SGE_WR_N)
-		attr.cap.max_recv_sge = priv->device_attr.max_sge;
-#ifdef INLINE_RECV
-	attr.max_inl_recv = priv->inl_recv_size;
-	attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
-#endif
-	return ibv_exp_create_qp(priv->ctx, &attr);
-}
-
-#ifdef RSS_SUPPORT
-
-/**
- * Allocate a RSS Queue Pair.
- * Optionally setup inline receive if supported.
- *
- * @param priv
- *   Pointer to private structure.
- * @param cq
- *   Completion queue to associate with QP.
- * @param desc
- *   Number of descriptors in QP (hint only).
- * @param children_n
- *   If nonzero, a number of children for parent QP and zero for a child.
- * @param rd
- *   Resource domain passed to QP creation.
- * @param rxq_parent
- *   Pointer for a parent in a child case, NULL otherwise.
- *
- * @return
- *   QP pointer or NULL in case of error.
- */
-static struct ibv_qp *
-rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
-		 int children_n, struct ibv_exp_res_domain *rd,
-		 struct rxq *rxq_parent)
-{
-	struct ibv_exp_qp_init_attr attr = {
-		/* CQ to be associated with the send queue. */
-		.send_cq = cq,
-		/* CQ to be associated with the receive queue. */
-		.recv_cq = cq,
-		.cap = {
-			/* Max number of outstanding WRs. */
-			.max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
-					priv->device_attr.max_qp_wr :
-					desc),
-			/* Max number of scatter/gather elements in a WR. */
-			.max_recv_sge = ((priv->device_attr.max_sge <
-					  MLX4_PMD_SGE_WR_N) ?
-					 priv->device_attr.max_sge :
-					 MLX4_PMD_SGE_WR_N),
-		},
-		.qp_type = IBV_QPT_RAW_PACKET,
-		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
-			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN |
-			      IBV_EXP_QP_INIT_ATTR_QPG),
-		.pd = priv->pd,
-		.res_domain = rd,
-	};
-
-#ifdef INLINE_RECV
-	/* Two separate statements, as in rxq_setup_qp(). */
-	attr.max_inl_recv = priv->inl_recv_size;
-	attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
-#endif
-	if (children_n > 0) {
-		attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
-		/* TSS isn't necessary. */
-		attr.qpg.parent_attrib.tss_child_count = 0;
-		/* Largest power of two not above children_n. */
-		attr.qpg.parent_attrib.rss_child_count =
-			rte_align32pow2(children_n + 1) >> 1;
-		DEBUG("initializing parent RSS queue");
-	} else {
-		/* Child case: rxq_parent is dereferenced here. */
-		attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
-		attr.qpg.qpg_parent = rxq_parent->qp;
-		DEBUG("initializing child RSS queue");
-	}
-	return ibv_exp_create_qp(priv->ctx, &attr);
-}
-
-#endif /* RSS_SUPPORT */
-
-/**
- * Reconfigure a RX queue with new parameters.
- *
- * rxq_rehash() does not allocate mbufs, which, if not done from the right
- * thread (such as a control thread), may corrupt the pool.
- * In case of failure, the queue is left untouched.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param rxq
- *   RX queue pointer.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
-{
-	struct priv *priv = rxq->priv;
-	struct rxq tmpl = *rxq;
-	unsigned int mbuf_n;
-	unsigned int desc_n;
-	struct rte_mbuf **pool;
-	unsigned int i, k;
-	struct ibv_exp_qp_attr mod;
-	struct ibv_recv_wr *bad_wr;
-	unsigned int mb_len;
-	int err;
-
-	mb_len = rte_pktmbuf_data_room_size(rxq->mp);
-	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
-	/* Number of descriptors and mbufs currently allocated. */
-	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX4_PMD_SGE_WR_N : 1));
-	mbuf_n = desc_n;
-	/* Toggle RX checksum offload if hardware supports it. */
-	if (priv->hw_csum) {
-		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
-		rxq->csum = tmpl.csum;
-	}
-	if (priv->hw_csum_l2tun) {
-		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
-		rxq->csum_l2tun = tmpl.csum_l2tun;
-	}
-	/* Enable scattered packets support for this queue if necessary. */
-	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
-	if (dev->data->dev_conf.rxmode.enable_scatter &&
-	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
-	     (mb_len - RTE_PKTMBUF_HEADROOM))) {
-		tmpl.sp = 1;
-		desc_n /= MLX4_PMD_SGE_WR_N;
-	} else
-		tmpl.sp = 0;
-	DEBUG("%p: %s scattered packets support (%u WRs)",
-	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
-	/* If scatter mode is the same as before, nothing to do. */
-	if (tmpl.sp == rxq->sp) {
-		DEBUG("%p: nothing to do", (void *)dev);
-		return 0;
-	}
-	/* Remove attached flows if RSS is disabled (no parent queue). */
-	if (!priv->rss && !priv->isolated) {
-		rxq_allmulticast_disable(&tmpl);
-		rxq_promiscuous_disable(&tmpl);
-		rxq_mac_addrs_del(&tmpl);
-		/* Update original queue in case of failure. */
-		rxq->allmulti_flow = tmpl.allmulti_flow;
-		rxq->promisc_flow = tmpl.promisc_flow;
-		memcpy(rxq->mac_configured, tmpl.mac_configured,
-		       sizeof(rxq->mac_configured));
-		memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow));
-	}
-	/* From now on, any failure will render the queue unusable.
-	 * Reinitialize QP. */
-	if (!tmpl.qp)
-		goto skip_init;
-	mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RESET };
-	err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
-	if (err) {
-		ERROR("%p: cannot reset QP: %s", (void *)dev, strerror(err));
-		assert(err > 0);
-		return err;
-	}
-	mod = (struct ibv_exp_qp_attr){
-		/* Move the QP to this state. */
-		.qp_state = IBV_QPS_INIT,
-		/* Primary port number. */
-		.port_num = priv->port
-	};
-	err = ibv_exp_modify_qp(tmpl.qp, &mod,
-				(IBV_EXP_QP_STATE |
-				 IBV_EXP_QP_PORT));
-	if (err) {
-		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
-		      (void *)dev, strerror(err));
-		assert(err > 0);
-		return err;
-	};
-skip_init:
-	err = ibv_resize_cq(tmpl.cq, desc_n);
-	if (err) {
-		ERROR("%p: cannot resize CQ: %s", (void *)dev, strerror(err));
-		assert(err > 0);
-		return err;
-	}
-	/* Reconfigure flows. Do not care for errors. */
-	if (!priv->rss && !priv->isolated) {
-		rxq_mac_addrs_add(&tmpl);
-		if (priv->promisc)
-			rxq_promiscuous_enable(&tmpl);
-		if (priv->allmulti)
-			rxq_allmulticast_enable(&tmpl);
-		/* Update original queue in case of failure. */
-		rxq->allmulti_flow = tmpl.allmulti_flow;
-		rxq->promisc_flow = tmpl.promisc_flow;
-		memcpy(rxq->mac_configured, tmpl.mac_configured,
-		       sizeof(rxq->mac_configured));
-		memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow));
-	}
-	/* Allocate pool. */
-	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
-	if (pool == NULL) {
-		ERROR("%p: cannot allocate memory", (void *)dev);
-		return ENOBUFS;
-	}
-	/* Snatch mbufs from original queue. */
-	k = 0;
-	if (rxq->sp) {
-		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
-
-		for (i = 0; (i != elemof(*elts)); ++i) {
-			struct rxq_elt_sp *elt = &(*elts)[i];
-			unsigned int j;
-
-			for (j = 0; (j != elemof(elt->bufs)); ++j) {
-				assert(elt->bufs[j] != NULL);
-				pool[k++] = elt->bufs[j];
-			}
-		}
-	} else {
-		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
-
-		for (i = 0; (i != elemof(*elts)); ++i) {
-			struct rxq_elt *elt = &(*elts)[i];
-			struct rte_mbuf *buf = (void *)
-				((uintptr_t)elt->sge.addr -
-				 WR_ID(elt->wr.wr_id).offset);
-
-			assert(WR_ID(elt->wr.wr_id).id == i);
-			pool[k++] = buf;
-		}
-	}
-	assert(k == mbuf_n);
-	tmpl.elts_n = 0;
-	tmpl.elts.sp = NULL;
-	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
-	err = ((tmpl.sp) ?
-	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
-	       rxq_alloc_elts(&tmpl, desc_n, pool));
-	if (err) {
-		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
-		rte_free(pool);
-		assert(err > 0);
-		return err;
-	}
-	assert(tmpl.elts_n == desc_n);
-	assert(tmpl.elts.sp != NULL);
-	rte_free(pool);
-	/* Clean up original data. */
-	rxq->elts_n = 0;
-	rte_free(rxq->elts.sp);
-	rxq->elts.sp = NULL;
-	if (!tmpl.qp)
-		goto skip_rtr;
-	/* Post WRs. */
-	err = ibv_post_recv(tmpl.qp,
-			    (tmpl.sp ?
-			     &(*tmpl.elts.sp)[0].wr :
-			     &(*tmpl.elts.no_sp)[0].wr),
-			    &bad_wr);
-	if (err) {
-		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
-		      (void *)dev,
-		      (void *)bad_wr,
-		      strerror(err));
-		goto skip_rtr;
-	}
-	mod = (struct ibv_exp_qp_attr){
-		.qp_state = IBV_QPS_RTR
-	};
-	err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
-	if (err)
-		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
-		      (void *)dev, strerror(err));
-skip_rtr:
-	*rxq = tmpl;
-	assert(err >= 0);
-	return err;
-}
-
-/**
- * Create verbs QP resources associated with a rxq.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param desc
- *   Number of descriptors to configure in queue.
- * @param inactive
- *   If true, the queue is disabled because its index is higher or
- *   equal to the real number of queues, which must be a power of 2.
- * @param children_n
- *   The number of children in a parent case, zero for a child.
- * @param rxq_parent
- *   The pointer to a parent RX structure for a child in RSS case,
- *   NULL for parent.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-rxq_create_qp(struct rxq *rxq,
-	      uint16_t desc,
-	      int inactive,
-	      int children_n,
-	      struct rxq *rxq_parent)
-{
-	int ret;
-	struct ibv_exp_qp_attr mod;
-	struct ibv_exp_query_intf_params params;
-	enum ibv_exp_query_intf_status status;
-	struct ibv_recv_wr *bad_wr;
-	int parent = (children_n > 0);
-	struct priv *priv = rxq->priv;
-
-#ifdef RSS_SUPPORT
-	if (priv->rss && !inactive && (rxq_parent || parent))
-		rxq->qp = rxq_setup_qp_rss(priv, rxq->cq, desc,
-					   children_n, rxq->rd,
-					   rxq_parent);
-	else
-#endif /* RSS_SUPPORT */
-		rxq->qp = rxq_setup_qp(priv, rxq->cq, desc, rxq->rd);
-	if (rxq->qp == NULL) {
-		ret = (errno ? errno : EINVAL);
-		ERROR("QP creation failure: %s",
-		      strerror(ret));
-		return ret;
-	}
-	mod = (struct ibv_exp_qp_attr){
-		/* Move the QP to this state. */
-		.qp_state = IBV_QPS_INIT,
-		/* Primary port number. */
-		.port_num = priv->port
-	};
-	ret = ibv_exp_modify_qp(rxq->qp, &mod,
-				(IBV_EXP_QP_STATE |
-#ifdef RSS_SUPPORT
-				 (parent ? IBV_EXP_QP_GROUP_RSS : 0) |
-#endif /* RSS_SUPPORT */
-				 IBV_EXP_QP_PORT));
-	if (ret) {
-		ERROR("QP state to IBV_QPS_INIT failed: %s",
-		      strerror(ret));
-		return ret;
-	}
-	if (!priv->isolated && (parent || !priv->rss)) {
-		/* Configure MAC and broadcast addresses. */
-		ret = rxq_mac_addrs_add(rxq);
-		if (ret) {
-			ERROR("QP flow attachment failed: %s",
-			      strerror(ret));
-			return ret;
-		}
-	}
-	if (!parent) {
-		ret = ibv_post_recv(rxq->qp,
-				    (rxq->sp ?
-				     &(*rxq->elts.sp)[0].wr :
-				     &(*rxq->elts.no_sp)[0].wr),
-				    &bad_wr);
-		if (ret) {
-			ERROR("ibv_post_recv() failed for WR %p: %s",
-			      (void *)bad_wr,
-			      strerror(ret));
-			return ret;
-		}
-	}
-	mod = (struct ibv_exp_qp_attr){
-		.qp_state = IBV_QPS_RTR
-	};
-	ret = ibv_exp_modify_qp(rxq->qp, &mod, IBV_EXP_QP_STATE);
-	if (ret) {
-		ERROR("QP state to IBV_QPS_RTR failed: %s",
-		      strerror(ret));
-		return ret;
-	}
-	params = (struct ibv_exp_query_intf_params){
-		.intf_scope = IBV_EXP_INTF_GLOBAL,
-		.intf = IBV_EXP_INTF_QP_BURST,
-		.obj = rxq->qp,
-	};
-	rxq->if_qp = ibv_exp_query_intf(priv->ctx, &params, &status);
-	if (rxq->if_qp == NULL) {
-		ERROR("QP interface family query failed with status %d",
-		      status);
-		return errno;
-	}
-	return 0;
-}
-
-/**
- * Configure a RX queue.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param rxq
- *   Pointer to RX queue structure.
- * @param desc
- *   Number of descriptors to configure in queue.
- * @param socket
- *   NUMA socket on which memory must be allocated.
- * @param inactive
- *   If true, the queue is disabled because its index is higher or
- *   equal to the real number of queues, which must be a power of 2.
- * @param[in] conf
- *   Thresholds parameters.
- * @param mp
- *   Memory pool for buffer allocations.
- * @param children_n
- *   The number of children in a parent case, zero for a child.
- * @param rxq_parent
- *   The pointer to a parent RX structure (or NULL) in a child case,
- *   NULL for parent.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
-	  unsigned int socket, int inactive,
-	  const struct rte_eth_rxconf *conf,
-	  struct rte_mempool *mp, int children_n,
-	  struct rxq *rxq_parent)
-{
-	struct priv *priv = dev->data->dev_private;
-	struct rxq tmpl = {
-		.priv = priv,
-		.mp = mp,
-		.socket = socket
-	};
-	union {
-		struct ibv_exp_query_intf_params params;
-		struct ibv_exp_cq_init_attr cq;
-		struct ibv_exp_res_domain_init_attr rd;
-	} attr;
-	enum ibv_exp_query_intf_status status;
-	unsigned int mb_len;
-	int ret = 0;
-	int parent = (children_n > 0);
-
-	(void)conf; /* Thresholds configuration (ignored). */
-	/*
-	 * If this is a parent queue, hardware must support RSS and
-	 * RSS must be enabled.
-	 */
-	assert((!parent) || ((priv->hw_rss) && (priv->rss)));
-	if (parent) {
-		/* Even if unused, ibv_create_cq() requires at least one
-		 * descriptor. */
-		desc = 1;
-		goto skip_mr;
-	}
-	mb_len = rte_pktmbuf_data_room_size(mp);
-	if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) {
-		ERROR("%p: invalid number of RX descriptors (must be a"
-		      " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N);
-		return EINVAL;
-	}
-	/* Toggle RX checksum offload if hardware supports it. */
-	if (priv->hw_csum)
-		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
-	if (priv->hw_csum_l2tun)
-		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
-	/* Enable scattered packets support for this queue if necessary. */
-	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
-	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
-	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
-		tmpl.sp = 0;
-	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
-		tmpl.sp = 1;
-		desc /= MLX4_PMD_SGE_WR_N;
-	} else {
-		WARN("%p: the requested maximum Rx packet size (%u) is"
-		     " larger than a single mbuf (%u) and scattered"
-		     " mode has not been requested",
-		     (void *)dev,
-		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
-		     mb_len - RTE_PKTMBUF_HEADROOM);
-	}
-	DEBUG("%p: %s scattered packets support (%u WRs)",
-	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
-	/* Use the entire RX mempool as the memory region. */
-	tmpl.mr = mlx4_mp2mr(priv->pd, mp);
-	if (tmpl.mr == NULL) {
-		ret = EINVAL;
-		ERROR("%p: MR creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-skip_mr:
-	attr.rd = (struct ibv_exp_res_domain_init_attr){
-		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
-			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
-		.thread_model = IBV_EXP_THREAD_SINGLE,
-		.msg_model = IBV_EXP_MSG_HIGH_BW,
-	};
-	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
-	if (tmpl.rd == NULL) {
-		ret = ENOMEM;
-		ERROR("%p: RD creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	if (dev->data->dev_conf.intr_conf.rxq) {
-		tmpl.channel = ibv_create_comp_channel(priv->ctx);
-		if (tmpl.channel == NULL) {
-			ret = ENOMEM;
-			ERROR("%p: Rx interrupt completion channel creation"
-			      " failure: %s",
-			      (void *)dev, strerror(ret));
-			goto error;
-		}
-	}
-	attr.cq = (struct ibv_exp_cq_init_attr){
-		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
-		.res_domain = tmpl.rd,
-	};
-	tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, tmpl.channel, 0,
-				    &attr.cq);
-	if (tmpl.cq == NULL) {
-		ret = ENOMEM;
-		ERROR("%p: CQ creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	DEBUG("priv->device_attr.max_qp_wr is %d",
-	      priv->device_attr.max_qp_wr);
-	DEBUG("priv->device_attr.max_sge is %d",
-	      priv->device_attr.max_sge);
-	/* Allocate descriptors for RX queues, except for the RSS parent. */
-	if (parent)
-		goto skip_alloc;
-	if (tmpl.sp)
-		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
-	else
-		ret = rxq_alloc_elts(&tmpl, desc, NULL);
+	/* Prepare internal flow rules. */
+	ret = mlx4_flow_sync(priv, &error);
 	if (ret) {
-		ERROR("%p: RXQ allocation failed: %s",
-		      (void *)dev, strerror(ret));
-		return ret;
+		ERROR("cannot set up internal flow rules (code %d, \"%s\"),"
+		      " flow error type %d, cause %p, message: %s",
+		      -ret, strerror(-ret), error.type, error.cause,
+		      error.message ? error.message : "(unspecified)");
 	}
-skip_alloc:
-	if (parent || rxq_parent || !priv->rss) {
-		ret = rxq_create_qp(&tmpl, desc, inactive,
-				    children_n, rxq_parent);
-		if (ret)
-			goto error;
-	}
-	/* Save port ID. */
-	tmpl.port_id = dev->data->port_id;
-	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
-	attr.params = (struct ibv_exp_query_intf_params){
-		.intf_scope = IBV_EXP_INTF_GLOBAL,
-		.intf = IBV_EXP_INTF_CQ,
-		.obj = tmpl.cq,
-	};
-	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
-	if (tmpl.if_cq == NULL) {
-		ret = EINVAL;
-		ERROR("%p: CQ interface family query failed with status %d",
-		      (void *)dev, status);
-		goto error;
-	}
-	/* Clean up rxq in case we're reinitializing it. */
-	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
-	rxq_cleanup(rxq);
-	*rxq = tmpl;
-	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
-	assert(ret == 0);
-	return 0;
-error:
-	rxq_cleanup(&tmpl);
-	assert(ret > 0);
 	return ret;
 }
 
 /**
- * DPDK callback to configure a RX queue.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param idx
- *   RX queue index.
- * @param desc
- *   Number of descriptors to configure in queue.
- * @param socket
- *   NUMA socket on which memory must be allocated.
- * @param[in] conf
- *   Thresholds parameters.
- * @param mp
- *   Memory pool for buffer allocations.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
-		    unsigned int socket, const struct rte_eth_rxconf *conf,
-		    struct rte_mempool *mp)
-{
-	struct rxq *parent;
-	struct priv *priv = dev->data->dev_private;
-	struct rxq *rxq = (*priv->rxqs)[idx];
-	int inactive = 0;
-	int ret;
-
-	if (mlx4_is_secondary())
-		return -E_RTE_SECONDARY;
-	priv_lock(priv);
-	DEBUG("%p: configuring queue %u for %u descriptors",
-	      (void *)dev, idx, desc);
-	if (idx >= priv->rxqs_n) {
-		ERROR("%p: queue index out of range (%u >= %u)",
-		      (void *)dev, idx, priv->rxqs_n);
-		priv_unlock(priv);
-		return -EOVERFLOW;
-	}
-	if (rxq != NULL) {
-		DEBUG("%p: reusing already allocated queue index %u (%p)",
-		      (void *)dev, idx, (void *)rxq);
-		if (priv->started) {
-			priv_unlock(priv);
-			return -EEXIST;
-		}
-		(*priv->rxqs)[idx] = NULL;
-		rxq_cleanup(rxq);
-	} else {
-		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
-		if (rxq == NULL) {
-			ERROR("%p: unable to allocate queue index %u",
-			      (void *)dev, idx);
-			priv_unlock(priv);
-			return -ENOMEM;
-		}
-	}
-	if (priv->rss && !priv->isolated) {
-		/* The list consists of the single default one. */
-		parent = LIST_FIRST(&priv->parents);
-		if (idx >= rte_align32pow2(priv->rxqs_n + 1) >> 1)
-			inactive = 1;
-	} else {
-		parent = NULL;
-	}
-	ret = rxq_setup(dev, rxq, desc, socket,
-			inactive, conf, mp, 0, parent);
-	if (ret)
-		rte_free(rxq);
-	else {
-		rxq->stats.idx = idx;
-		DEBUG("%p: adding RX queue %p to list",
-		      (void *)dev, (void *)rxq);
-		(*priv->rxqs)[idx] = rxq;
-		/* Update receive callback. */
-		if (rxq->sp)
-			dev->rx_pkt_burst = mlx4_rx_burst_sp;
-		else
-			dev->rx_pkt_burst = mlx4_rx_burst;
-	}
-	priv_unlock(priv);
-	return -ret;
-}
-
-/**
- * DPDK callback to release a RX queue.
- *
- * @param dpdk_rxq
- *   Generic RX queue pointer.
- */
-static void
-mlx4_rx_queue_release(void *dpdk_rxq)
-{
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct priv *priv;
-	unsigned int i;
-
-	if (mlx4_is_secondary())
-		return;
-	if (rxq == NULL)
-		return;
-	priv = rxq->priv;
-	priv_lock(priv);
-	for (i = 0; (i != priv->rxqs_n); ++i)
-		if ((*priv->rxqs)[i] == rxq) {
-			DEBUG("%p: removing RX queue %p from list",
-			      (void *)priv->dev, (void *)rxq);
-			(*priv->rxqs)[i] = NULL;
-			break;
-		}
-	rxq_cleanup(rxq);
-	rte_free(rxq);
-	priv_unlock(priv);
-}
-
-static int
-priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
-
-static int
-priv_dev_removal_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
-
-static int
-priv_dev_link_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
-
-/**
  * DPDK callback to start the device.
  *
- * Simulate device start by attaching all configured flows.
+ * Simulate device start by initializing common RSS resources, installing
+ * interrupt handlers, attaching all flow rules and enabling Rx/Tx callbacks.
  *
  * @param dev
  *   Pointer to Ethernet device structure.
  *
  * @return
- *   0 on success, negative errno value on failure.
+ *   0 on success, negative errno value otherwise and rte_errno is set.
  */
 static int
 mlx4_dev_start(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-	unsigned int i = 0;
-	unsigned int r;
-	struct rxq *rxq;
+	struct rte_flow_error error;
 	int ret;
 
-	if (mlx4_is_secondary())
-		return -E_RTE_SECONDARY;
-	priv_lock(priv);
-	if (priv->started) {
-		priv_unlock(priv);
+	if (priv->started)
 		return 0;
-	}
 	DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
 	priv->started = 1;
-	if (priv->isolated) {
-		rxq = NULL;
-		r = 1;
-	} else if (priv->rss) {
-		rxq = LIST_FIRST(&priv->parents);
-		r = 1;
-	} else {
-		rxq = (*priv->rxqs)[0];
-		r = priv->rxqs_n;
-	}
-	/* Iterate only once when RSS is enabled. */
-	do {
-		/* Ignore nonexistent RX queues. */
-		if (rxq == NULL)
-			continue;
-		ret = rxq_mac_addrs_add(rxq);
-		if (!ret && priv->promisc)
-			ret = rxq_promiscuous_enable(rxq);
-		if (!ret && priv->allmulti)
-			ret = rxq_allmulticast_enable(rxq);
-		if (!ret)
-			continue;
-		WARN("%p: QP flow attachment failed: %s",
-		     (void *)dev, strerror(ret));
-		goto err;
-	} while ((--r) && ((rxq = (*priv->rxqs)[++i]), i));
-	ret = priv_dev_link_interrupt_handler_install(priv, dev);
+	ret = mlx4_rss_init(priv);
 	if (ret) {
-		ERROR("%p: LSC handler install failed",
-		     (void *)dev);
+		ERROR("%p: cannot initialize RSS resources: %s",
+		      (void *)dev, strerror(-ret));
 		goto err;
 	}
-	ret = priv_dev_removal_interrupt_handler_install(priv, dev);
+	ret = mlx4_intr_install(priv);
 	if (ret) {
-		ERROR("%p: RMV handler install failed",
+		ERROR("%p: interrupt handler installation failed",
 		     (void *)dev);
 		goto err;
 	}
-	ret = priv_rx_intr_vec_enable(priv);
-	if (ret) {
-		ERROR("%p: Rx interrupt vector creation failed",
-		      (void *)dev);
-		goto err;
-	}
-	ret = mlx4_priv_flow_start(priv);
+	ret = mlx4_flow_sync(priv, &error);
 	if (ret) {
-		ERROR("%p: flow start failed: %s",
-		      (void *)dev, strerror(ret));
+		ERROR("%p: cannot attach flow rules (code %d, \"%s\"),"
+		      " flow error type %d, cause %p, message: %s",
+		      (void *)dev,
+		      -ret, strerror(-ret), error.type, error.cause,
+		      error.message ? error.message : "(unspecified)");
 		goto err;
 	}
-	priv_unlock(priv);
+	rte_wmb();
+	dev->tx_pkt_burst = mlx4_tx_burst;
+	dev->rx_pkt_burst = mlx4_rx_burst;
 	return 0;
 err:
 	/* Rollback. */
-	while (i != 0) {
-		rxq = (*priv->rxqs)[i--];
-		if (rxq != NULL) {
-			rxq_allmulticast_disable(rxq);
-			rxq_promiscuous_disable(rxq);
-			rxq_mac_addrs_del(rxq);
-		}
-	}
 	priv->started = 0;
-	priv_unlock(priv);
-	return -ret;
+	return ret;
 }
 
 /**
@@ -4220,102 +178,19 @@ static void
 mlx4_dev_stop(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-	unsigned int i = 0;
-	unsigned int r;
-	struct rxq *rxq;
 
-	if (mlx4_is_secondary())
-		return;
-	priv_lock(priv);
-	if (!priv->started) {
-		priv_unlock(priv);
+	if (!priv->started)
 		return;
-	}
 	DEBUG("%p: detaching flows from all RX queues", (void *)dev);
 	priv->started = 0;
-	if (priv->isolated) {
-		rxq = NULL;
-		r = 1;
-	} else if (priv->rss) {
-		rxq = LIST_FIRST(&priv->parents);
-		r = 1;
-	} else {
-		rxq = (*priv->rxqs)[0];
-		r = priv->rxqs_n;
-	}
-	mlx4_priv_flow_stop(priv);
-	/* Iterate only once when RSS is enabled. */
-	do {
-		/* Ignore nonexistent RX queues. */
-		if (rxq == NULL)
-			continue;
-		rxq_allmulticast_disable(rxq);
-		rxq_promiscuous_disable(rxq);
-		rxq_mac_addrs_del(rxq);
-	} while ((--r) && ((rxq = (*priv->rxqs)[++i]), i));
-	priv_unlock(priv);
-}
-
-/**
- * Dummy DPDK callback for TX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	(void)dpdk_txq;
-	(void)pkts;
-	(void)pkts_n;
-	return 0;
-}
-
-/**
- * Dummy DPDK callback for RX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	(void)dpdk_rxq;
-	(void)pkts;
-	(void)pkts_n;
-	return 0;
+	dev->tx_pkt_burst = mlx4_tx_burst_removed;
+	dev->rx_pkt_burst = mlx4_rx_burst_removed;
+	rte_wmb();
+	mlx4_flow_sync(priv, NULL);
+	mlx4_intr_uninstall(priv);
+	mlx4_rss_deinit(priv);
 }
 
-static int
-priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *);
-
-static int
-priv_dev_removal_interrupt_handler_uninstall(struct priv *,
-					     struct rte_eth_dev *);
-
-static int
-priv_dev_link_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *);
-
 /**
  * DPDK callback to close the device.
  *
@@ -4327,1047 +202,58 @@ priv_dev_link_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *);
 static void
 mlx4_dev_close(struct rte_eth_dev *dev)
 {
-	struct priv *priv = mlx4_get_priv(dev);
-	void *tmp;
+	struct priv *priv = dev->data->dev_private;
 	unsigned int i;
 
-	if (priv == NULL)
-		return;
-	priv_lock(priv);
 	DEBUG("%p: closing device \"%s\"",
 	      (void *)dev,
 	      ((priv->ctx != NULL) ? priv->ctx->device->name : ""));
-	/* Prevent crashes when queues are still in use. This is unfortunately
-	 * still required for DPDK 1.3 because some programs (such as testpmd)
-	 * never release them before closing the device. */
-	dev->rx_pkt_burst = removed_rx_burst;
-	dev->tx_pkt_burst = removed_tx_burst;
-	if (priv->rxqs != NULL) {
-		/* XXX race condition if mlx4_rx_burst() is still running. */
-		usleep(1000);
-		for (i = 0; (i != priv->rxqs_n); ++i) {
-			tmp = (*priv->rxqs)[i];
-			if (tmp == NULL)
-				continue;
-			(*priv->rxqs)[i] = NULL;
-			rxq_cleanup(tmp);
-			rte_free(tmp);
-		}
-		priv->rxqs_n = 0;
-		priv->rxqs = NULL;
-	}
-	if (priv->txqs != NULL) {
-		/* XXX race condition if mlx4_tx_burst() is still running. */
-		usleep(1000);
-		for (i = 0; (i != priv->txqs_n); ++i) {
-			tmp = (*priv->txqs)[i];
-			if (tmp == NULL)
-				continue;
-			(*priv->txqs)[i] = NULL;
-			txq_cleanup(tmp);
-			rte_free(tmp);
-		}
-		priv->txqs_n = 0;
-		priv->txqs = NULL;
-	}
-	if (priv->rss)
-		priv_parent_list_cleanup(priv);
+	dev->rx_pkt_burst = mlx4_rx_burst_removed;
+	dev->tx_pkt_burst = mlx4_tx_burst_removed;
+	rte_wmb();
+	mlx4_flow_clean(priv);
+	for (i = 0; i != dev->data->nb_rx_queues; ++i)
+		mlx4_rx_queue_release(dev->data->rx_queues[i]);
+	for (i = 0; i != dev->data->nb_tx_queues; ++i)
+		mlx4_tx_queue_release(dev->data->tx_queues[i]);
 	if (priv->pd != NULL) {
 		assert(priv->ctx != NULL);
 		claim_zero(ibv_dealloc_pd(priv->pd));
 		claim_zero(ibv_close_device(priv->ctx));
 	} else
 		assert(priv->ctx == NULL);
-	priv_dev_removal_interrupt_handler_uninstall(priv, dev);
-	priv_dev_link_interrupt_handler_uninstall(priv, dev);
-	priv_rx_intr_vec_disable(priv);
-	priv_unlock(priv);
+	mlx4_intr_uninstall(priv);
 	memset(priv, 0, sizeof(*priv));
 }
 
-/**
- * Change the link state (UP / DOWN).
- *
- * @param priv
- *   Pointer to Ethernet device private data.
- * @param up
- *   Nonzero for link up, otherwise link down.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-priv_set_link(struct priv *priv, int up)
-{
-	struct rte_eth_dev *dev = priv->dev;
-	int err;
-	unsigned int i;
-
-	if (up) {
-		err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
-		if (err)
-			return err;
-		for (i = 0; i < priv->rxqs_n; i++)
-			if ((*priv->rxqs)[i]->sp)
-				break;
-		/* Check if an sp queue exists.
-		 * Note: Some old frames might be received.
-		 */
-		if (i == priv->rxqs_n)
-			dev->rx_pkt_burst = mlx4_rx_burst;
-		else
-			dev->rx_pkt_burst = mlx4_rx_burst_sp;
-		dev->tx_pkt_burst = mlx4_tx_burst;
-	} else {
-		err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
-		if (err)
-			return err;
-		dev->rx_pkt_burst = removed_rx_burst;
-		dev->tx_pkt_burst = removed_tx_burst;
-	}
-	return 0;
-}
-
-/**
- * DPDK callback to bring the link DOWN.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-mlx4_set_link_down(struct rte_eth_dev *dev)
-{
-	struct priv *priv = dev->data->dev_private;
-	int err;
-
-	priv_lock(priv);
-	err = priv_set_link(priv, 0);
-	priv_unlock(priv);
-	return err;
-}
-
-/**
- * DPDK callback to bring the link UP.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-mlx4_set_link_up(struct rte_eth_dev *dev)
-{
-	struct priv *priv = dev->data->dev_private;
-	int err;
-
-	priv_lock(priv);
-	err = priv_set_link(priv, 1);
-	priv_unlock(priv);
-	return err;
-}
-/**
- * DPDK callback to get information about the device.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param[out] info
- *   Info structure output buffer.
- */
-static void
-mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
-{
-	struct priv *priv = mlx4_get_priv(dev);
-	unsigned int max;
-	char ifname[IF_NAMESIZE];
-
-	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
-	if (priv == NULL)
-		return;
-	priv_lock(priv);
-	/* FIXME: we should ask the device for these values. */
-	info->min_rx_bufsize = 32;
-	info->max_rx_pktlen = 65536;
-	/*
-	 * Since we need one CQ per QP, the limit is the minimum number
-	 * between the two values.
-	 */
-	max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ?
-	       priv->device_attr.max_qp : priv->device_attr.max_cq);
-	/* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
-	if (max >= 65535)
-		max = 65535;
-	info->max_rx_queues = max;
-	info->max_tx_queues = max;
-	/* Last array entry is reserved for broadcast. */
-	info->max_mac_addrs = (elemof(priv->mac) - 1);
-	info->rx_offload_capa =
-		(priv->hw_csum ?
-		 (DEV_RX_OFFLOAD_IPV4_CKSUM |
-		  DEV_RX_OFFLOAD_UDP_CKSUM |
-		  DEV_RX_OFFLOAD_TCP_CKSUM) :
-		 0);
-	info->tx_offload_capa =
-		(priv->hw_csum ?
-		 (DEV_TX_OFFLOAD_IPV4_CKSUM |
-		  DEV_TX_OFFLOAD_UDP_CKSUM |
-		  DEV_TX_OFFLOAD_TCP_CKSUM) :
-		 0);
-	if (priv_get_ifname(priv, &ifname) == 0)
-		info->if_index = if_nametoindex(ifname);
-	info->speed_capa =
-			ETH_LINK_SPEED_1G |
-			ETH_LINK_SPEED_10G |
-			ETH_LINK_SPEED_20G |
-			ETH_LINK_SPEED_40G |
-			ETH_LINK_SPEED_56G;
-	priv_unlock(priv);
-}
-
-static const uint32_t *
-mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev)
-{
-	static const uint32_t ptypes[] = {
-		/* refers to rxq_cq_to_pkt_type() */
-		RTE_PTYPE_L3_IPV4,
-		RTE_PTYPE_L3_IPV6,
-		RTE_PTYPE_INNER_L3_IPV4,
-		RTE_PTYPE_INNER_L3_IPV6,
-		RTE_PTYPE_UNKNOWN
-	};
-
-	if (dev->rx_pkt_burst == mlx4_rx_burst ||
-	    dev->rx_pkt_burst == mlx4_rx_burst_sp)
-		return ptypes;
-	return NULL;
-}
-
-/**
- * DPDK callback to get device statistics.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param[out] stats
- *   Stats structure output buffer.
- */
-static void
-mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
-{
-	struct priv *priv = mlx4_get_priv(dev);
-	struct rte_eth_stats tmp = {0};
-	unsigned int i;
-	unsigned int idx;
-
-	if (priv == NULL)
-		return;
-	priv_lock(priv);
-	/* Add software counters. */
-	for (i = 0; (i != priv->rxqs_n); ++i) {
-		struct rxq *rxq = (*priv->rxqs)[i];
-
-		if (rxq == NULL)
-			continue;
-		idx = rxq->stats.idx;
-		if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
-#ifdef MLX4_PMD_SOFT_COUNTERS
-			tmp.q_ipackets[idx] += rxq->stats.ipackets;
-			tmp.q_ibytes[idx] += rxq->stats.ibytes;
-#endif
-			tmp.q_errors[idx] += (rxq->stats.idropped +
-					      rxq->stats.rx_nombuf);
-		}
-#ifdef MLX4_PMD_SOFT_COUNTERS
-		tmp.ipackets += rxq->stats.ipackets;
-		tmp.ibytes += rxq->stats.ibytes;
-#endif
-		tmp.ierrors += rxq->stats.idropped;
-		tmp.rx_nombuf += rxq->stats.rx_nombuf;
-	}
-	for (i = 0; (i != priv->txqs_n); ++i) {
-		struct txq *txq = (*priv->txqs)[i];
-
-		if (txq == NULL)
-			continue;
-		idx = txq->stats.idx;
-		if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
-#ifdef MLX4_PMD_SOFT_COUNTERS
-			tmp.q_opackets[idx] += txq->stats.opackets;
-			tmp.q_obytes[idx] += txq->stats.obytes;
-#endif
-			tmp.q_errors[idx] += txq->stats.odropped;
-		}
-#ifdef MLX4_PMD_SOFT_COUNTERS
-		tmp.opackets += txq->stats.opackets;
-		tmp.obytes += txq->stats.obytes;
-#endif
-		tmp.oerrors += txq->stats.odropped;
-	}
-#ifndef MLX4_PMD_SOFT_COUNTERS
-	/* FIXME: retrieve and add hardware counters. */
-#endif
-	*stats = tmp;
-	priv_unlock(priv);
-}
-
-/**
- * DPDK callback to clear device statistics.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- */
-static void
-mlx4_stats_reset(struct rte_eth_dev *dev)
-{
-	struct priv *priv = mlx4_get_priv(dev);
-	unsigned int i;
-	unsigned int idx;
-
-	if (priv == NULL)
-		return;
-	priv_lock(priv);
-	for (i = 0; (i != priv->rxqs_n); ++i) {
-		if ((*priv->rxqs)[i] == NULL)
-			continue;
-		idx = (*priv->rxqs)[i]->stats.idx;
-		(*priv->rxqs)[i]->stats =
-			(struct mlx4_rxq_stats){ .idx = idx };
-	}
-	for (i = 0; (i != priv->txqs_n); ++i) {
-		if ((*priv->txqs)[i] == NULL)
-			continue;
-		idx = (*priv->txqs)[i]->stats.idx;
-		(*priv->txqs)[i]->stats =
-			(struct mlx4_txq_stats){ .idx = idx };
-	}
-#ifndef MLX4_PMD_SOFT_COUNTERS
-	/* FIXME: reset hardware counters. */
-#endif
-	priv_unlock(priv);
-}
-
-/**
- * DPDK callback to remove a MAC address.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param index
- *   MAC address index.
- */
-static void
-mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
-{
-	struct priv *priv = dev->data->dev_private;
-
-	if (mlx4_is_secondary())
-		return;
-	priv_lock(priv);
-	if (priv->isolated)
-		goto end;
-	DEBUG("%p: removing MAC address from index %" PRIu32,
-	      (void *)dev, index);
-	/* Last array entry is reserved for broadcast. */
-	if (index >= (elemof(priv->mac) - 1))
-		goto end;
-	priv_mac_addr_del(priv, index);
-end:
-	priv_unlock(priv);
-}
-
-/**
- * DPDK callback to add a MAC address.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param mac_addr
- *   MAC address to register.
- * @param index
- *   MAC address index.
- * @param vmdq
- *   VMDq pool index to associate address with (ignored).
- */
-static int
-mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
-		  uint32_t index, uint32_t vmdq)
-{
-	struct priv *priv = dev->data->dev_private;
-	int re;
-
-	if (mlx4_is_secondary())
-		return -ENOTSUP;
-	(void)vmdq;
-	priv_lock(priv);
-	if (priv->isolated) {
-		DEBUG("%p: cannot add MAC address, "
-		      "device is in isolated mode", (void *)dev);
-		re = EPERM;
-		goto end;
-	}
-	DEBUG("%p: adding MAC address at index %" PRIu32,
-	      (void *)dev, index);
-	/* Last array entry is reserved for broadcast. */
-	if (index >= (elemof(priv->mac) - 1)) {
-		re = EINVAL;
-		goto end;
-	}
-	re = priv_mac_addr_add(priv, index,
-			       (const uint8_t (*)[ETHER_ADDR_LEN])
-			       mac_addr->addr_bytes);
-end:
-	priv_unlock(priv);
-	return -re;
-}
-
-/**
- * DPDK callback to set the primary MAC address.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param mac_addr
- *   MAC address to register.
- */
-static void
-mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
-{
-	DEBUG("%p: setting primary MAC address", (void *)dev);
-	mlx4_mac_addr_remove(dev, 0);
-	mlx4_mac_addr_add(dev, mac_addr, 0, 0);
-}
-
-/**
- * DPDK callback to enable promiscuous mode.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- */
-static void
-mlx4_promiscuous_enable(struct rte_eth_dev *dev)
-{
-	struct priv *priv = dev->data->dev_private;
-	unsigned int i;
-	int ret;
-
-	if (mlx4_is_secondary())
-		return;
-	priv_lock(priv);
-	if (priv->isolated) {
-		DEBUG("%p: cannot enable promiscuous, "
-		      "device is in isolated mode", (void *)dev);
-		priv_unlock(priv);
-		return;
-	}
-	if (priv->promisc) {
-		priv_unlock(priv);
-		return;
-	}
-	/* If device isn't started, this is all we need to do. */
-	if (!priv->started)
-		goto end;
-	if (priv->rss) {
-		ret = rxq_promiscuous_enable(LIST_FIRST(&priv->parents));
-		if (ret) {
-			priv_unlock(priv);
-			return;
-		}
-		goto end;
-	}
-	for (i = 0; (i != priv->rxqs_n); ++i) {
-		if ((*priv->rxqs)[i] == NULL)
-			continue;
-		ret = rxq_promiscuous_enable((*priv->rxqs)[i]);
-		if (!ret)
-			continue;
-		/* Failure, rollback. */
-		while (i != 0)
-			if ((*priv->rxqs)[--i] != NULL)
-				rxq_promiscuous_disable((*priv->rxqs)[i]);
-		priv_unlock(priv);
-		return;
-	}
-end:
-	priv->promisc = 1;
-	priv_unlock(priv);
-}
-
-/**
- * DPDK callback to disable promiscuous mode.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- */
-static void
-mlx4_promiscuous_disable(struct rte_eth_dev *dev)
-{
-	struct priv *priv = dev->data->dev_private;
-	unsigned int i;
-
-	if (mlx4_is_secondary())
-		return;
-	priv_lock(priv);
-	if (!priv->promisc || priv->isolated) {
-		priv_unlock(priv);
-		return;
-	}
-	if (priv->rss) {
-		rxq_promiscuous_disable(LIST_FIRST(&priv->parents));
-		goto end;
-	}
-	for (i = 0; (i != priv->rxqs_n); ++i)
-		if ((*priv->rxqs)[i] != NULL)
-			rxq_promiscuous_disable((*priv->rxqs)[i]);
-end:
-	priv->promisc = 0;
-	priv_unlock(priv);
-}
-
-/**
- * DPDK callback to enable allmulti mode.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- */
-static void
-mlx4_allmulticast_enable(struct rte_eth_dev *dev)
-{
-	struct priv *priv = dev->data->dev_private;
-	unsigned int i;
-	int ret;
-
-	if (mlx4_is_secondary())
-		return;
-	priv_lock(priv);
-	if (priv->isolated) {
-		DEBUG("%p: cannot enable allmulticast, "
-		      "device is in isolated mode", (void *)dev);
-		priv_unlock(priv);
-		return;
-	}
-	if (priv->allmulti) {
-		priv_unlock(priv);
-		return;
-	}
-	/* If device isn't started, this is all we need to do. */
-	if (!priv->started)
-		goto end;
-	if (priv->rss) {
-		ret = rxq_allmulticast_enable(LIST_FIRST(&priv->parents));
-		if (ret) {
-			priv_unlock(priv);
-			return;
-		}
-		goto end;
-	}
-	for (i = 0; (i != priv->rxqs_n); ++i) {
-		if ((*priv->rxqs)[i] == NULL)
-			continue;
-		ret = rxq_allmulticast_enable((*priv->rxqs)[i]);
-		if (!ret)
-			continue;
-		/* Failure, rollback. */
-		while (i != 0)
-			if ((*priv->rxqs)[--i] != NULL)
-				rxq_allmulticast_disable((*priv->rxqs)[i]);
-		priv_unlock(priv);
-		return;
-	}
-end:
-	priv->allmulti = 1;
-	priv_unlock(priv);
-}
-
-/**
- * DPDK callback to disable allmulti mode.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- */
-static void
-mlx4_allmulticast_disable(struct rte_eth_dev *dev)
-{
-	struct priv *priv = dev->data->dev_private;
-	unsigned int i;
-
-	if (mlx4_is_secondary())
-		return;
-	priv_lock(priv);
-	if (!priv->allmulti || priv->isolated) {
-		priv_unlock(priv);
-		return;
-	}
-	if (priv->rss) {
-		rxq_allmulticast_disable(LIST_FIRST(&priv->parents));
-		goto end;
-	}
-	for (i = 0; (i != priv->rxqs_n); ++i)
-		if ((*priv->rxqs)[i] != NULL)
-			rxq_allmulticast_disable((*priv->rxqs)[i]);
-end:
-	priv->allmulti = 0;
-	priv_unlock(priv);
-}
-
-/**
- * DPDK callback to retrieve physical link information.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param wait_to_complete
- *   Wait for request completion (ignored).
- */
-static int
-mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete)
-{
-	const struct priv *priv = mlx4_get_priv(dev);
-	struct ethtool_cmd edata = {
-		.cmd = ETHTOOL_GSET
-	};
-	struct ifreq ifr;
-	struct rte_eth_link dev_link;
-	int link_speed = 0;
-
-	/* priv_lock() is not taken to allow concurrent calls. */
-
-	if (priv == NULL)
-		return -EINVAL;
-	(void)wait_to_complete;
-	if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
-		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
-		return -1;
-	}
-	memset(&dev_link, 0, sizeof(dev_link));
-	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
-				(ifr.ifr_flags & IFF_RUNNING));
-	ifr.ifr_data = (void *)&edata;
-	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
-		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
-		     strerror(errno));
-		return -1;
-	}
-	link_speed = ethtool_cmd_speed(&edata);
-	if (link_speed == -1)
-		dev_link.link_speed = 0;
-	else
-		dev_link.link_speed = link_speed;
-	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
-				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
-	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
-			ETH_LINK_SPEED_FIXED);
-	if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
-		/* Link status changed. */
-		dev->data->dev_link = dev_link;
-		return 0;
-	}
-	/* Link status is still the same. */
-	return -1;
-}
-
-static int
-mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
-			    struct rte_pci_addr *pci_addr);
-
-/**
- * DPDK callback to change the MTU.
- *
- * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be
- * received). Use this as a hint to enable/disable scattered packets support
- * and improve performance when not needed.
- * Since failure is not an option, reconfiguring queues on the fly is not
- * recommended.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param in_mtu
- *   New MTU.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-mlx4_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
-{
-	struct priv *priv = dev->data->dev_private;
-	int ret = 0;
-	unsigned int i;
-	uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
-		mlx4_rx_burst;
-
-	if (mlx4_is_secondary())
-		return -E_RTE_SECONDARY;
-	priv_lock(priv);
-	/* Set kernel interface MTU first. */
-	if (priv_set_mtu(priv, mtu)) {
-		ret = errno;
-		WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
-		     strerror(ret));
-		goto out;
-	} else
-		DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
-	priv->mtu = mtu;
-	/* Temporarily replace RX handler with a fake one, assuming it has not
-	 * been copied elsewhere. */
-	dev->rx_pkt_burst = removed_rx_burst;
-	/* Make sure everyone has left mlx4_rx_burst() and uses
-	 * removed_rx_burst() instead. */
-	rte_wmb();
-	usleep(1000);
-	/* Reconfigure each RX queue. */
-	for (i = 0; (i != priv->rxqs_n); ++i) {
-		struct rxq *rxq = (*priv->rxqs)[i];
-		unsigned int max_frame_len;
-
-		if (rxq == NULL)
-			continue;
-		/* Calculate new maximum frame length according to MTU. */
-		max_frame_len = (priv->mtu + ETHER_HDR_LEN +
-				 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
-		/* Provide new values to rxq_setup(). */
-		dev->data->dev_conf.rxmode.jumbo_frame =
-			(max_frame_len > ETHER_MAX_LEN);
-		dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
-		ret = rxq_rehash(dev, rxq);
-		if (ret) {
-			/* Force SP RX if that queue requires it and abort. */
-			if (rxq->sp)
-				rx_func = mlx4_rx_burst_sp;
-			break;
-		}
-		/* Reenable non-RSS queue attributes. No need to check
-		 * for errors at this stage. */
-		if (!priv->rss && !priv->isolated) {
-			rxq_mac_addrs_add(rxq);
-			if (priv->promisc)
-				rxq_promiscuous_enable(rxq);
-			if (priv->allmulti)
-				rxq_allmulticast_enable(rxq);
-		}
-		/* Scattered burst function takes priority. */
-		if (rxq->sp)
-			rx_func = mlx4_rx_burst_sp;
-	}
-	/* Burst functions can now be called again. */
-	rte_wmb();
-	dev->rx_pkt_burst = rx_func;
-out:
-	priv_unlock(priv);
-	assert(ret >= 0);
-	return -ret;
-}
-
-/**
- * DPDK callback to get flow control status.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param[out] fc_conf
- *   Flow control output buffer.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-mlx4_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
-{
-	struct priv *priv = dev->data->dev_private;
-	struct ifreq ifr;
-	struct ethtool_pauseparam ethpause = {
-		.cmd = ETHTOOL_GPAUSEPARAM
-	};
-	int ret;
-
-	if (mlx4_is_secondary())
-		return -E_RTE_SECONDARY;
-	ifr.ifr_data = (void *)&ethpause;
-	priv_lock(priv);
-	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
-		ret = errno;
-		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
-		     " failed: %s",
-		     strerror(ret));
-		goto out;
-	}
-
-	fc_conf->autoneg = ethpause.autoneg;
-	if (ethpause.rx_pause && ethpause.tx_pause)
-		fc_conf->mode = RTE_FC_FULL;
-	else if (ethpause.rx_pause)
-		fc_conf->mode = RTE_FC_RX_PAUSE;
-	else if (ethpause.tx_pause)
-		fc_conf->mode = RTE_FC_TX_PAUSE;
-	else
-		fc_conf->mode = RTE_FC_NONE;
-	ret = 0;
-
-out:
-	priv_unlock(priv);
-	assert(ret >= 0);
-	return -ret;
-}
-
-/**
- * DPDK callback to modify flow control parameters.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param[in] fc_conf
- *   Flow control parameters.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-mlx4_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
-{
-	struct priv *priv = dev->data->dev_private;
-	struct ifreq ifr;
-	struct ethtool_pauseparam ethpause = {
-		.cmd = ETHTOOL_SPAUSEPARAM
-	};
-	int ret;
-
-	if (mlx4_is_secondary())
-		return -E_RTE_SECONDARY;
-	ifr.ifr_data = (void *)&ethpause;
-	ethpause.autoneg = fc_conf->autoneg;
-	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
-	    (fc_conf->mode & RTE_FC_RX_PAUSE))
-		ethpause.rx_pause = 1;
-	else
-		ethpause.rx_pause = 0;
-
-	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
-	    (fc_conf->mode & RTE_FC_TX_PAUSE))
-		ethpause.tx_pause = 1;
-	else
-		ethpause.tx_pause = 0;
-
-	priv_lock(priv);
-	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
-		ret = errno;
-		WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
-		     " failed: %s",
-		     strerror(ret));
-		goto out;
-	}
-	ret = 0;
-
-out:
-	priv_unlock(priv);
-	assert(ret >= 0);
-	return -ret;
-}
-
-/**
- * Configure a VLAN filter.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param vlan_id
- *   VLAN ID to filter.
- * @param on
- *   Toggle filter.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
-{
-	struct priv *priv = dev->data->dev_private;
-	unsigned int i;
-	unsigned int j = -1;
-
-	DEBUG("%p: %s VLAN filter ID %" PRIu16,
-	      (void *)dev, (on ? "enable" : "disable"), vlan_id);
-	for (i = 0; (i != elemof(priv->vlan_filter)); ++i) {
-		if (!priv->vlan_filter[i].enabled) {
-			/* Unused index, remember it. */
-			j = i;
-			continue;
-		}
-		if (priv->vlan_filter[i].id != vlan_id)
-			continue;
-		/* This VLAN ID is already known, use its index. */
-		j = i;
-		break;
-	}
-	/* Check if there's room for another VLAN filter. */
-	if (j == (unsigned int)-1)
-		return ENOMEM;
-	/*
-	 * VLAN filters apply to all configured MAC addresses, flow
-	 * specifications must be reconfigured accordingly.
-	 */
-	priv->vlan_filter[j].id = vlan_id;
-	if ((on) && (!priv->vlan_filter[j].enabled)) {
-		/*
-		 * Filter is disabled, enable it.
-		 * Rehashing flows in all RX queues is necessary.
-		 */
-		if (priv->rss)
-			rxq_mac_addrs_del(LIST_FIRST(&priv->parents));
-		else
-			for (i = 0; (i != priv->rxqs_n); ++i)
-				if ((*priv->rxqs)[i] != NULL)
-					rxq_mac_addrs_del((*priv->rxqs)[i]);
-		priv->vlan_filter[j].enabled = 1;
-		if (priv->started) {
-			if (priv->rss)
-				rxq_mac_addrs_add(LIST_FIRST(&priv->parents));
-			else
-				for (i = 0; (i != priv->rxqs_n); ++i) {
-					if ((*priv->rxqs)[i] == NULL)
-						continue;
-					rxq_mac_addrs_add((*priv->rxqs)[i]);
-				}
-		}
-	} else if ((!on) && (priv->vlan_filter[j].enabled)) {
-		/*
-		 * Filter is enabled, disable it.
-		 * Rehashing flows in all RX queues is necessary.
-		 */
-		if (priv->rss)
-			rxq_mac_addrs_del(LIST_FIRST(&priv->parents));
-		else
-			for (i = 0; (i != priv->rxqs_n); ++i)
-				if ((*priv->rxqs)[i] != NULL)
-					rxq_mac_addrs_del((*priv->rxqs)[i]);
-		priv->vlan_filter[j].enabled = 0;
-		if (priv->started) {
-			if (priv->rss)
-				rxq_mac_addrs_add(LIST_FIRST(&priv->parents));
-			else
-				for (i = 0; (i != priv->rxqs_n); ++i) {
-					if ((*priv->rxqs)[i] == NULL)
-						continue;
-					rxq_mac_addrs_add((*priv->rxqs)[i]);
-				}
-		}
-	}
-	return 0;
-}
-
-/**
- * DPDK callback to configure a VLAN filter.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param vlan_id
- *   VLAN ID to filter.
- * @param on
- *   Toggle filter.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
-{
-	struct priv *priv = dev->data->dev_private;
-	int ret;
-
-	if (mlx4_is_secondary())
-		return -E_RTE_SECONDARY;
-	priv_lock(priv);
-	if (priv->isolated) {
-		DEBUG("%p: cannot set vlan filter, "
-		      "device is in isolated mode", (void *)dev);
-		priv_unlock(priv);
-		return -EINVAL;
-	}
-	ret = vlan_filter_set(dev, vlan_id, on);
-	priv_unlock(priv);
-	assert(ret >= 0);
-	return -ret;
-}
-
-const struct rte_flow_ops mlx4_flow_ops = {
-	.validate = mlx4_flow_validate,
-	.create = mlx4_flow_create,
-	.destroy = mlx4_flow_destroy,
-	.flush = mlx4_flow_flush,
-	.query = NULL,
-	.isolate = mlx4_flow_isolate,
-};
-
-/**
- * Manage filter operations.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param filter_type
- *   Filter type.
- * @param filter_op
- *   Operation to perform.
- * @param arg
- *   Pointer to operation-specific structure.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-mlx4_dev_filter_ctrl(struct rte_eth_dev *dev,
-		     enum rte_filter_type filter_type,
-		     enum rte_filter_op filter_op,
-		     void *arg)
-{
-	int ret = EINVAL;
-
-	switch (filter_type) {
-	case RTE_ETH_FILTER_GENERIC:
-		if (filter_op != RTE_ETH_FILTER_GET)
-			return -EINVAL;
-		*(const void **)arg = &mlx4_flow_ops;
-		return 0;
-	case RTE_ETH_FILTER_FDIR:
-		DEBUG("%p: filter type FDIR is not supported by this PMD",
-		      (void *)dev);
-		break;
-	default:
-		ERROR("%p: filter type (%d) not supported",
-		      (void *)dev, filter_type);
-		break;
-	}
-	return -ret;
-}
-
 static const struct eth_dev_ops mlx4_dev_ops = {
 	.dev_configure = mlx4_dev_configure,
 	.dev_start = mlx4_dev_start,
 	.dev_stop = mlx4_dev_stop,
-	.dev_set_link_down = mlx4_set_link_down,
-	.dev_set_link_up = mlx4_set_link_up,
+	.dev_set_link_down = mlx4_dev_set_link_down,
+	.dev_set_link_up = mlx4_dev_set_link_up,
 	.dev_close = mlx4_dev_close,
+	.link_update = mlx4_link_update,
 	.promiscuous_enable = mlx4_promiscuous_enable,
 	.promiscuous_disable = mlx4_promiscuous_disable,
 	.allmulticast_enable = mlx4_allmulticast_enable,
 	.allmulticast_disable = mlx4_allmulticast_disable,
-	.link_update = mlx4_link_update,
+	.mac_addr_remove = mlx4_mac_addr_remove,
+	.mac_addr_add = mlx4_mac_addr_add,
+	.mac_addr_set = mlx4_mac_addr_set,
 	.stats_get = mlx4_stats_get,
 	.stats_reset = mlx4_stats_reset,
-	.queue_stats_mapping_set = NULL,
 	.dev_infos_get = mlx4_dev_infos_get,
 	.dev_supported_ptypes_get = mlx4_dev_supported_ptypes_get,
 	.vlan_filter_set = mlx4_vlan_filter_set,
-	.vlan_tpid_set = NULL,
-	.vlan_strip_queue_set = NULL,
-	.vlan_offload_set = NULL,
 	.rx_queue_setup = mlx4_rx_queue_setup,
 	.tx_queue_setup = mlx4_tx_queue_setup,
 	.rx_queue_release = mlx4_rx_queue_release,
 	.tx_queue_release = mlx4_tx_queue_release,
-	.dev_led_on = NULL,
-	.dev_led_off = NULL,
-	.flow_ctrl_get = mlx4_dev_get_flow_ctrl,
-	.flow_ctrl_set = mlx4_dev_set_flow_ctrl,
-	.priority_flow_ctrl_set = NULL,
-	.mac_addr_remove = mlx4_mac_addr_remove,
-	.mac_addr_add = mlx4_mac_addr_add,
-	.mac_addr_set = mlx4_mac_addr_set,
-	.mtu_set = mlx4_dev_set_mtu,
-	.filter_ctrl = mlx4_dev_filter_ctrl,
+	.flow_ctrl_get = mlx4_flow_ctrl_get,
+	.flow_ctrl_set = mlx4_flow_ctrl_set,
+	.mtu_set = mlx4_mtu_set,
+	.filter_ctrl = mlx4_filter_ctrl,
 	.rx_queue_intr_enable = mlx4_rx_intr_enable,
 	.rx_queue_intr_disable = mlx4_rx_intr_disable,
 };
@@ -5381,7 +267,7 @@ static const struct eth_dev_ops mlx4_dev_ops = {
  *   PCI bus address output buffer.
  *
  * @return
- *   0 on success, -1 on failure and errno is set.
+ *   0 on success, negative errno value otherwise and rte_errno is set.
  */
 static int
 mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
@@ -5392,8 +278,10 @@ mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
 	MKSTR(path, "%s/device/uevent", device->ibdev_path);
 
 	file = fopen(path, "rb");
-	if (file == NULL)
-		return -1;
+	if (file == NULL) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
 	while (fgets(line, sizeof(line), file) == line) {
 		size_t len = strlen(line);
 		int ret;
@@ -5423,572 +311,48 @@ mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
 }
 
 /**
- * Get MAC address by querying netdevice.
- *
- * @param[in] priv
- *   struct priv for the requested device.
- * @param[out] mac
- *   MAC address output buffer.
- *
- * @return
- *   0 on success, -1 on failure and errno is set.
- */
-static int
-priv_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
-{
-	struct ifreq request;
-
-	if (priv_ifreq(priv, SIOCGIFHWADDR, &request))
-		return -1;
-	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
-	return 0;
-}
-
-/* Support up to 32 adapters. */
-static struct {
-	struct rte_pci_addr pci_addr; /* associated PCI address */
-	uint32_t ports; /* physical ports bitfield. */
-} mlx4_dev[32];
-
-/**
- * Get device index in mlx4_dev[] from PCI bus address.
- *
- * @param[in] pci_addr
- *   PCI bus address to look for.
- *
- * @return
- *   mlx4_dev[] index on success, -1 on failure.
- */
-static int
-mlx4_dev_idx(struct rte_pci_addr *pci_addr)
-{
-	unsigned int i;
-	int ret = -1;
-
-	assert(pci_addr != NULL);
-	for (i = 0; (i != elemof(mlx4_dev)); ++i) {
-		if ((mlx4_dev[i].pci_addr.domain == pci_addr->domain) &&
-		    (mlx4_dev[i].pci_addr.bus == pci_addr->bus) &&
-		    (mlx4_dev[i].pci_addr.devid == pci_addr->devid) &&
-		    (mlx4_dev[i].pci_addr.function == pci_addr->function))
-			return i;
-		if ((mlx4_dev[i].ports == 0) && (ret == -1))
-			ret = i;
-	}
-	return ret;
-}
-
-/**
- * Retrieve integer value from environment variable.
- *
- * @param[in] name
- *   Environment variable name.
- *
- * @return
- *   Integer value, 0 if the variable is not set.
- */
-static int
-mlx4_getenv_int(const char *name)
-{
-	const char *val = getenv(name);
-
-	if (val == NULL)
-		return 0;
-	return atoi(val);
-}
-
-static void
-mlx4_dev_link_status_handler(void *);
-static void
-mlx4_dev_interrupt_handler(void *);
-
-/**
- * Link/device status handler.
- *
- * @param priv
- *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
- * @param events
- *   Pointer to event flags holder.
- *
- * @return
- *   Number of events
- */
-static int
-priv_dev_status_handler(struct priv *priv, struct rte_eth_dev *dev,
-			uint32_t *events)
-{
-	struct ibv_async_event event;
-	int port_change = 0;
-	struct rte_eth_link *link = &dev->data->dev_link;
-	int ret = 0;
-
-	*events = 0;
-	/* Read all message and acknowledge them. */
-	for (;;) {
-		if (ibv_get_async_event(priv->ctx, &event))
-			break;
-		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
-		     event.event_type == IBV_EVENT_PORT_ERR) &&
-		    (priv->intr_conf.lsc == 1)) {
-			port_change = 1;
-			ret++;
-		} else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
-			   priv->intr_conf.rmv == 1) {
-			*events |= (1 << RTE_ETH_EVENT_INTR_RMV);
-			ret++;
-		} else
-			DEBUG("event type %d on port %d not handled",
-			      event.event_type, event.element.port_num);
-		ibv_ack_async_event(&event);
-	}
-	if (!port_change)
-		return ret;
-	mlx4_link_update(dev, 0);
-	if (((link->link_speed == 0) && link->link_status) ||
-	    ((link->link_speed != 0) && !link->link_status)) {
-		if (!priv->pending_alarm) {
-			/* Inconsistent status, check again later. */
-			priv->pending_alarm = 1;
-			rte_eal_alarm_set(MLX4_ALARM_TIMEOUT_US,
-					  mlx4_dev_link_status_handler,
-					  dev);
-		}
-	} else {
-		*events |= (1 << RTE_ETH_EVENT_INTR_LSC);
-	}
-	return ret;
-}
-
-/**
- * Handle delayed link status event.
- *
- * @param arg
- *   Registered argument.
- */
-static void
-mlx4_dev_link_status_handler(void *arg)
-{
-	struct rte_eth_dev *dev = arg;
-	struct priv *priv = dev->data->dev_private;
-	uint32_t events;
-	int ret;
-
-	priv_lock(priv);
-	assert(priv->pending_alarm == 1);
-	priv->pending_alarm = 0;
-	ret = priv_dev_status_handler(priv, dev, &events);
-	priv_unlock(priv);
-	if (ret > 0 && events & (1 << RTE_ETH_EVENT_INTR_LSC))
-		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
-					      NULL);
-}
-
-/**
- * Handle interrupts from the NIC.
- *
- * @param[in] intr_handle
- *   Interrupt handler.
- * @param cb_arg
- *   Callback argument.
- */
-static void
-mlx4_dev_interrupt_handler(void *cb_arg)
-{
-	struct rte_eth_dev *dev = cb_arg;
-	struct priv *priv = dev->data->dev_private;
-	int ret;
-	uint32_t ev;
-	int i;
-
-	priv_lock(priv);
-	ret = priv_dev_status_handler(priv, dev, &ev);
-	priv_unlock(priv);
-	if (ret > 0) {
-		for (i = RTE_ETH_EVENT_UNKNOWN;
-		     i < RTE_ETH_EVENT_MAX;
-		     i++) {
-			if (ev & (1 << i)) {
-				ev &= ~(1 << i);
-				_rte_eth_dev_callback_process(dev, i, NULL,
-							      NULL);
-				ret--;
-			}
-		}
-		if (ret)
-			WARN("%d event%s not processed", ret,
-			     (ret > 1 ? "s were" : " was"));
-	}
-}
-
-/**
- * Uninstall interrupt handler.
- *
- * @param priv
- *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
-{
-	int ret;
-
-	if (priv->intr_conf.lsc ||
-	    priv->intr_conf.rmv)
-		return 0;
-	ret = rte_intr_callback_unregister(&priv->intr_handle,
-					   mlx4_dev_interrupt_handler,
-					   dev);
-	if (ret < 0) {
-		ERROR("rte_intr_callback_unregister failed with %d"
-		      "%s%s%s", ret,
-		      (errno ? " (errno: " : ""),
-		      (errno ? strerror(errno) : ""),
-		      (errno ? ")" : ""));
-	}
-	priv->intr_handle.fd = 0;
-	priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
-	return ret;
-}
-
-/**
- * Install interrupt handler.
- *
- * @param priv
- *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
- * @return
- *   0 on success, negative errno value on failure.
- */
-static int
-priv_dev_interrupt_handler_install(struct priv *priv,
-				   struct rte_eth_dev *dev)
-{
-	int flags;
-	int rc;
-
-	/* Check whether the interrupt handler has already been installed
-	 * for either type of interrupt
-	 */
-	if (priv->intr_conf.lsc &&
-	    priv->intr_conf.rmv &&
-	    priv->intr_handle.fd)
-		return 0;
-	assert(priv->ctx->async_fd > 0);
-	flags = fcntl(priv->ctx->async_fd, F_GETFL);
-	rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
-	if (rc < 0) {
-		INFO("failed to change file descriptor async event queue");
-		dev->data->dev_conf.intr_conf.lsc = 0;
-		dev->data->dev_conf.intr_conf.rmv = 0;
-		return -errno;
-	} else {
-		priv->intr_handle.fd = priv->ctx->async_fd;
-		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
-		rc = rte_intr_callback_register(&priv->intr_handle,
-						 mlx4_dev_interrupt_handler,
-						 dev);
-		if (rc) {
-			ERROR("rte_intr_callback_register failed "
-			      " (errno: %s)", strerror(errno));
-			return rc;
-		}
-	}
-	return 0;
-}
-
-/**
- * Uninstall interrupt handler.
- *
- * @param priv
- *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
- * @return
- *   0 on success, negative value on error.
- */
-static int
-priv_dev_removal_interrupt_handler_uninstall(struct priv *priv,
-					    struct rte_eth_dev *dev)
-{
-	if (dev->data->dev_conf.intr_conf.rmv) {
-		priv->intr_conf.rmv = 0;
-		return priv_dev_interrupt_handler_uninstall(priv, dev);
-	}
-	return 0;
-}
-
-/**
- * Uninstall interrupt handler.
- *
- * @param priv
- *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
- * @return
- *   0 on success, negative value on error,
- */
-static int
-priv_dev_link_interrupt_handler_uninstall(struct priv *priv,
-					  struct rte_eth_dev *dev)
-{
-	int ret = 0;
-
-	if (dev->data->dev_conf.intr_conf.lsc) {
-		priv->intr_conf.lsc = 0;
-		ret = priv_dev_interrupt_handler_uninstall(priv, dev);
-		if (ret)
-			return ret;
-	}
-	if (priv->pending_alarm)
-		if (rte_eal_alarm_cancel(mlx4_dev_link_status_handler,
-					 dev)) {
-			ERROR("rte_eal_alarm_cancel failed "
-			      " (errno: %s)", strerror(rte_errno));
-			return -rte_errno;
-		}
-	priv->pending_alarm = 0;
-	return 0;
-}
-
-/**
- * Install link interrupt handler.
- *
- * @param priv
- *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
- * @return
- *   0 on success, negative value on error.
- */
-static int
-priv_dev_link_interrupt_handler_install(struct priv *priv,
-					struct rte_eth_dev *dev)
-{
-	int ret;
-
-	if (dev->data->dev_conf.intr_conf.lsc) {
-		ret = priv_dev_interrupt_handler_install(priv, dev);
-		if (ret)
-			return ret;
-		priv->intr_conf.lsc = 1;
-	}
-	return 0;
-}
-
-/**
- * Install removal interrupt handler.
- *
- * @param priv
- *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
- * @return
- *   0 on success, negative value on error.
- */
-static int
-priv_dev_removal_interrupt_handler_install(struct priv *priv,
-					   struct rte_eth_dev *dev)
-{
-	int ret;
-
-	if (dev->data->dev_conf.intr_conf.rmv) {
-		ret = priv_dev_interrupt_handler_install(priv, dev);
-		if (ret)
-			return ret;
-		priv->intr_conf.rmv = 1;
-	}
-	return 0;
-}
-
-/**
- * Allocate queue vector and fill epoll fd list for Rx interrupts.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   0 on success, negative on failure.
- */
-static int
-priv_rx_intr_vec_enable(struct priv *priv)
-{
-	unsigned int i;
-	unsigned int rxqs_n = priv->rxqs_n;
-	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
-	unsigned int count = 0;
-	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
-
-	if (!priv->dev->data->dev_conf.intr_conf.rxq)
-		return 0;
-	priv_rx_intr_vec_disable(priv);
-	intr_handle->intr_vec = malloc(sizeof(intr_handle->intr_vec[rxqs_n]));
-	if (intr_handle->intr_vec == NULL) {
-		ERROR("failed to allocate memory for interrupt vector,"
-		      " Rx interrupts will not be supported");
-		return -ENOMEM;
-	}
-	intr_handle->type = RTE_INTR_HANDLE_EXT;
-	for (i = 0; i != n; ++i) {
-		struct rxq *rxq = (*priv->rxqs)[i];
-		int fd;
-		int flags;
-		int rc;
-
-		/* Skip queues that cannot request interrupts. */
-		if (!rxq || !rxq->channel) {
-			/* Use invalid intr_vec[] index to disable entry. */
-			intr_handle->intr_vec[i] =
-				RTE_INTR_VEC_RXTX_OFFSET +
-				RTE_MAX_RXTX_INTR_VEC_ID;
-			continue;
-		}
-		if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
-			ERROR("too many Rx queues for interrupt vector size"
-			      " (%d), Rx interrupts cannot be enabled",
-			      RTE_MAX_RXTX_INTR_VEC_ID);
-			priv_rx_intr_vec_disable(priv);
-			return -1;
-		}
-		fd = rxq->channel->fd;
-		flags = fcntl(fd, F_GETFL);
-		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
-		if (rc < 0) {
-			ERROR("failed to make Rx interrupt file descriptor"
-			      " %d non-blocking for queue index %d", fd, i);
-			priv_rx_intr_vec_disable(priv);
-			return rc;
-		}
-		intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
-		intr_handle->efds[count] = fd;
-		count++;
-	}
-	if (!count)
-		priv_rx_intr_vec_disable(priv);
-	else
-		intr_handle->nb_efd = count;
-	return 0;
-}
-
-/**
- * Clean up Rx interrupts handler.
- *
- * @param priv
- *   Pointer to private structure.
- */
-static void
-priv_rx_intr_vec_disable(struct priv *priv)
-{
-	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
-
-	rte_intr_free_epoll_fd(intr_handle);
-	free(intr_handle->intr_vec);
-	intr_handle->nb_efd = 0;
-	intr_handle->intr_vec = NULL;
-}
-
-/**
- * DPDK callback for Rx queue interrupt enable.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param idx
- *   Rx queue index.
- *
- * @return
- *   0 on success, negative on failure.
- */
-static int
-mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx)
-{
-	struct priv *priv = dev->data->dev_private;
-	struct rxq *rxq = (*priv->rxqs)[idx];
-	int ret;
-
-	if (!rxq || !rxq->channel)
-		ret = EINVAL;
-	else
-		ret = ibv_req_notify_cq(rxq->cq, 0);
-	if (ret)
-		WARN("unable to arm interrupt on rx queue %d", idx);
-	return -ret;
-}
-
-/**
- * DPDK callback for Rx queue interrupt disable.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param idx
- *   Rx queue index.
- *
- * @return
- *   0 on success, negative on failure.
- */
-static int
-mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx)
-{
-	struct priv *priv = dev->data->dev_private;
-	struct rxq *rxq = (*priv->rxqs)[idx];
-	struct ibv_cq *ev_cq;
-	void *ev_ctx;
-	int ret;
-
-	if (!rxq || !rxq->channel) {
-		ret = EINVAL;
-	} else {
-		ret = ibv_get_cq_event(rxq->cq->channel, &ev_cq, &ev_ctx);
-		if (ret || ev_cq != rxq->cq)
-			ret = EINVAL;
-	}
-	if (ret)
-		WARN("unable to disable interrupt on rx queue %d",
-		     idx);
-	else
-		ibv_ack_cq_events(rxq->cq, 1);
-	return -ret;
-}
-
-/**
  * Verify and store value for device argument.
  *
  * @param[in] key
  *   Key argument to verify.
  * @param[in] val
  *   Value associated with key.
- * @param out
- *   User data.
+ * @param[in, out] conf
+ *   Shared configuration data.
  *
  * @return
- *   0 on success, negative errno value on failure.
+ *   0 on success, negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx4_arg_parse(const char *key, const char *val, void *out)
+mlx4_arg_parse(const char *key, const char *val, struct mlx4_conf *conf)
 {
-	struct mlx4_conf *conf = out;
 	unsigned long tmp;
 
 	errno = 0;
 	tmp = strtoul(val, NULL, 0);
 	if (errno) {
+		rte_errno = errno;
 		WARN("%s: \"%s\" is not a valid integer", key, val);
-		return -errno;
+		return -rte_errno;
 	}
 	if (strcmp(MLX4_PMD_PORT_KVARG, key) == 0) {
-		if (tmp >= MLX4_PMD_MAX_PHYS_PORTS) {
-			ERROR("invalid port index %lu (max: %u)",
-				tmp, MLX4_PMD_MAX_PHYS_PORTS - 1);
+		uint32_t ports = rte_log2_u32(conf->ports.present + 1);
+
+		if (tmp >= ports) {
+			ERROR("port index %lu outside range [0,%" PRIu32 ")",
+			      tmp, ports);
-			return -EINVAL;
+			return -(rte_errno = EINVAL);
 		}
-		conf->active_ports |= 1 << tmp;
+		if (!(conf->ports.present & (UINT32_C(1) << tmp))) {
+			rte_errno = EINVAL;
+			ERROR("invalid port index %lu", tmp);
+			return -rte_errno;
+		}
+		conf->ports.enabled |= UINT32_C(1) << tmp;
 	} else {
+		rte_errno = EINVAL;
 		WARN("%s: unknown parameter", key);
-		return -EINVAL;
+		return -rte_errno;
 	}
 	return 0;
 }
@@ -6000,7 +364,7 @@ mlx4_arg_parse(const char *key, const char *val, void *out)
  *   Device arguments structure.
  *
  * @return
- *   0 on success, negative errno value on failure.
+ *   0 on success, negative errno value otherwise and rte_errno is set.
  */
 static int
 mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf)
@@ -6014,15 +378,21 @@ mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf)
 		return 0;
 	kvlist = rte_kvargs_parse(devargs->args, pmd_mlx4_init_params);
 	if (kvlist == NULL) {
+		rte_errno = EINVAL;
 		ERROR("failed to parse kvargs");
-		return -EINVAL;
+		return -rte_errno;
 	}
 	/* Process parameters. */
 	for (i = 0; pmd_mlx4_init_params[i]; ++i) {
 		arg_count = rte_kvargs_count(kvlist, MLX4_PMD_PORT_KVARG);
 		while (arg_count-- > 0) {
-			ret = rte_kvargs_process(kvlist, MLX4_PMD_PORT_KVARG,
-					mlx4_arg_parse, conf);
+			ret = rte_kvargs_process(kvlist,
+						 MLX4_PMD_PORT_KVARG,
+						 (int (*)(const char *,
+							  const char *,
+							  void *))
+						 mlx4_arg_parse,
+						 conf);
 			if (ret != 0)
 				goto free_kvlist;
 		}
@@ -6046,7 +416,7 @@ static struct rte_pci_driver mlx4_driver;
  *   PCI device information.
  *
  * @return
- *   0 on success, negative errno value on failure.
+ *   0 on success, negative errno value otherwise and rte_errno is set.
  */
 static int
 mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
@@ -6057,30 +427,20 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 	struct ibv_context *attr_ctx = NULL;
 	struct ibv_device_attr device_attr;
 	struct mlx4_conf conf = {
-		.active_ports = 0,
+		.ports.present = 0,
 	};
 	unsigned int vf;
-	int idx;
 	int i;
 
 	(void)pci_drv;
 	assert(pci_drv == &mlx4_driver);
-	/* Get mlx4_dev[] index. */
-	idx = mlx4_dev_idx(&pci_dev->addr);
-	if (idx == -1) {
-		ERROR("this driver cannot support any more adapters");
-		return -ENOMEM;
-	}
-	DEBUG("using driver device index %d", idx);
-
-	/* Save PCI address. */
-	mlx4_dev[idx].pci_addr = pci_dev->addr;
 	list = ibv_get_device_list(&i);
 	if (list == NULL) {
-		assert(errno);
-		if (errno == ENOSYS)
+		rte_errno = errno;
+		assert(rte_errno);
+		if (rte_errno == ENOSYS)
 			ERROR("cannot list devices, is ib_uverbs loaded?");
-		return -errno;
+		return -rte_errno;
 	}
 	assert(i >= 0);
 	/*
@@ -6111,190 +471,112 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		ibv_free_device_list(list);
 		switch (err) {
 		case 0:
+			rte_errno = ENODEV;
 			ERROR("cannot access device, is mlx4_ib loaded?");
-			return -ENODEV;
+			return -rte_errno;
 		case EINVAL:
+			rte_errno = EINVAL;
 			ERROR("cannot use device, are drivers up to date?");
-			return -EINVAL;
+			return -rte_errno;
 		}
 		assert(err > 0);
-		return -err;
+		rte_errno = err;
+		return -rte_errno;
 	}
 	ibv_dev = list[i];
-
 	DEBUG("device opened");
 	if (ibv_query_device(attr_ctx, &device_attr)) {
-		err = ENODEV;
+		rte_errno = ENODEV;
 		goto error;
 	}
 	INFO("%u port(s) detected", device_attr.phys_port_cnt);
-
+	conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1;
 	if (mlx4_args(pci_dev->device.devargs, &conf)) {
 		ERROR("failed to process device arguments");
-		err = EINVAL;
+		rte_errno = EINVAL;
 		goto error;
 	}
 	/* Use all ports when none are defined */
-	if (conf.active_ports == 0) {
-		for (i = 0; i < MLX4_PMD_MAX_PHYS_PORTS; i++)
-			conf.active_ports |= 1 << i;
-	}
+	if (!conf.ports.enabled)
+		conf.ports.enabled = conf.ports.present;
 	for (i = 0; i < device_attr.phys_port_cnt; i++) {
 		uint32_t port = i + 1; /* ports are indexed from one */
-		uint32_t test = (1 << i);
 		struct ibv_context *ctx = NULL;
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
 		struct rte_eth_dev *eth_dev = NULL;
-#ifdef HAVE_EXP_QUERY_DEVICE
-		struct ibv_exp_device_attr exp_device_attr;
-#endif /* HAVE_EXP_QUERY_DEVICE */
 		struct ether_addr mac;
 
-		/* If port is not active, skip. */
-		if (!(conf.active_ports & (1 << i)))
+		/* If port is not enabled, skip. */
+		if (!(conf.ports.enabled & (1 << i)))
 			continue;
-#ifdef HAVE_EXP_QUERY_DEVICE
-		exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS;
-#ifdef RSS_SUPPORT
-		exp_device_attr.comp_mask |= IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ;
-#endif /* RSS_SUPPORT */
-#endif /* HAVE_EXP_QUERY_DEVICE */
-
-		DEBUG("using port %u (%08" PRIx32 ")", port, test);
-
+		DEBUG("using port %u", port);
 		ctx = ibv_open_device(ibv_dev);
 		if (ctx == NULL) {
-			err = ENODEV;
+			rte_errno = ENODEV;
 			goto port_error;
 		}
-
 		/* Check port status. */
 		err = ibv_query_port(ctx, port, &port_attr);
 		if (err) {
-			ERROR("port query failed: %s", strerror(err));
-			err = ENODEV;
+			rte_errno = err;
+			ERROR("port query failed: %s", strerror(rte_errno));
 			goto port_error;
 		}
-
 		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+			rte_errno = ENOTSUP;
 			ERROR("port %d is not configured in Ethernet mode",
 			      port);
-			err = EINVAL;
 			goto port_error;
 		}
-
 		if (port_attr.state != IBV_PORT_ACTIVE)
 			DEBUG("port %d is not active: \"%s\" (%d)",
 			      port, ibv_port_state_str(port_attr.state),
 			      port_attr.state);
-
+		/* Make asynchronous FD non-blocking to handle interrupts. */
+		if (mlx4_fd_set_non_blocking(ctx->async_fd) < 0) {
+			ERROR("cannot make asynchronous FD non-blocking: %s",
+			      strerror(rte_errno));
+			goto port_error;
+		}
 		/* Allocate protection domain. */
 		pd = ibv_alloc_pd(ctx);
 		if (pd == NULL) {
+			rte_errno = ENOMEM;
 			ERROR("PD allocation failure");
-			err = ENOMEM;
 			goto port_error;
 		}
-
-		mlx4_dev[idx].ports |= test;
-
 		/* from rte_ethdev.c */
 		priv = rte_zmalloc("ethdev private structure",
 				   sizeof(*priv),
 				   RTE_CACHE_LINE_SIZE);
 		if (priv == NULL) {
+			rte_errno = ENOMEM;
 			ERROR("priv allocation failure");
-			err = ENOMEM;
 			goto port_error;
 		}
-
 		priv->ctx = ctx;
 		priv->device_attr = device_attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
-#ifdef HAVE_EXP_QUERY_DEVICE
-		if (ibv_exp_query_device(ctx, &exp_device_attr)) {
-			ERROR("ibv_exp_query_device() failed");
-			err = ENODEV;
-			goto port_error;
-		}
-#ifdef RSS_SUPPORT
-		if ((exp_device_attr.exp_device_cap_flags &
-		     IBV_EXP_DEVICE_QPG) &&
-		    (exp_device_attr.exp_device_cap_flags &
-		     IBV_EXP_DEVICE_UD_RSS) &&
-		    (exp_device_attr.comp_mask &
-		     IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ) &&
-		    (exp_device_attr.max_rss_tbl_sz > 0)) {
-			priv->hw_qpg = 1;
-			priv->hw_rss = 1;
-			priv->max_rss_tbl_sz = exp_device_attr.max_rss_tbl_sz;
-		} else {
-			priv->hw_qpg = 0;
-			priv->hw_rss = 0;
-			priv->max_rss_tbl_sz = 0;
-		}
-		priv->hw_tss = !!(exp_device_attr.exp_device_cap_flags &
-				  IBV_EXP_DEVICE_UD_TSS);
-		DEBUG("device flags: %s%s%s",
-		      (priv->hw_qpg ? "IBV_DEVICE_QPG " : ""),
-		      (priv->hw_tss ? "IBV_DEVICE_TSS " : ""),
-		      (priv->hw_rss ? "IBV_DEVICE_RSS " : ""));
-		if (priv->hw_rss)
-			DEBUG("maximum RSS indirection table size: %u",
-			      exp_device_attr.max_rss_tbl_sz);
-#endif /* RSS_SUPPORT */
-
-		priv->hw_csum =
-			((exp_device_attr.exp_device_cap_flags &
-			  IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT) &&
-			 (exp_device_attr.exp_device_cap_flags &
-			  IBV_EXP_DEVICE_RX_CSUM_IP_PKT));
+		priv->vf = vf;
+		priv->hw_csum =	!!(device_attr.device_cap_flags &
+				   IBV_DEVICE_RAW_IP_CSUM);
 		DEBUG("checksum offloading is %ssupported",
 		      (priv->hw_csum ? "" : "not "));
-
-		priv->hw_csum_l2tun = !!(exp_device_attr.exp_device_cap_flags &
-					 IBV_EXP_DEVICE_VXLAN_SUPPORT);
+		/* Only ConnectX-3 Pro supports tunneling. */
+		priv->hw_csum_l2tun =
+			priv->hw_csum &&
+			(device_attr.vendor_part_id ==
+			 PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO);
 		DEBUG("L2 tunnel checksum offloads are %ssupported",
 		      (priv->hw_csum_l2tun ? "" : "not "));
-
-#ifdef INLINE_RECV
-		priv->inl_recv_size = mlx4_getenv_int("MLX4_INLINE_RECV_SIZE");
-
-		if (priv->inl_recv_size) {
-			exp_device_attr.comp_mask =
-				IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ;
-			if (ibv_exp_query_device(ctx, &exp_device_attr)) {
-				INFO("Couldn't query device for inline-receive"
-				     " capabilities.");
-				priv->inl_recv_size = 0;
-			} else {
-				if ((unsigned)exp_device_attr.inline_recv_sz <
-				    priv->inl_recv_size) {
-					INFO("Max inline-receive (%d) <"
-					     " requested inline-receive (%u)",
-					     exp_device_attr.inline_recv_sz,
-					     priv->inl_recv_size);
-					priv->inl_recv_size =
-						exp_device_attr.inline_recv_sz;
-				}
-			}
-			INFO("Set inline receive size to %u",
-			     priv->inl_recv_size);
-		}
-#endif /* INLINE_RECV */
-#endif /* HAVE_EXP_QUERY_DEVICE */
-
-		(void)mlx4_getenv_int;
-		priv->vf = vf;
 		/* Configure the first MAC address by default. */
-		if (priv_get_mac(priv, &mac.addr_bytes)) {
+		if (mlx4_get_mac(priv, &mac.addr_bytes)) {
 			ERROR("cannot get MAC address, is mlx4_en loaded?"
-			      " (errno: %s)", strerror(errno));
-			err = ENODEV;
+			      " (rte_errno: %s)", strerror(rte_errno));
 			goto port_error;
 		}
 		INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
@@ -6302,18 +584,13 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		     mac.addr_bytes[0], mac.addr_bytes[1],
 		     mac.addr_bytes[2], mac.addr_bytes[3],
 		     mac.addr_bytes[4], mac.addr_bytes[5]);
-		/* Register MAC and broadcast addresses. */
-		claim_zero(priv_mac_addr_add(priv, 0,
-					     (const uint8_t (*)[ETHER_ADDR_LEN])
-					     mac.addr_bytes));
-		claim_zero(priv_mac_addr_add(priv, (elemof(priv->mac) - 1),
-					     &(const uint8_t [ETHER_ADDR_LEN])
-					     { "\xff\xff\xff\xff\xff\xff" }));
+		/* Register MAC address. */
+		priv->mac[0] = mac;
 #ifndef NDEBUG
 		{
 			char ifname[IF_NAMESIZE];
 
-			if (priv_get_ifname(priv, &ifname) == 0)
+			if (mlx4_get_ifname(priv, &ifname) == 0)
 				DEBUG("port %u ifname is \"%s\"",
 				      priv->port, ifname);
 			else
@@ -6321,9 +598,8 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		}
 #endif
 		/* Get actual MTU if possible. */
-		priv_get_mtu(priv, &priv->mtu);
+		mlx4_mtu_get(priv, &priv->mtu);
 		DEBUG("port %u MTU is %u", priv->port, priv->mtu);
-
 		/* from rte_ethdev.c */
 		{
 			char name[RTE_ETH_NAME_MAX_LEN];
@@ -6334,67 +610,41 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		}
 		if (eth_dev == NULL) {
 			ERROR("can not allocate rte ethdev");
-			err = ENOMEM;
+			rte_errno = ENOMEM;
 			goto port_error;
 		}
-
-		/* Secondary processes have to use local storage for their
-		 * private data as well as a copy of eth_dev->data, but this
-		 * pointer must not be modified before burst functions are
-		 * actually called. */
-		if (mlx4_is_secondary()) {
-			struct mlx4_secondary_data *sd =
-				&mlx4_secondary_data[eth_dev->data->port_id];
-
-			sd->primary_priv = eth_dev->data->dev_private;
-			if (sd->primary_priv == NULL) {
-				ERROR("no private data for port %u",
-				      eth_dev->data->port_id);
-				err = EINVAL;
-				goto port_error;
-			}
-			sd->shared_dev_data = eth_dev->data;
-			rte_spinlock_init(&sd->lock);
-			memcpy(sd->data.name, sd->shared_dev_data->name,
-			       sizeof(sd->data.name));
-			sd->data.dev_private = priv;
-			sd->data.rx_mbuf_alloc_failed = 0;
-			sd->data.mtu = ETHER_MTU;
-			sd->data.port_id = sd->shared_dev_data->port_id;
-			sd->data.mac_addrs = priv->mac;
-			eth_dev->tx_pkt_burst = mlx4_tx_burst_secondary_setup;
-			eth_dev->rx_pkt_burst = mlx4_rx_burst_secondary_setup;
-		} else {
-			eth_dev->data->dev_private = priv;
-			eth_dev->data->mac_addrs = priv->mac;
-		}
+		eth_dev->data->dev_private = priv;
+		eth_dev->data->mac_addrs = priv->mac;
 		eth_dev->device = &pci_dev->device;
-
 		rte_eth_copy_pci_info(eth_dev, pci_dev);
-
 		eth_dev->device->driver = &mlx4_driver.driver;
-
+		/* Initialize local interrupt handle for current port. */
+		priv->intr_handle = (struct rte_intr_handle){
+			.fd = -1,
+			.type = RTE_INTR_HANDLE_EXT,
+		};
 		/*
-		 * Copy and override interrupt handle to prevent it from
-		 * being shared between all ethdev instances of a given PCI
-		 * device. This is required to properly handle Rx interrupts
-		 * on all ports.
+		 * Override ethdev interrupt handle pointer with private
+		 * handle instead of that of the parent PCI device used by
+		 * default. This prevents it from being shared between all
+		 * ports of the same PCI device since each of them is
+		 * associated with its own Verbs context.
+		 *
+		 * Rx interrupts in particular require this as the PMD has
+		 * no control over the registration of queue interrupts
+		 * besides setting up eth_dev->intr_handle, the rest is
+		 * handled by rte_intr_rx_ctl().
 		 */
-		priv->intr_handle_dev = *eth_dev->intr_handle;
-		eth_dev->intr_handle = &priv->intr_handle_dev;
-
+		eth_dev->intr_handle = &priv->intr_handle;
 		priv->dev = eth_dev;
 		eth_dev->dev_ops = &mlx4_dev_ops;
-		eth_dev->data->dev_flags |= RTE_ETH_DEV_DETACHABLE;
-
 		/* Bring Ethernet device up. */
 		DEBUG("forcing Ethernet interface up");
-		priv_set_flags(priv, ~IFF_UP, IFF_UP);
+		mlx4_dev_set_link_up(priv->dev);
 		/* Update link status once if waiting for LSC. */
 		if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
 			mlx4_link_update(eth_dev, 0);
 		continue;
-
 port_error:
 		rte_free(priv);
 		if (pd)
@@ -6405,27 +655,21 @@ port_error:
 			rte_eth_dev_release_port(eth_dev);
 		break;
 	}
-
+	if (i == device_attr.phys_port_cnt)
+		return 0;
 	/*
 	 * XXX if something went wrong in the loop above, there is a resource
 	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
 	 * long as the dpdk does not provide a way to deallocate a ethdev and a
 	 * way to enumerate the registered ethdevs to free the previous ones.
 	 */
-
-	/* no port found, complain */
-	if (!mlx4_dev[idx].ports) {
-		err = ENODEV;
-		goto error;
-	}
-
 error:
 	if (attr_ctx)
 		claim_zero(ibv_close_device(attr_ctx));
 	if (list)
 		ibv_free_device_list(list);
-	assert(err >= 0);
-	return -err;
+	assert(rte_errno >= 0);
+	return -rte_errno;
 }
 
 static const struct rte_pci_id mlx4_pci_id_map[] = {
@@ -6463,7 +707,6 @@ RTE_INIT(rte_mlx4_pmd_init);
 static void
 rte_mlx4_pmd_init(void)
 {
-	RTE_BUILD_BUG_ON(sizeof(wr_id_t) != sizeof(uint64_t));
 	/*
 	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
 	 * huge pages. Calling ibv_fork_init() during init allows
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index c0ade4f1..3aeef87e 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -1,8 +1,8 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright 2012-2017 6WIND S.A.
- *   Copyright 2012-2017 Mellanox.
+ *   Copyright 2012 6WIND S.A.
+ *   Copyright 2012 Mellanox
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -34,29 +34,11 @@
 #ifndef RTE_PMD_MLX4_H_
 #define RTE_PMD_MLX4_H_
 
-#include <stddef.h>
+#include <net/if.h>
 #include <stdint.h>
-#include <limits.h>
+#include <sys/queue.h>
 
-/*
- * Runtime logging through RTE_LOG() is enabled when not in debugging mode.
- * Intermediate LOG_*() macros add the required end-of-line characters.
- */
-#ifndef NDEBUG
-#define INFO(...) DEBUG(__VA_ARGS__)
-#define WARN(...) DEBUG(__VA_ARGS__)
-#define ERROR(...) DEBUG(__VA_ARGS__)
-#else
-#define LOG__(level, m, ...) \
-	RTE_LOG(level, PMD, MLX4_DRIVER_NAME ": " m "%c", __VA_ARGS__)
-#define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n')
-#define INFO(...) LOG_(INFO, __VA_ARGS__)
-#define WARN(...) LOG_(WARNING, __VA_ARGS__)
-#define ERROR(...) LOG_(ERR, __VA_ARGS__)
-#endif
-
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+/* Verbs headers do not support -pedantic. */
 #ifdef PEDANTIC
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
@@ -65,36 +47,25 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/*
- * Maximum number of simultaneous MAC addresses supported.
- *
- * According to ConnectX's Programmer Reference Manual:
- *   The L2 Address Match is implemented by comparing a MAC/VLAN combination
- *   of 128 MAC addresses and 127 VLAN values, comprising 128x127 possible
- *   L2 addresses.
- */
-#define MLX4_MAX_MAC_ADDRESSES 128
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_interrupts.h>
+#include <rte_mempool.h>
+#include <rte_spinlock.h>
 
-/* Maximum number of simultaneous VLAN filters supported. See above. */
-#define MLX4_MAX_VLAN_IDS 127
+/** Maximum number of simultaneous MAC addresses. This value is arbitrary. */
+#define MLX4_MAX_MAC_ADDRESSES 128
 
-/* Request send completion once in every 64 sends, might be less. */
+/** Request send completion once in every 64 sends, might be less. */
 #define MLX4_PMD_TX_PER_COMP_REQ 64
 
-/* Maximum number of physical ports. */
-#define MLX4_PMD_MAX_PHYS_PORTS 2
-
-/* Maximum number of Scatter/Gather Elements per Work Request. */
-#ifndef MLX4_PMD_SGE_WR_N
-#define MLX4_PMD_SGE_WR_N 4
-#endif
-
-/* Maximum size for inline data. */
-#ifndef MLX4_PMD_MAX_INLINE
+/** Maximum size for inline data. */
 #define MLX4_PMD_MAX_INLINE 0
-#endif
 
-/*
+/** Fixed RSS hash key size in bytes. Cannot be modified. */
+#define MLX4_RSS_HASH_KEY_SIZE 40
+
+/**
  * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
  * from which buffers are to be transmitted will have to be mapped by this
  * driver to their own Memory Region (MR). This is a slow operation.
@@ -105,18 +76,10 @@
 #define MLX4_PMD_TX_MP_CACHE 8
 #endif
 
-/*
- * If defined, only use software counters. The PMD will never ask the hardware
- * for these, and many of them won't be available.
- */
-#ifndef MLX4_PMD_SOFT_COUNTERS
-#define MLX4_PMD_SOFT_COUNTERS 1
-#endif
-
-/* Alarm timeout. */
-#define MLX4_ALARM_TIMEOUT_US 100000
+/** Interrupt alarm timeout value in microseconds. */
+#define MLX4_INTR_ALARM_TIMEOUT 100000
 
-/* Port parameter. */
+/** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
 enum {
@@ -129,258 +92,92 @@ enum {
 	PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
 };
 
+/** Driver name reported to lower layers and used in log output. */
 #define MLX4_DRIVER_NAME "net_mlx4"
 
-/* Bit-field manipulation. */
-#define BITFIELD_DECLARE(bf, type, size)				\
-	type bf[(((size_t)(size) / (sizeof(type) * CHAR_BIT)) +		\
-		 !!((size_t)(size) % (sizeof(type) * CHAR_BIT)))]
-#define BITFIELD_DEFINE(bf, type, size)					\
-	BITFIELD_DECLARE((bf), type, (size)) = { 0 }
-#define BITFIELD_SET(bf, b)						\
-	(assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)),			\
-	 (void)((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] |=		\
-		((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT)))))
-#define BITFIELD_RESET(bf, b)						\
-	(assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)),			\
-	 (void)((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] &=		\
-		~((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT)))))
-#define BITFIELD_ISSET(bf, b)						\
-	(assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)),			\
-	 !!(((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] &		\
-	     ((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT))))))
-
-/* Number of elements in array. */
-#define elemof(a) (sizeof(a) / sizeof((a)[0]))
-
-/* Cast pointer p to structure member m to its parent structure of type t. */
-#define containerof(p, t, m) ((t *)((uint8_t *)(p) - offsetof(t, m)))
-
-/* Branch prediction helpers. */
-#ifndef likely
-#define likely(c) __builtin_expect(!!(c), 1)
-#endif
-#ifndef unlikely
-#define unlikely(c) __builtin_expect(!!(c), 0)
-#endif
-
-/* Debugging */
-#ifndef NDEBUG
-#include <stdio.h>
-#define DEBUG__(m, ...)						\
-	(fprintf(stderr, "%s:%d: %s(): " m "%c",		\
-		 __FILE__, __LINE__, __func__, __VA_ARGS__),	\
-	 fflush(stderr),					\
-	 (void)0)
-/*
- * Save/restore errno around DEBUG__().
- * XXX somewhat undefined behavior, but works.
- */
-#define DEBUG_(...)				\
-	(errno = ((int []){			\
-		*(volatile int *)&errno,	\
-		(DEBUG__(__VA_ARGS__), 0)	\
-	})[0])
-#define DEBUG(...) DEBUG_(__VA_ARGS__, '\n')
-#ifndef MLX4_PMD_DEBUG_BROKEN_VERBS
-#define claim_zero(...) assert((__VA_ARGS__) == 0)
-#else /* MLX4_PMD_DEBUG_BROKEN_VERBS */
-#define claim_zero(...) \
-	(void)(((__VA_ARGS__) == 0) || \
-		DEBUG("Assertion `(" # __VA_ARGS__ ") == 0' failed (IGNORED)."))
-#endif /* MLX4_PMD_DEBUG_BROKEN_VERBS */
-#define claim_nonzero(...) assert((__VA_ARGS__) != 0)
-#define claim_positive(...) assert((__VA_ARGS__) >= 0)
-#else /* NDEBUG */
-/* No-ops. */
-#define DEBUG(...) (void)0
-#define claim_zero(...) (__VA_ARGS__)
-#define claim_nonzero(...) (__VA_ARGS__)
-#define claim_positive(...) (__VA_ARGS__)
-#endif /* NDEBUG */
-
-struct mlx4_rxq_stats {
-	unsigned int idx; /**< Mapping index. */
-#ifdef MLX4_PMD_SOFT_COUNTERS
-	uint64_t ipackets; /**< Total of successfully received packets. */
-	uint64_t ibytes; /**< Total of successfully received bytes. */
-#endif
-	uint64_t idropped; /**< Total of packets dropped when RX ring full. */
-	uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */
-};
-
-/* RX element (scattered packets). */
-struct rxq_elt_sp {
-	struct ibv_recv_wr wr; /* Work Request. */
-	struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
-	struct rte_mbuf *bufs[MLX4_PMD_SGE_WR_N]; /* SGEs buffers. */
-};
-
-/* RX element. */
-struct rxq_elt {
-	struct ibv_recv_wr wr; /* Work Request. */
-	struct ibv_sge sge; /* Scatter/Gather Element. */
-	/* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */
-};
-
-/* RX queue descriptor. */
-struct rxq {
-	LIST_ENTRY(rxq) next; /* Used by parent queue only */
-	struct priv *priv; /* Back pointer to private data. */
-	struct rte_mempool *mp; /* Memory Pool for allocations. */
-	struct ibv_mr *mr; /* Memory Region (for mp). */
-	struct ibv_cq *cq; /* Completion Queue. */
-	struct ibv_qp *qp; /* Queue Pair. */
-	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-	struct ibv_comp_channel *channel;
-	/*
-	 * Each VLAN ID requires a separate flow steering rule.
-	 */
-	BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
-	struct ibv_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES][MLX4_MAX_VLAN_IDS];
-	struct ibv_flow *promisc_flow; /* Promiscuous flow. */
-	struct ibv_flow *allmulti_flow; /* Multicast flow. */
-	unsigned int port_id; /* Port ID for incoming packets. */
-	unsigned int elts_n; /* (*elts)[] length. */
-	unsigned int elts_head; /* Current index in (*elts)[]. */
-	union {
-		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
-		struct rxq_elt (*no_sp)[]; /* RX elements. */
-	} elts;
-	unsigned int sp:1; /* Use scattered RX elements. */
-	unsigned int csum:1; /* Enable checksum offloading. */
-	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
-	struct mlx4_rxq_stats stats; /* RX queue counters. */
-	unsigned int socket; /* CPU socket ID for allocations. */
-	struct ibv_exp_res_domain *rd; /* Resource Domain. */
-	struct {
-		uint16_t queues_n;
-		uint16_t queues[RTE_MAX_QUEUES_PER_PORT];
-	} rss;
-};
-
-/* TX element. */
-struct txq_elt {
-	struct rte_mbuf *buf;
-};
-
-struct mlx4_txq_stats {
-	unsigned int idx; /**< Mapping index. */
-#ifdef MLX4_PMD_SOFT_COUNTERS
-	uint64_t opackets; /**< Total of successfully sent packets. */
-	uint64_t obytes;   /**< Total of successfully sent bytes. */
-#endif
-	uint64_t odropped; /**< Total of packets not sent when TX ring full. */
-};
-
-/*
- * Linear buffer type. It is used when transmitting buffers with too many
- * segments that do not fit the hardware queue (see max_send_sge).
- * Extra segments are copied (linearized) in such buffers, replacing the
- * last SGE during TX.
- * The size is arbitrary but large enough to hold a jumbo frame with
- * 8 segments considering mbuf.buf_len is about 2048 bytes.
- */
-typedef uint8_t linear_t[16384];
+struct mlx4_drop;
+struct mlx4_rss;
+struct rxq;
+struct txq;
+struct rte_flow;
 
-/* TX queue descriptor. */
-struct txq {
-	struct priv *priv; /* Back pointer to private data. */
-	struct {
-		const struct rte_mempool *mp; /* Cached Memory Pool. */
-		struct ibv_mr *mr; /* Memory Region (for mp). */
-		uint32_t lkey; /* mr->lkey */
-	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
-	struct ibv_cq *cq; /* Completion Queue. */
-	struct ibv_qp *qp; /* Queue Pair. */
-	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-#if MLX4_PMD_MAX_INLINE > 0
-	uint32_t max_inline; /* Max inline send size <= MLX4_PMD_MAX_INLINE. */
-#endif
-	unsigned int elts_n; /* (*elts)[] length. */
-	struct txq_elt (*elts)[]; /* TX elements. */
-	unsigned int elts_head; /* Current index in (*elts)[]. */
-	unsigned int elts_tail; /* First element awaiting completion. */
-	unsigned int elts_comp; /* Number of completion requests. */
-	unsigned int elts_comp_cd; /* Countdown for next completion request. */
-	unsigned int elts_comp_cd_init; /* Initial value for countdown. */
-	struct mlx4_txq_stats stats; /* TX queue counters. */
-	linear_t (*elts_linear)[]; /* Linearized buffers. */
-	struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
-	unsigned int socket; /* CPU socket ID for allocations. */
-	struct ibv_exp_res_domain *rd; /* Resource Domain. */
+/** Memory region descriptor. */
+struct mlx4_mr {
+	LIST_ENTRY(mlx4_mr) next; /**< Next entry in list. */
+	uintptr_t start; /**< Base address for memory region. */
+	uintptr_t end; /**< End address for memory region. */
+	uint32_t lkey; /**< L_Key extracted from @p mr. */
+	uint32_t refcnt; /**< Reference count for this object. */
+	struct priv *priv; /**< Back pointer to private data. */
+	struct ibv_mr *mr; /**< Memory region associated with @p mp. */
+	struct rte_mempool *mp; /**< Target memory pool (mempool). */
 };
 
-struct rte_flow;
-
+/** Private data structure. */
 struct priv {
-	struct rte_eth_dev *dev; /* Ethernet device. */
-	struct ibv_context *ctx; /* Verbs context. */
-	struct ibv_device_attr device_attr; /* Device properties. */
-	struct ibv_pd *pd; /* Protection Domain. */
-	/*
-	 * MAC addresses array and configuration bit-field.
-	 * An extra entry that cannot be modified by the DPDK is reserved
-	 * for broadcast frames (destination MAC address ff:ff:ff:ff:ff:ff).
-	 */
-	struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
-	BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
-	/* VLAN filters. */
-	struct {
-		unsigned int enabled:1; /* If enabled. */
-		unsigned int id:12; /* VLAN ID (0-4095). */
-	} vlan_filter[MLX4_MAX_VLAN_IDS]; /* VLAN filters table. */
+	struct rte_eth_dev *dev; /**< Ethernet device. */
+	struct ibv_context *ctx; /**< Verbs context. */
+	struct ibv_device_attr device_attr; /**< Device properties. */
+	struct ibv_pd *pd; /**< Protection Domain. */
 	/* Device properties. */
-	uint16_t mtu; /* Configured MTU. */
-	uint8_t port; /* Physical port number. */
-	unsigned int started:1; /* Device started, flows enabled. */
-	unsigned int promisc:1; /* Device in promiscuous mode. */
-	unsigned int allmulti:1; /* Device receives all multicast packets. */
-	unsigned int hw_qpg:1; /* QP groups are supported. */
-	unsigned int hw_tss:1; /* TSS is supported. */
-	unsigned int hw_rss:1; /* RSS is supported. */
-	unsigned int hw_csum:1; /* Checksum offload is supported. */
-	unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */
-	unsigned int rss:1; /* RSS is enabled. */
-	unsigned int vf:1; /* This is a VF device. */
-	unsigned int pending_alarm:1; /* An alarm is pending. */
-	unsigned int isolated:1; /* Toggle isolated mode. */
-#ifdef INLINE_RECV
-	unsigned int inl_recv_size; /* Inline recv size */
-#endif
-	unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */
-	/* RX/TX queues. */
-	unsigned int rxqs_n; /* RX queues array size. */
-	unsigned int txqs_n; /* TX queues array size. */
-	struct rxq *(*rxqs)[]; /* RX queues. */
-	struct txq *(*txqs)[]; /* TX queues. */
-	struct rte_intr_handle intr_handle_dev; /* Device interrupt handler. */
-	struct rte_intr_handle intr_handle; /* Interrupt handler. */
-	struct rte_flow_drop *flow_drop_queue; /* Flow drop queue. */
-	LIST_HEAD(mlx4_flows, rte_flow) flows;
-	struct rte_intr_conf intr_conf; /* Active interrupt configuration. */
-	LIST_HEAD(mlx4_parents, rxq) parents;
-	rte_spinlock_t lock; /* Lock for control functions. */
+	uint16_t mtu; /**< Configured MTU. */
+	uint8_t port; /**< Physical port number. */
+	uint32_t started:1; /**< Device started, flows enabled. */
+	uint32_t vf:1; /**< This is a VF device. */
+	uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */
+	uint32_t isolated:1; /**< Toggle isolated mode. */
+	uint32_t hw_csum:1; /**< Checksum offload is supported. */
+	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
+	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
+	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
+	LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */
+	LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */
+	LIST_HEAD(, mlx4_mr) mr; /**< Registered memory regions. */
+	rte_spinlock_t mr_lock; /**< Lock for @p mr access. */
+	struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
+	/**< Configured MAC addresses. Unused entries are zeroed. */
 };
 
-void priv_lock(struct priv *priv);
-void priv_unlock(struct priv *priv);
-
-int
-rxq_create_qp(struct rxq *rxq,
-	      uint16_t desc,
-	      int inactive,
-	      int children_n,
-	      struct rxq *rxq_parent);
-
-void
-rxq_parent_cleanup(struct rxq *parent);
-
-struct rxq *
-priv_parent_create(struct priv *priv,
-		   uint16_t queues[],
-		   uint16_t children_n);
+/* mlx4_ethdev.c */
+
+int mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]);
+int mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]);
+int mlx4_mtu_get(struct priv *priv, uint16_t *mtu);
+int mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu);
+int mlx4_dev_set_link_down(struct rte_eth_dev *dev);
+int mlx4_dev_set_link_up(struct rte_eth_dev *dev);
+void mlx4_promiscuous_enable(struct rte_eth_dev *dev);
+void mlx4_promiscuous_disable(struct rte_eth_dev *dev);
+void mlx4_allmulticast_enable(struct rte_eth_dev *dev);
+void mlx4_allmulticast_disable(struct rte_eth_dev *dev);
+void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
+int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
+		      uint32_t index, uint32_t vmdq);
+void mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr);
+int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
+int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
+void mlx4_stats_reset(struct rte_eth_dev *dev);
+void mlx4_dev_infos_get(struct rte_eth_dev *dev,
+			struct rte_eth_dev_info *info);
+int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
+int mlx4_flow_ctrl_get(struct rte_eth_dev *dev,
+		       struct rte_eth_fc_conf *fc_conf);
+int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
+		       struct rte_eth_fc_conf *fc_conf);
+const uint32_t *mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev);
+
+/* mlx4_intr.c */
+
+int mlx4_intr_uninstall(struct priv *priv);
+int mlx4_intr_install(struct priv *priv);
+int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
+int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
+
+/* mlx4_mr.c */
+
+struct mlx4_mr *mlx4_mr_get(struct priv *priv, struct rte_mempool *mp);
+void mlx4_mr_put(struct mlx4_mr *mr);
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+			 uint32_t i);
 
 #endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c
new file mode 100644
index 00000000..c2ea4db1
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_ethdev.c
@@ -0,0 +1,1047 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Miscellaneous control operations for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_bus_pci.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_flow.h>
+#include <rte_pci.h>
+
+#include "mlx4.h"
+#include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Get the name of the netdevice whose dev_port (or dev_id) is priv->port - 1.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param[out] ifname
+ *   Interface name output buffer, written only on success.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
+{
+	DIR *dir;
+	struct dirent *dent;
+	unsigned int dev_type = 0;
+	unsigned int dev_port_prev = ~0u;
+	char match[IF_NAMESIZE] = "";
+
+	{
+		MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);
+
+		dir = opendir(path);
+		if (dir == NULL) {
+			rte_errno = errno;
+			return -rte_errno;
+		}
+	}
+	while ((dent = readdir(dir)) != NULL) {
+		char *name = dent->d_name;
+		FILE *file;
+		unsigned int dev_port;
+		int r;
+
+		if ((name[0] == '.') &&
+		    ((name[1] == '\0') ||
+		     ((name[1] == '.') && (name[2] == '\0'))))
+			continue; /* Skip "." and "..". */
+
+		MKSTR(path, "%s/device/net/%s/%s",
+		      priv->ctx->device->ibdev_path, name,
+		      (dev_type ? "dev_id" : "dev_port"));
+
+		file = fopen(path, "rb");
+		if (file == NULL) {
+			if (errno != ENOENT)
+				continue;
+			/*
+			 * No dev_port entry (Linux kernel < 3.15): rescan with
+			 * dev_id, or give up if dev_id was already in use.
+			 */
+try_dev_id:
+			match[0] = '\0';
+			if (dev_type)
+				break;
+			dev_type = 1;
+			dev_port_prev = ~0u;
+			rewinddir(dir);
+			continue;
+		}
+		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
+		fclose(file);
+		if (r != 1)
+			continue;
+		/*
+		 * Switch to dev_id when dev_port returns the same value for
+		 * all ports. May happen when using a MOFED release older than
+		 * 3.0 with a Linux kernel >= 3.15.
+		 */
+		if (dev_port == dev_port_prev)
+			goto try_dev_id;
+		dev_port_prev = dev_port;
+		if (dev_port == (priv->port - 1u))
+			snprintf(match, sizeof(match), "%s", name);
+	}
+	closedir(dir);
+	if (match[0] == '\0') {
+		rte_errno = ENODEV;
+		return -rte_errno;
+	}
+	strncpy(*ifname, match, sizeof(*ifname));
+	return 0;
+}
+
+/**
+ * Read from sysfs entry.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param[in] entry
+ *   Entry name relative to sysfs path.
+ * @param[out] buf
+ *   Data output buffer (not null-terminated by this function).
+ * @param size
+ *   Buffer size.
+ *
+ * @return
+ *   Number of bytes read on success (may be less than @p size), negative
+ *   errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_sysfs_read(const struct priv *priv, const char *entry,
+		char *buf, size_t size)
+{
+	char ifname[IF_NAMESIZE];
+	FILE *file;
+	int ret;
+
+	ret = mlx4_get_ifname(priv, &ifname);
+	if (ret)
+		return ret;
+
+	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
+	      ifname, entry);
+
+	file = fopen(path, "rb");
+	if (file == NULL) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	ret = fread(buf, 1, size, file);
+	/* A short read only counts as a failure if the stream has an error. */
+	if ((size_t)ret < size && ferror(file)) {
+		rte_errno = EIO;
+		ret = -rte_errno;
+	}
+	/* Otherwise ret already holds the number of bytes stored in buf. */
+	fclose(file);
+	return ret;
+}
+
+/**
+ * Write to sysfs entry.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param[in] entry
+ *   Entry name relative to sysfs path.
+ * @param[in] buf
+ *   Data buffer.
+ * @param size
+ *   Buffer size.
+ *
+ * @return
+ *   Number of bytes written on success, negative errno value otherwise and
+ *   rte_errno is set.
+ */
+static int
+mlx4_sysfs_write(const struct priv *priv, const char *entry,
+		 char *buf, size_t size)
+{
+	char ifname[IF_NAMESIZE];
+	FILE *file;
+	int ret;
+
+	ret = mlx4_get_ifname(priv, &ifname);
+	if (ret)
+		return ret;
+	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
+	      ifname, entry);
+	file = fopen(path, "wb");
+	if (file == NULL) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	ret = fwrite(buf, 1, size, file);
+	if ((size_t)ret < size || ferror(file)) {
+		rte_errno = EIO;
+		ret = -rte_errno;
+	}
+	/* Buffered data may only reach sysfs (and fail) during fclose(). */
+	if (fclose(file) && ret >= 0) {
+		rte_errno = errno;
+		ret = -rte_errno;
+	}
+	return ret;
+}
+
+/**
+ * Get unsigned long sysfs property.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in] name
+ *   Entry name relative to sysfs path.
+ * @param[out] value
+ *   Value output buffer, left untouched on failure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
+{
+	int ret;
+	unsigned long value_ret;
+	char value_str[32] = ""; /* No indeterminate bytes after data. */
+
+	ret = mlx4_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
+	if (ret < 0) {
+		DEBUG("cannot read %s value from sysfs: %s",
+		      name, strerror(rte_errno));
+		return ret;
+	}
+	value_str[ret] = '\0';
+	errno = 0;
+	value_ret = strtoul(value_str, NULL, 0);
+	if (errno) {
+		rte_errno = errno;
+		DEBUG("invalid %s value `%s': %s", name, value_str,
+		      strerror(rte_errno));
+		return -rte_errno;
+	}
+	*value = value_ret;
+	return 0;
+}
+
+/**
+ * Store an unsigned long into a sysfs property as decimal text.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in] name
+ *   Entry name relative to sysfs path.
+ * @param value
+ *   Value to set.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
+{
+	MKSTR(value_str, "%lu", value);
+	const size_t len = sizeof(value_str) - 1;
+	int ret = mlx4_sysfs_write(priv, name, value_str, len);
+
+	if (ret >= 0)
+		return 0;
+	/* rte_errno was set by mlx4_sysfs_write(). */
+	DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
+	      name, value_str, value, strerror(rte_errno));
+	return ret;
+}
+
+/**
+ * Run an ifreq ioctl() against the netdevice associated with a port.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param req
+ *   Request number to pass to ioctl().
+ * @param[in, out] ifr
+ *   Interface request structure; its ifr_name field is filled in here.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
+{
+	int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	int err;
+
+	if (fd != -1) {
+		err = mlx4_get_ifname(priv, &ifr->ifr_name);
+		if (err == 0 && ioctl(fd, req, ifr) == -1) {
+			rte_errno = errno;
+			err = -rte_errno;
+		}
+		close(fd);
+		return err;
+	}
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
+ * Query the netdevice for its MAC address (SIOCGIFHWADDR).
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param[out] mac
+ *   MAC address output buffer.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
+{
+	struct ifreq ifr;
+	int err = mlx4_ifreq(priv, SIOCGIFHWADDR, &ifr);
+
+	/* Only touch the output buffer once the request succeeded. */
+	if (err == 0)
+		memcpy(*mac, ifr.ifr_hwaddr.sa_data, sizeof(*mac));
+	return err;
+}
+
+/**
+ * Read the device MTU from the "mtu" sysfs property.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[out] mtu
+ *   MTU value output buffer.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mtu_get(struct priv *priv, uint16_t *mtu)
+{
+	unsigned long value = 0;
+	int err = mlx4_get_sysfs_ulong(priv, "mtu", &value);
+
+	/* The sysfs value is narrowed to uint16_t by this assignment. */
+	if (!err)
+		*mtu = value;
+	return err;
+}
+
+/**
+ * DPDK callback to change the MTU.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mtu
+ *   MTU value to set.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct priv *priv = dev->data->dev_private;
+	uint16_t readback;
+	int err = mlx4_set_sysfs_ulong(priv, "mtu", mtu);
+
+	if (!err)
+		err = mlx4_mtu_get(priv, &readback);
+	if (err)
+		return err;
+	/* Cache the MTU only if sysfs reads back the requested value. */
+	if (readback != mtu) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	priv->mtu = mtu;
+	return 0;
+}
+
+/**
+ * Update the "flags" sysfs property of the netdevice.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param keep
+ *   Bitmask for flags that must remain untouched.
+ * @param flags
+ *   Bitmask for flags to modify.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
+{
+	unsigned long cur = 0;
+	int err = mlx4_get_sysfs_ulong(priv, "flags", &cur);
+
+	if (err)
+		return err;
+	/* Bits set in keep come from sysfs, all others from flags. */
+	cur = (cur & keep) | (flags & ~keep);
+	return mlx4_set_sysfs_ulong(priv, "flags", cur);
+}
+
+/**
+ * Set or clear IFF_UP on the netdevice to change the link state.
+ *
+ * @param priv
+ *   Pointer to Ethernet device private data.
+ * @param up
+ *   Nonzero for link up, otherwise link down.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_dev_set_link(struct priv *priv, int up)
+{
+	unsigned int flags;
+
+	/*
+	 * Only IFF_UP is left out of the keep mask, so it is the only bit
+	 * that mlx4_set_flags() takes from flags; every other bit keeps
+	 * the value currently found in sysfs.
+	 */
+	if (up)
+		flags = IFF_UP;
+	else
+		flags = ~IFF_UP;
+	return mlx4_set_flags(priv, ~IFF_UP, flags);
+}
+
+/**
+ * DPDK callback to bring the link DOWN.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_dev_set_link_down(struct rte_eth_dev *dev)
+{
+	const int up = 0;
+
+	return mlx4_dev_set_link(dev->data->dev_private, up);
+}
+
+/**
+ * DPDK callback to bring the link UP.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_dev_set_link_up(struct rte_eth_dev *dev)
+{
+	const int up = 1;
+
+	return mlx4_dev_set_link(dev->data->dev_private, up);
+}
+
+/**
+ * Supported Rx mode toggles.
+ *
+ * Bit 0 of each value is the requested state: 0 for off, 1 for on.
+ */
+enum rxmode_toggle {
+	RXMODE_TOGGLE_PROMISC_OFF = 0,
+	RXMODE_TOGGLE_PROMISC_ON = 1,
+	RXMODE_TOGGLE_ALLMULTI_OFF = 2,
+	RXMODE_TOGGLE_ALLMULTI_ON = 3,
+};
+
+/**
+ * Record a promiscuous or all multicast state, then resync flow rules.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param toggle
+ *   Toggle to set.
+ */
+static void
+mlx4_rxmode_toggle(struct rte_eth_dev *dev, enum rxmode_toggle toggle)
+{
+	const unsigned int on = toggle & 1;
+	struct rte_flow_error error;
+	const char *mode;
+
+	switch (toggle) {
+	case RXMODE_TOGGLE_ALLMULTI_OFF:
+	case RXMODE_TOGGLE_ALLMULTI_ON:
+		dev->data->all_multicast = on;
+		mode = "all multicast";
+		break;
+	case RXMODE_TOGGLE_PROMISC_OFF:
+	case RXMODE_TOGGLE_PROMISC_ON:
+		dev->data->promiscuous = on;
+		mode = "promiscuous";
+		break;
+	}
+	if (mlx4_flow_sync(dev->data->dev_private, &error) == 0)
+		return;
+	ERROR("cannot toggle %s mode (code %d, \"%s\"),"
+	      " flow error type %d, cause %p, message: %s",
+	      mode, rte_errno, strerror(rte_errno), error.type, error.cause,
+	      error.message ? error.message : "(unspecified)");
+}
+
+/**
+ * DPDK callback to enable promiscuous mode through mlx4_rxmode_toggle().
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_promiscuous_enable(struct rte_eth_dev *dev)
+{
+	mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_ON);
+}
+
+/**
+ * DPDK callback to disable promiscuous mode through mlx4_rxmode_toggle().
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_promiscuous_disable(struct rte_eth_dev *dev)
+{
+	mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_OFF);
+}
+
+/**
+ * DPDK callback to enable all multicast mode through mlx4_rxmode_toggle().
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_allmulticast_enable(struct rte_eth_dev *dev)
+{
+	mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_ON);
+}
+
+/**
+ * DPDK callback to disable all multicast mode through mlx4_rxmode_toggle().
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_allmulticast_disable(struct rte_eth_dev *dev)
+{
+	mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_OFF);
+}
+
+/**
+ * DPDK callback to remove a MAC address.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param index
+ *   MAC address index; out-of-range values only set rte_errno to EINVAL.
+ */
+void
+mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
+{
+	struct rte_flow_error error;
+	struct priv *priv = dev->data->dev_private;
+
+	if (index >= RTE_DIM(priv->mac)) {
+		rte_errno = EINVAL;
+		return;
+	}
+	memset(priv->mac + index, 0, sizeof(*priv->mac));
+	if (mlx4_flow_sync(priv, &error) == 0)
+		return;
+	ERROR("failed to synchronize flow rules after removing MAC address"
+	      " at index %d (code %d, \"%s\"),"
+	      " flow error type %d, cause %p, message: %s",
+	      index, rte_errno, strerror(rte_errno), error.type, error.cause,
+	      error.message ? error.message : "(unspecified)");
+}
+
+/**
+ * DPDK callback to add a MAC address.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mac_addr
+ *   MAC address to register.
+ * @param index
+ *   MAC address index.
+ * @param vmdq
+ *   VMDq pool index to associate address with (ignored).
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
+		  uint32_t index, uint32_t vmdq)
+{
+	struct rte_flow_error error;
+	struct priv *priv = dev->data->dev_private;
+	int err;
+
+	(void)vmdq;
+	if (index >= RTE_DIM(priv->mac)) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	memcpy(priv->mac + index, mac_addr, sizeof(*priv->mac));
+	err = mlx4_flow_sync(priv, &error);
+	if (err == 0)
+		return 0;
+	ERROR("failed to synchronize flow rules after adding MAC address"
+	      " at index %d (code %d, \"%s\"),"
+	      " flow error type %d, cause %p, message: %s",
+	      index, rte_errno, strerror(rte_errno), error.type, error.cause,
+	      error.message ? error.message : "(unspecified)");
+	return err;
+}
+
+/**
+ * DPDK callback to configure a VLAN filter.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param vlan_id
+ *   VLAN ID to filter.
+ * @param on
+ *   Nonzero to set the filter bit for @p vlan_id, zero to clear it.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow_error error;
+	const unsigned int word = vlan_id / 64;
+	const uint64_t bit = UINT64_C(1) << (vlan_id % 64);
+	int err;
+
+	if (word >= RTE_DIM(dev->data->vlan_filter_conf.ids)) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	if (on)
+		dev->data->vlan_filter_conf.ids[word] |= bit;
+	else
+		dev->data->vlan_filter_conf.ids[word] &= ~bit;
+	err = mlx4_flow_sync(priv, &error);
+	if (err == 0)
+		return 0;
+	ERROR("failed to synchronize flow rules after %s VLAN filter on ID %u"
+	      " (code %d, \"%s\"), "
+	      " flow error type %d, cause %p, message: %s",
+	      on ? "enabling" : "disabling", vlan_id,
+	      rte_errno, strerror(rte_errno), error.type, error.cause,
+	      error.message ? error.message : "(unspecified)");
+	return err;
+}
+
+/**
+ * DPDK callback to set the primary MAC address (stored at index 0).
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mac_addr
+ *   MAC address to register; mlx4_mac_addr_add() result is discarded.
+ */
+void
+mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
+{
+	mlx4_mac_addr_add(dev, mac_addr, 0, 0);
+}
+
+/**
+ * DPDK callback to report device limits and capabilities.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] info
+ *   Info structure output buffer.
+ */
+void
+mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
+{
+	struct priv *priv = dev->data->dev_private;
+	char ifname[IF_NAMESIZE];
+	unsigned int queues;
+
+	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
+	/* FIXME: we should ask the device for these values. */
+	info->min_rx_bufsize = 32;
+	info->max_rx_pktlen = 65536;
+	/*
+	 * Each QP needs its own CQ, so the queue limit is the smaller of
+	 * the two device limits.
+	 */
+	queues = ((priv->device_attr.max_qp < priv->device_attr.max_cq) ?
+		  priv->device_attr.max_qp : priv->device_attr.max_cq);
+	/* Clamp to 65535 since max_rx_queues is uint16_t. */
+	if (queues >= 65535)
+		queues = 65535;
+	info->max_tx_queues = queues;
+	info->max_rx_queues = queues;
+	info->max_mac_addrs = RTE_DIM(priv->mac);
+	info->tx_offload_capa = 0;
+	info->rx_offload_capa = 0;
+	if (priv->hw_csum) {
+		info->rx_offload_capa |= (DEV_RX_OFFLOAD_IPV4_CKSUM |
+					  DEV_RX_OFFLOAD_UDP_CKSUM |
+					  DEV_RX_OFFLOAD_TCP_CKSUM);
+		info->tx_offload_capa |= (DEV_TX_OFFLOAD_IPV4_CKSUM |
+					  DEV_TX_OFFLOAD_UDP_CKSUM |
+					  DEV_TX_OFFLOAD_TCP_CKSUM);
+	}
+	if (priv->hw_csum_l2tun)
+		info->tx_offload_capa |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+	if (!mlx4_get_ifname(priv, &ifname))
+		info->if_index = if_nametoindex(ifname);
+	info->hash_key_size = MLX4_RSS_HASH_KEY_SIZE;
+	/* Link speeds reported as supported. */
+	info->speed_capa = (ETH_LINK_SPEED_1G |
+			    ETH_LINK_SPEED_10G |
+			    ETH_LINK_SPEED_20G |
+			    ETH_LINK_SPEED_40G |
+			    ETH_LINK_SPEED_56G);
+}
+
+/**
+ * DPDK callback to get device statistics; always returns 0.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] stats
+ *   Stats structure output buffer.
+ */
+int
+mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	struct rte_eth_stats sum;
+	unsigned int q;
+	unsigned int idx;
+
+	memset(&sum, 0, sizeof(sum));
+	/* Accumulate per-queue software counters, Rx first then Tx. */
+	for (q = 0; q < dev->data->nb_rx_queues; q++) {
+		struct rxq *rxq = dev->data->rx_queues[q];
+
+		if (!rxq)
+			continue;
+		idx = rxq->stats.idx;
+		if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+			sum.q_ipackets[idx] += rxq->stats.ipackets;
+			sum.q_ibytes[idx] += rxq->stats.ibytes;
+			sum.q_errors[idx] += (rxq->stats.idropped +
+					      rxq->stats.rx_nombuf);
+		}
+		sum.ipackets += rxq->stats.ipackets;
+		sum.ibytes += rxq->stats.ibytes;
+		sum.ierrors += rxq->stats.idropped;
+		sum.rx_nombuf += rxq->stats.rx_nombuf;
+	}
+	for (q = 0; q < dev->data->nb_tx_queues; q++) {
+		struct txq *txq = dev->data->tx_queues[q];
+
+		if (!txq)
+			continue;
+		idx = txq->stats.idx;
+		if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+			sum.q_opackets[idx] += txq->stats.opackets;
+			sum.q_obytes[idx] += txq->stats.obytes;
+			sum.q_errors[idx] += txq->stats.odropped;
+		}
+		sum.opackets += txq->stats.opackets;
+		sum.obytes += txq->stats.obytes;
+		sum.oerrors += txq->stats.odropped;
+	}
+	*stats = sum;
+	return 0;
+}
+
+/**
+ * DPDK callback to clear the statistics of all configured queues.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx4_stats_reset(struct rte_eth_dev *dev)
+{
+	unsigned int q;
+
+	for (q = 0; q < dev->data->nb_rx_queues; q++) {
+		struct rxq *rxq = dev->data->rx_queues[q];
+
+		if (!rxq)
+			continue;
+		/* Zero every counter but keep the stats index. */
+		rxq->stats = (struct mlx4_rxq_stats){ .idx = rxq->stats.idx };
+	}
+	for (q = 0; q < dev->data->nb_tx_queues; q++) {
+		struct txq *txq = dev->data->tx_queues[q];
+
+		if (!txq)
+			continue;
+		/* Zero every counter but keep the stats index. */
+		txq->stats = (struct mlx4_txq_stats){ .idx = txq->stats.idx };
+	}
+}
+
+/**
+ * DPDK callback to refresh link status, speed and duplex in dev->data.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param wait_to_complete
+ *   Wait for request completion (ignored).
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete)
+{
+	const struct priv *priv = dev->data->dev_private;
+	struct ethtool_cmd edata = { .cmd = ETHTOOL_GSET };
+	struct rte_eth_link link;
+	struct ifreq ifr;
+	int speed;
+
+	(void)wait_to_complete;
+	if (priv == NULL) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	if (mlx4_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
+		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(rte_errno));
+		return -rte_errno;
+	}
+	memset(&link, 0, sizeof(link));
+	/* Link is up only when the netdevice is both UP and RUNNING. */
+	link.link_status = ((ifr.ifr_flags & IFF_UP) &&
+			    (ifr.ifr_flags & IFF_RUNNING));
+	/* Flags were consumed above, reuse the request for ethtool. */
+	ifr.ifr_data = (void *)&edata;
+	if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) {
+		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
+		     strerror(rte_errno));
+		return -rte_errno;
+	}
+	speed = ethtool_cmd_speed(&edata);
+	/* A speed of -1 is reported as 0. */
+	link.link_speed = (speed == -1) ? 0 : speed;
+	if (edata.duplex == DUPLEX_HALF)
+		link.link_duplex = ETH_LINK_HALF_DUPLEX;
+	else
+		link.link_duplex = ETH_LINK_FULL_DUPLEX;
+	link.link_autoneg = !(dev->data->dev_conf.link_speeds &
+			      ETH_LINK_SPEED_FIXED);
+	dev->data->dev_link = link;
+	return 0;
+}
+
+/**
+ * DPDK callback to get flow control status (ETHTOOL_GPAUSEPARAM).
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] fc_conf
+ *   Flow control output buffer; only autoneg and mode are written.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_flow_ctrl_get(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct ethtool_pauseparam ethpause = { .cmd = ETHTOOL_GPAUSEPARAM };
+	struct ifreq ifr;
+	int err;
+
+	ifr.ifr_data = (void *)&ethpause;
+	if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) {
+		/* Report the failure as a negative errno value. */
+		err = rte_errno;
+		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
+		     " failed: %s",
+		     strerror(rte_errno));
+		assert(err >= 0);
+		return -err;
+	}
+	fc_conf->autoneg = ethpause.autoneg;
+	/* Translate the rx/tx pause pair into a flow control mode. */
+	if (ethpause.rx_pause) {
+		if (ethpause.tx_pause)
+			fc_conf->mode = RTE_FC_FULL;
+		else
+			fc_conf->mode = RTE_FC_RX_PAUSE;
+	} else if (ethpause.tx_pause) {
+		fc_conf->mode = RTE_FC_TX_PAUSE;
+	} else {
+		fc_conf->mode = RTE_FC_NONE;
+	}
+	return 0;
+}
+
+/**
+ * DPDK callback to modify flow control parameters (ETHTOOL_SPAUSEPARAM).
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[in] fc_conf
+ *   Flow control parameters; only autoneg and mode are used.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+	struct priv *priv = dev->data->dev_private;
+	const int full = ((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL);
+	struct ethtool_pauseparam ethpause = {
+		.cmd = ETHTOOL_SPAUSEPARAM,
+		.autoneg = fc_conf->autoneg,
+	};
+	struct ifreq ifr;
+	int err;
+
+	/*
+	 * RTE_FC_FULL requests pause in both directions; otherwise each
+	 * direction follows its own mode bit.
+	 */
+	if (full || (fc_conf->mode & RTE_FC_RX_PAUSE))
+		ethpause.rx_pause = 1;
+	else
+		ethpause.rx_pause = 0;
+	if (full || (fc_conf->mode & RTE_FC_TX_PAUSE))
+		ethpause.tx_pause = 1;
+	else
+		ethpause.tx_pause = 0;
+	ifr.ifr_data = (void *)&ethpause;
+	if (!mlx4_ifreq(priv, SIOCETHTOOL, &ifr))
+		return 0;
+	err = rte_errno;
+	WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
+	     " failed: %s",
+	     strerror(rte_errno));
+	assert(err >= 0);
+	return -err;
+}
+
+/**
+ * DPDK callback to list the received packet types that the device can
+ * recognize.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   Pointer to an array of recognized packet types when the Rx burst
+ *   function is mlx4_rx_burst(), NULL otherwise.
+ */
+const uint32_t *
+mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev)
+{
+	static const uint32_t supported[] = {
+		/* refers to rxq_cq_to_pkt_type() */
+		RTE_PTYPE_L2_ETHER,
+		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+		RTE_PTYPE_L4_FRAG,
+		RTE_PTYPE_L4_TCP,
+		RTE_PTYPE_L4_UDP,
+		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+		RTE_PTYPE_UNKNOWN
+	};
+
+	if (dev->rx_pkt_burst != mlx4_rx_burst)
+		return NULL;
+	return supported;
+}
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 925c89c5..8b87b298 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -2,7 +2,7 @@
  *   BSD LICENSE
  *
  *   Copyright 2017 6WIND S.A.
- *   Copyright 2017 Mellanox.
+ *   Copyright 2017 Mellanox
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -31,197 +31,328 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/**
+ * @file
+ * Flow API operations for mlx4 driver.
+ */
+
+#include <arpa/inet.h>
 #include <assert.h>
+#include <errno.h>
+#include <stdalign.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
 
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+#include <rte_eth_ctrl.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 #include <rte_malloc.h>
 
-/* Generated configuration header. */
-#include "mlx4_autoconf.h"
-
 /* PMD headers. */
 #include "mlx4.h"
 #include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
 
-/** Static initializer for items. */
-#define ITEMS(...) \
+/** Static initializer for a list of subsequent item types. */
+#define NEXT_ITEM(...) \
 	(const enum rte_flow_item_type []){ \
 		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
 	}
 
-/** Structure to generate a simple graph of layers supported by the NIC. */
-struct mlx4_flow_items {
-	/** List of possible actions for these items. */
-	const enum rte_flow_action_type *const actions;
-	/** Bit-masks corresponding to the possibilities for the item. */
-	const void *mask;
-	/**
-	 * Default bit-masks to use when item->mask is not provided. When
-	 * \default_mask is also NULL, the full supported bit-mask (\mask) is
-	 * used instead.
-	 */
-	const void *default_mask;
-	/** Bit-masks size in bytes. */
+/** Processor structure associated with a flow item. */
+struct mlx4_flow_proc_item {
+	/** Bit-mask for fields supported by this PMD. */
+	const void *mask_support;
+	/** Bit-mask to use when @p item->mask is not provided. */
+	const void *mask_default;
+	/** Size in bytes for @p mask_support and @p mask_default. */
 	const unsigned int mask_sz;
-	/**
-	 * Check support for a given item.
-	 *
-	 * @param item[in]
-	 *   Item specification.
-	 * @param mask[in]
-	 *   Bit-masks covering supported fields to compare with spec,
-	 *   last and mask in
-	 *   \item.
-	 * @param size
-	 *   Bit-Mask size in bytes.
-	 *
-	 * @return
-	 *   0 on success, negative value otherwise.
-	 */
-	int (*validate)(const struct rte_flow_item *item,
-			const uint8_t *mask, unsigned int size);
-	/**
-	 * Conversion function from rte_flow to NIC specific flow.
-	 *
-	 * @param item
-	 *   rte_flow item to convert.
-	 * @param default_mask
-	 *   Default bit-masks to use when item->mask is not provided.
-	 * @param data
-	 *   Internal structure to store the conversion.
-	 *
-	 * @return
-	 *   0 on success, negative value otherwise.
-	 */
-	int (*convert)(const struct rte_flow_item *item,
-		       const void *default_mask,
-		       void *data);
+	/** Merge a pattern item into a flow rule handle. */
+	int (*merge)(struct rte_flow *flow,
+		     const struct rte_flow_item *item,
+		     const struct mlx4_flow_proc_item *proc,
+		     struct rte_flow_error *error);
 	/** Size in bytes of the destination structure. */
 	const unsigned int dst_sz;
-	/** List of possible following items.  */
-	const enum rte_flow_item_type *const items;
+	/** List of possible subsequent items. */
+	const enum rte_flow_item_type *const next_item;
 };
 
-struct rte_flow_drop {
-	struct ibv_qp *qp; /**< Verbs queue pair. */
-	struct ibv_cq *cq; /**< Verbs completion queue. */
+/** Shared resources for drop flow rules. */
+struct mlx4_drop {
+	struct ibv_qp *qp; /**< QP target. */
+	struct ibv_cq *cq; /**< CQ associated with above QP. */
+	struct priv *priv; /**< Back pointer to private data. */
+	uint32_t refcnt; /**< Reference count. */
 };
 
-/** Valid action for this PMD. */
-static const enum rte_flow_action_type valid_actions[] = {
-	RTE_FLOW_ACTION_TYPE_DROP,
-	RTE_FLOW_ACTION_TYPE_QUEUE,
-	RTE_FLOW_ACTION_TYPE_RSS,
-	RTE_FLOW_ACTION_TYPE_END,
-};
+/**
+ * Convert DPDK RSS hash fields to their Verbs equivalent.
+ *
+ * @param rss_hf
+ *   Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ *
+ * @return
+ *   Verbs RSS hash fields mask on success (0 when @p rss_hf is 0). If a bit
+ *   of @p rss_hf is not handled: (uint64_t)-1, rte_errno set to ENOTSUP.
+ */
+static uint64_t
+mlx4_conv_rss_hf(uint64_t rss_hf)
+{
+	enum { IPV4, IPV6, TCP, UDP, };
+	static const uint64_t in[] = { /* Static: built once, not per call. */
+		[IPV4] = (ETH_RSS_IPV4 |
+			  ETH_RSS_FRAG_IPV4 |
+			  ETH_RSS_NONFRAG_IPV4_TCP |
+			  ETH_RSS_NONFRAG_IPV4_UDP |
+			  ETH_RSS_NONFRAG_IPV4_OTHER),
+		[IPV6] = (ETH_RSS_IPV6 |
+			  ETH_RSS_FRAG_IPV6 |
+			  ETH_RSS_NONFRAG_IPV6_TCP |
+			  ETH_RSS_NONFRAG_IPV6_UDP |
+			  ETH_RSS_NONFRAG_IPV6_OTHER |
+			  ETH_RSS_IPV6_EX |
+			  ETH_RSS_IPV6_TCP_EX |
+			  ETH_RSS_IPV6_UDP_EX),
+		[TCP] = (ETH_RSS_NONFRAG_IPV4_TCP |
+			 ETH_RSS_NONFRAG_IPV6_TCP |
+			 ETH_RSS_IPV6_TCP_EX),
+		/*
+		 * UDP support is temporarily disabled due to an
+		 * implementation issue in the kernel.
+		 */
+		[UDP] = 0,
+	};
+	static const uint64_t out[RTE_DIM(in)] = { /* Indexed like in[]. */
+		[IPV4] = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
+		[IPV6] = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6,
+		[TCP] = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP,
+		[UDP] = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP,
+	};
+	uint64_t seen = 0; /* Bits of rss_hf matched by at least one group. */
+	uint64_t conv = 0; /* Resulting Verbs hash fields. */
+	unsigned int i;
+
+	for (i = 0; i != RTE_DIM(in); ++i)
+		if (rss_hf & in[i]) {
+			seen |= rss_hf & in[i];
+			conv |= out[i];
+		}
+	if (!(rss_hf & ~seen)) /* No requested bit was left unhandled. */
+		return conv;
+	rte_errno = ENOTSUP;
+	return (uint64_t)-1;
+}
 
 /**
- * Convert Ethernet item to Verbs specification.
+ * Merge Ethernet pattern item into flow rule handle.
  *
- * @param item[in]
- *   Item specification.
- * @param default_mask[in]
- *   Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- *   User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - No support for partial masks, except in the specific case of matching
+ *   all multicast traffic (@p spec->dst and @p mask->dst equal to
+ *   01:00:00:00:00:00).
+ * - Not providing @p item->spec or providing an empty @p mask->dst is
+ *   *only* supported if the rule doesn't specify additional matching
+ *   criteria (i.e. rule is promiscuous-like).
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx4_flow_create_eth(const struct rte_flow_item *item,
-		     const void *default_mask,
-		     void *data)
+mlx4_flow_merge_eth(struct rte_flow *flow,
+		    const struct rte_flow_item *item,
+		    const struct mlx4_flow_proc_item *proc,
+		    struct rte_flow_error *error)
 {
 	const struct rte_flow_item_eth *spec = item->spec;
-	const struct rte_flow_item_eth *mask = item->mask;
-	struct mlx4_flow *flow = (struct mlx4_flow *)data;
+	const struct rte_flow_item_eth *mask =
+		spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
 	struct ibv_flow_spec_eth *eth;
-	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+	const char *msg;
 	unsigned int i;
 
+	if (!mask) {
+		flow->promisc = 1;
+	} else {
+		uint32_t sum_dst = 0;
+		uint32_t sum_src = 0;
+
+		for (i = 0; i != sizeof(mask->dst.addr_bytes); ++i) {
+			sum_dst += mask->dst.addr_bytes[i];
+			sum_src += mask->src.addr_bytes[i];
+		}
+		if (sum_src) {
+			msg = "mlx4 does not support source MAC matching";
+			goto error;
+		} else if (!sum_dst) {
+			flow->promisc = 1;
+		} else if (sum_dst == 1 && mask->dst.addr_bytes[0] == 1) {
+			if (!(spec->dst.addr_bytes[0] & 1)) {
+				msg = "mlx4 does not support the explicit"
+					" exclusion of all multicast traffic";
+				goto error;
+			}
+			flow->allmulti = 1;
+		} else if (sum_dst != (UINT8_C(0xff) * ETHER_ADDR_LEN)) {
+			msg = "mlx4 does not support matching partial"
+				" Ethernet fields";
+			goto error;
+		}
+	}
+	if (!flow->ibv_attr)
+		return 0;
+	if (flow->promisc) {
+		flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT;
+		return 0;
+	}
+	if (flow->allmulti) {
+		flow->ibv_attr->type = IBV_FLOW_ATTR_MC_DEFAULT;
+		return 0;
+	}
 	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 2;
-	eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
+	eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
 	*eth = (struct ibv_flow_spec_eth) {
 		.type = IBV_FLOW_SPEC_ETH,
-		.size = eth_size,
+		.size = sizeof(*eth),
 	};
-	if (!spec) {
-		flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT;
-		return 0;
-	}
-	if (!mask)
-		mask = default_mask;
 	memcpy(eth->val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN);
-	memcpy(eth->val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN);
 	memcpy(eth->mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN);
-	memcpy(eth->mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN);
 	/* Remove unwanted bits from values. */
 	for (i = 0; i < ETHER_ADDR_LEN; ++i) {
 		eth->val.dst_mac[i] &= eth->mask.dst_mac[i];
-		eth->val.src_mac[i] &= eth->mask.src_mac[i];
 	}
 	return 0;
+error:
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg);
 }
 
 /**
- * Convert VLAN item to Verbs specification.
+ * Merge VLAN pattern item into flow rule handle.
  *
- * @param item[in]
- *   Item specification.
- * @param default_mask[in]
- *   Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- *   User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - Matching *all* VLAN traffic by omitting @p item->spec or providing an
+ *   empty @p item->mask would also include non-VLAN traffic. Doing so is
+ *   therefore unsupported.
+ * - No support for partial masks.
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx4_flow_create_vlan(const struct rte_flow_item *item,
-		      const void *default_mask,
-		      void *data)
+mlx4_flow_merge_vlan(struct rte_flow *flow,
+		     const struct rte_flow_item *item,
+		     const struct mlx4_flow_proc_item *proc,
+		     struct rte_flow_error *error)
 {
 	const struct rte_flow_item_vlan *spec = item->spec;
-	const struct rte_flow_item_vlan *mask = item->mask;
-	struct mlx4_flow *flow = (struct mlx4_flow *)data;
+	const struct rte_flow_item_vlan *mask =
+		spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
 	struct ibv_flow_spec_eth *eth;
-	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+	const char *msg;
 
-	eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset - eth_size);
-	if (!spec)
+	if (!mask || !mask->tci) {
+		msg = "mlx4 cannot match all VLAN traffic while excluding"
+			" non-VLAN traffic, TCI VID must be specified";
+		goto error;
+	}
+	if (mask->tci != RTE_BE16(0x0fff)) {
+		msg = "mlx4 does not support partial TCI VID matching";
+		goto error;
+	}
+	if (!flow->ibv_attr)
 		return 0;
-	if (!mask)
-		mask = default_mask;
+	eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size -
+		       sizeof(*eth));
 	eth->val.vlan_tag = spec->tci;
 	eth->mask.vlan_tag = mask->tci;
 	eth->val.vlan_tag &= eth->mask.vlan_tag;
 	return 0;
+error:
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg);
 }
 
 /**
- * Convert IPv4 item to Verbs specification.
+ * Merge IPv4 pattern item into flow rule handle.
  *
- * @param item[in]
- *   Item specification.
- * @param default_mask[in]
- *   Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- *   User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - No support for partial masks.
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx4_flow_create_ipv4(const struct rte_flow_item *item,
-		      const void *default_mask,
-		      void *data)
+mlx4_flow_merge_ipv4(struct rte_flow *flow,
+		     const struct rte_flow_item *item,
+		     const struct mlx4_flow_proc_item *proc,
+		     struct rte_flow_error *error)
 {
 	const struct rte_flow_item_ipv4 *spec = item->spec;
-	const struct rte_flow_item_ipv4 *mask = item->mask;
-	struct mlx4_flow *flow = (struct mlx4_flow *)data;
+	const struct rte_flow_item_ipv4 *mask =
+		spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
 	struct ibv_flow_spec_ipv4 *ipv4;
-	unsigned int ipv4_size = sizeof(struct ibv_flow_spec_ipv4);
+	const char *msg;
 
+	if (mask &&
+	    ((uint32_t)(mask->hdr.src_addr + 1) > UINT32_C(1) ||
+	     (uint32_t)(mask->hdr.dst_addr + 1) > UINT32_C(1))) {
+		msg = "mlx4 does not support matching partial IPv4 fields";
+		goto error;
+	}
+	if (!flow->ibv_attr)
+		return 0;
 	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 1;
-	ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
+	ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
 	*ipv4 = (struct ibv_flow_spec_ipv4) {
 		.type = IBV_FLOW_SPEC_IPV4,
-		.size = ipv4_size,
+		.size = sizeof(*ipv4),
 	};
 	if (!spec)
 		return 0;
@@ -229,8 +360,6 @@ mlx4_flow_create_ipv4(const struct rte_flow_item *item,
 		.src_ip = spec->hdr.src_addr,
 		.dst_ip = spec->hdr.dst_addr,
 	};
-	if (!mask)
-		mask = default_mask;
 	ipv4->mask = (struct ibv_flow_ipv4_filter) {
 		.src_ip = mask->hdr.src_addr,
 		.dst_ip = mask->hdr.dst_addr,
@@ -239,528 +368,504 @@ mlx4_flow_create_ipv4(const struct rte_flow_item *item,
 	ipv4->val.src_ip &= ipv4->mask.src_ip;
 	ipv4->val.dst_ip &= ipv4->mask.dst_ip;
 	return 0;
+error:
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg);
 }
 
 /**
- * Convert UDP item to Verbs specification.
+ * Merge UDP pattern item into flow rule handle.
  *
- * @param item[in]
- *   Item specification.
- * @param default_mask[in]
- *   Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- *   User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - No support for partial masks.
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx4_flow_create_udp(const struct rte_flow_item *item,
-		     const void *default_mask,
-		     void *data)
+mlx4_flow_merge_udp(struct rte_flow *flow,
+		    const struct rte_flow_item *item,
+		    const struct mlx4_flow_proc_item *proc,
+		    struct rte_flow_error *error)
 {
 	const struct rte_flow_item_udp *spec = item->spec;
-	const struct rte_flow_item_udp *mask = item->mask;
-	struct mlx4_flow *flow = (struct mlx4_flow *)data;
+	const struct rte_flow_item_udp *mask =
+		spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
 	struct ibv_flow_spec_tcp_udp *udp;
-	unsigned int udp_size = sizeof(struct ibv_flow_spec_tcp_udp);
+	const char *msg;
 
+	if (mask &&
+	    ((uint16_t)(mask->hdr.src_port + 1) > UINT16_C(1) ||
+	     (uint16_t)(mask->hdr.dst_port + 1) > UINT16_C(1))) {
+		msg = "mlx4 does not support matching partial UDP fields";
+		goto error;
+	}
+	if (!flow->ibv_attr)
+		return 0;
 	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 0;
-	udp = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
+	udp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
 	*udp = (struct ibv_flow_spec_tcp_udp) {
 		.type = IBV_FLOW_SPEC_UDP,
-		.size = udp_size,
+		.size = sizeof(*udp),
 	};
 	if (!spec)
 		return 0;
 	udp->val.dst_port = spec->hdr.dst_port;
 	udp->val.src_port = spec->hdr.src_port;
-	if (!mask)
-		mask = default_mask;
 	udp->mask.dst_port = mask->hdr.dst_port;
 	udp->mask.src_port = mask->hdr.src_port;
 	/* Remove unwanted bits from values. */
 	udp->val.src_port &= udp->mask.src_port;
 	udp->val.dst_port &= udp->mask.dst_port;
 	return 0;
+error:
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg);
 }
 
 /**
- * Convert TCP item to Verbs specification.
+ * Merge TCP pattern item into flow rule handle.
  *
- * @param item[in]
- *   Item specification.
- * @param default_mask[in]
- *   Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- *   User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - No support for partial masks.
+ *
+ * @param[in, out] flow
+ *   Flow rule handle to update.
+ * @param[in] item
+ *   Pattern item to merge.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx4_flow_create_tcp(const struct rte_flow_item *item,
-		     const void *default_mask,
-		     void *data)
+mlx4_flow_merge_tcp(struct rte_flow *flow,
+		    const struct rte_flow_item *item,
+		    const struct mlx4_flow_proc_item *proc,
+		    struct rte_flow_error *error)
 {
 	const struct rte_flow_item_tcp *spec = item->spec;
-	const struct rte_flow_item_tcp *mask = item->mask;
-	struct mlx4_flow *flow = (struct mlx4_flow *)data;
+	const struct rte_flow_item_tcp *mask =
+		spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
 	struct ibv_flow_spec_tcp_udp *tcp;
-	unsigned int tcp_size = sizeof(struct ibv_flow_spec_tcp_udp);
+	const char *msg;
 
+	if (mask &&
+	    ((uint16_t)(mask->hdr.src_port + 1) > UINT16_C(1) ||
+	     (uint16_t)(mask->hdr.dst_port + 1) > UINT16_C(1))) {
+		msg = "mlx4 does not support matching partial TCP fields";
+		goto error;
+	}
+	if (!flow->ibv_attr)
+		return 0;
 	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 0;
-	tcp = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
+	tcp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
 	*tcp = (struct ibv_flow_spec_tcp_udp) {
 		.type = IBV_FLOW_SPEC_TCP,
-		.size = tcp_size,
+		.size = sizeof(*tcp),
 	};
 	if (!spec)
 		return 0;
 	tcp->val.dst_port = spec->hdr.dst_port;
 	tcp->val.src_port = spec->hdr.src_port;
-	if (!mask)
-		mask = default_mask;
 	tcp->mask.dst_port = mask->hdr.dst_port;
 	tcp->mask.src_port = mask->hdr.src_port;
 	/* Remove unwanted bits from values. */
 	tcp->val.src_port &= tcp->mask.src_port;
 	tcp->val.dst_port &= tcp->mask.dst_port;
 	return 0;
+error:
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg);
 }
 
 /**
- * Check support for a given item.
+ * Perform basic sanity checks on a pattern item.
  *
- * @param item[in]
+ * @param[in] item
  *   Item specification.
- * @param mask[in]
- *   Bit-masks covering supported fields to compare with spec, last and mask in
- *   \item.
- * @param size
- *   Bit-Mask size in bytes.
+ * @param[in] proc
+ *   Associated item-processing object.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
  *
  * @return
- *   0 on success, negative value otherwise.
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx4_flow_item_validate(const struct rte_flow_item *item,
-			const uint8_t *mask, unsigned int size)
+mlx4_flow_item_check(const struct rte_flow_item *item,
+		     const struct mlx4_flow_proc_item *proc,
+		     struct rte_flow_error *error)
 {
-	int ret = 0;
+	const uint8_t *mask;
+	unsigned int i;
 
+	/* item->last and item->mask cannot exist without item->spec. */
 	if (!item->spec && (item->mask || item->last))
-		return -1;
-	if (item->spec && !item->mask) {
-		unsigned int i;
-		const uint8_t *spec = item->spec;
-
-		for (i = 0; i < size; ++i)
-			if ((spec[i] | mask[i]) != mask[i])
-				return -1;
-	}
-	if (item->last && !item->mask) {
-		unsigned int i;
-		const uint8_t *spec = item->last;
-
-		for (i = 0; i < size; ++i)
-			if ((spec[i] | mask[i]) != mask[i])
-				return -1;
-	}
-	if (item->spec && item->last) {
-		uint8_t spec[size];
-		uint8_t last[size];
-		const uint8_t *apply = mask;
-		unsigned int i;
-
-		if (item->mask)
-			apply = item->mask;
-		for (i = 0; i < size; ++i) {
-			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
-			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
-		}
-		ret = memcmp(spec, last, size);
-	}
-	return ret;
-}
-
-static int
-mlx4_flow_validate_eth(const struct rte_flow_item *item,
-		       const uint8_t *mask, unsigned int size)
-{
-	if (item->mask) {
-		const struct rte_flow_item_eth *mask = item->mask;
-
-		if (mask->dst.addr_bytes[0] != 0xff ||
-				mask->dst.addr_bytes[1] != 0xff ||
-				mask->dst.addr_bytes[2] != 0xff ||
-				mask->dst.addr_bytes[3] != 0xff ||
-				mask->dst.addr_bytes[4] != 0xff ||
-				mask->dst.addr_bytes[5] != 0xff)
-			return -1;
-	}
-	return mlx4_flow_item_validate(item, mask, size);
-}
-
-static int
-mlx4_flow_validate_vlan(const struct rte_flow_item *item,
-			const uint8_t *mask, unsigned int size)
-{
-	if (item->mask) {
-		const struct rte_flow_item_vlan *mask = item->mask;
-
-		if (mask->tci != 0 &&
-		    ntohs(mask->tci) != 0x0fff)
-			return -1;
-	}
-	return mlx4_flow_item_validate(item, mask, size);
-}
-
-static int
-mlx4_flow_validate_ipv4(const struct rte_flow_item *item,
-			const uint8_t *mask, unsigned int size)
-{
-	if (item->mask) {
-		const struct rte_flow_item_ipv4 *mask = item->mask;
-
-		if (mask->hdr.src_addr != 0 &&
-		    mask->hdr.src_addr != 0xffffffff)
-			return -1;
-		if (mask->hdr.dst_addr != 0 &&
-		    mask->hdr.dst_addr != 0xffffffff)
-			return -1;
-	}
-	return mlx4_flow_item_validate(item, mask, size);
-}
-
-static int
-mlx4_flow_validate_udp(const struct rte_flow_item *item,
-		       const uint8_t *mask, unsigned int size)
-{
-	if (item->mask) {
-		const struct rte_flow_item_udp *mask = item->mask;
-
-		if (mask->hdr.src_port != 0 &&
-		    mask->hdr.src_port != 0xffff)
-			return -1;
-		if (mask->hdr.dst_port != 0 &&
-		    mask->hdr.dst_port != 0xffff)
-			return -1;
-	}
-	return mlx4_flow_item_validate(item, mask, size);
-}
-
-static int
-mlx4_flow_validate_tcp(const struct rte_flow_item *item,
-		       const uint8_t *mask, unsigned int size)
-{
-	if (item->mask) {
-		const struct rte_flow_item_tcp *mask = item->mask;
-
-		if (mask->hdr.src_port != 0 &&
-		    mask->hdr.src_port != 0xffff)
-			return -1;
-		if (mask->hdr.dst_port != 0 &&
-		    mask->hdr.dst_port != 0xffff)
-			return -1;
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item,
+			 "\"mask\" or \"last\" field provided without a"
+			 " corresponding \"spec\"");
+	/* No spec, no mask, no problem. */
+	if (!item->spec)
+		return 0;
+	mask = item->mask ?
+		(const uint8_t *)item->mask :
+		(const uint8_t *)proc->mask_default;
+	assert(mask);
+	/*
+	 * Single-pass check to make sure that:
+	 * - Mask is supported, no bits are set outside proc->mask_support.
+	 * - Both item->spec and item->last are included in mask.
+	 */
+	for (i = 0; i != proc->mask_sz; ++i) {
+		if (!mask[i])
+			continue;
+		if ((mask[i] | ((const uint8_t *)proc->mask_support)[i]) !=
+		    ((const uint8_t *)proc->mask_support)[i])
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				 item, "unsupported field found in \"mask\"");
+		if (item->last &&
+		    (((const uint8_t *)item->spec)[i] & mask[i]) !=
+		    (((const uint8_t *)item->last)[i] & mask[i]))
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				 item,
+				 "range between \"spec\" and \"last\""
+				 " is larger than \"mask\"");
 	}
-	return mlx4_flow_item_validate(item, mask, size);
+	return 0;
 }
 
 /** Graph of supported items and associated actions. */
-static const struct mlx4_flow_items mlx4_flow_items[] = {
+static const struct mlx4_flow_proc_item mlx4_flow_proc_item_list[] = {
 	[RTE_FLOW_ITEM_TYPE_END] = {
-		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+		.next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_ETH),
 	},
 	[RTE_FLOW_ITEM_TYPE_ETH] = {
-		.items = ITEMS(RTE_FLOW_ITEM_TYPE_VLAN,
-			       RTE_FLOW_ITEM_TYPE_IPV4),
-		.actions = valid_actions,
-		.mask = &(const struct rte_flow_item_eth){
+		.next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_VLAN,
+				       RTE_FLOW_ITEM_TYPE_IPV4),
+		.mask_support = &(const struct rte_flow_item_eth){
+			/* Only destination MAC can be matched. */
 			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
-			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
 		},
-		.default_mask = &rte_flow_item_eth_mask,
+		.mask_default = &rte_flow_item_eth_mask,
 		.mask_sz = sizeof(struct rte_flow_item_eth),
-		.validate = mlx4_flow_validate_eth,
-		.convert = mlx4_flow_create_eth,
+		.merge = mlx4_flow_merge_eth,
 		.dst_sz = sizeof(struct ibv_flow_spec_eth),
 	},
 	[RTE_FLOW_ITEM_TYPE_VLAN] = {
-		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4),
-		.actions = valid_actions,
-		.mask = &(const struct rte_flow_item_vlan){
-		/* rte_flow_item_vlan_mask is invalid for mlx4. */
-#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
-			.tci = 0x0fff,
-#else
-			.tci = 0xff0f,
-#endif
+		.next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_IPV4),
+		.mask_support = &(const struct rte_flow_item_vlan){
+			/* Only TCI VID matching is supported. */
+			.tci = RTE_BE16(0x0fff),
 		},
+		.mask_default = &rte_flow_item_vlan_mask,
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
-		.validate = mlx4_flow_validate_vlan,
-		.convert = mlx4_flow_create_vlan,
+		.merge = mlx4_flow_merge_vlan,
 		.dst_sz = 0,
 	},
 	[RTE_FLOW_ITEM_TYPE_IPV4] = {
-		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
-			       RTE_FLOW_ITEM_TYPE_TCP),
-		.actions = valid_actions,
-		.mask = &(const struct rte_flow_item_ipv4){
+		.next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_UDP,
+				       RTE_FLOW_ITEM_TYPE_TCP),
+		.mask_support = &(const struct rte_flow_item_ipv4){
 			.hdr = {
-				.src_addr = -1,
-				.dst_addr = -1,
+				.src_addr = RTE_BE32(0xffffffff),
+				.dst_addr = RTE_BE32(0xffffffff),
 			},
 		},
-		.default_mask = &rte_flow_item_ipv4_mask,
+		.mask_default = &rte_flow_item_ipv4_mask,
 		.mask_sz = sizeof(struct rte_flow_item_ipv4),
-		.validate = mlx4_flow_validate_ipv4,
-		.convert = mlx4_flow_create_ipv4,
+		.merge = mlx4_flow_merge_ipv4,
 		.dst_sz = sizeof(struct ibv_flow_spec_ipv4),
 	},
 	[RTE_FLOW_ITEM_TYPE_UDP] = {
-		.actions = valid_actions,
-		.mask = &(const struct rte_flow_item_udp){
+		.mask_support = &(const struct rte_flow_item_udp){
 			.hdr = {
-				.src_port = -1,
-				.dst_port = -1,
+				.src_port = RTE_BE16(0xffff),
+				.dst_port = RTE_BE16(0xffff),
 			},
 		},
-		.default_mask = &rte_flow_item_udp_mask,
+		.mask_default = &rte_flow_item_udp_mask,
 		.mask_sz = sizeof(struct rte_flow_item_udp),
-		.validate = mlx4_flow_validate_udp,
-		.convert = mlx4_flow_create_udp,
+		.merge = mlx4_flow_merge_udp,
 		.dst_sz = sizeof(struct ibv_flow_spec_tcp_udp),
 	},
 	[RTE_FLOW_ITEM_TYPE_TCP] = {
-		.actions = valid_actions,
-		.mask = &(const struct rte_flow_item_tcp){
+		.mask_support = &(const struct rte_flow_item_tcp){
 			.hdr = {
-				.src_port = -1,
-				.dst_port = -1,
+				.src_port = RTE_BE16(0xffff),
+				.dst_port = RTE_BE16(0xffff),
 			},
 		},
-		.default_mask = &rte_flow_item_tcp_mask,
+		.mask_default = &rte_flow_item_tcp_mask,
 		.mask_sz = sizeof(struct rte_flow_item_tcp),
-		.validate = mlx4_flow_validate_tcp,
-		.convert = mlx4_flow_create_tcp,
+		.merge = mlx4_flow_merge_tcp,
 		.dst_sz = sizeof(struct ibv_flow_spec_tcp_udp),
 	},
 };
 
 /**
- * Validate a flow supported by the NIC.
+ * Make sure a flow rule is supported and initialize associated structure.
  *
  * @param priv
  *   Pointer to private structure.
  * @param[in] attr
  *   Flow rule attributes.
- * @param[in] items
+ * @param[in] pattern
  *   Pattern specification (list terminated by the END pattern item).
  * @param[in] actions
  *   Associated actions (list terminated by the END action).
  * @param[out] error
  *   Perform verbose error reporting if not NULL.
- * @param[in, out] flow
- *   Flow structure to update.
+ * @param[in, out] addr
+ *   Buffer where the resulting flow rule handle pointer must be stored.
+ *   If NULL, stop processing after validation stage.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-priv_flow_validate(struct priv *priv,
-		   const struct rte_flow_attr *attr,
-		   const struct rte_flow_item items[],
-		   const struct rte_flow_action actions[],
-		   struct rte_flow_error *error,
-		   struct mlx4_flow *flow)
+mlx4_flow_prepare(struct priv *priv,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item pattern[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct rte_flow **addr)
 {
-	const struct mlx4_flow_items *cur_item = mlx4_flow_items;
-	struct mlx4_flow_action action = {
-		.queue = 0,
-		.drop = 0,
-	};
-
-	(void)priv;
-	if (attr->group) {
-		rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
-				   NULL,
-				   "groups are not supported");
-		return -rte_errno;
-	}
-	if (attr->priority) {
-		rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
-				   NULL,
-				   "priorities are not supported");
-		return -rte_errno;
-	}
-	if (attr->egress) {
-		rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
-				   NULL,
-				   "egress is not supported");
-		return -rte_errno;
-	}
-	if (!attr->ingress) {
-		rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
-				   NULL,
-				   "only ingress is supported");
-		return -rte_errno;
-	}
-	/* Go over items list. */
-	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
-		const struct mlx4_flow_items *token = NULL;
+	const struct rte_flow_item *item;
+	const struct rte_flow_action *action;
+	const struct mlx4_flow_proc_item *proc;
+	struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
+	struct rte_flow *flow = &temp;
+	const char *msg = NULL;
+
+	if (attr->group)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+			 NULL, "groups are not supported");
+	if (attr->priority > MLX4_FLOW_PRIORITY_LAST)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			 NULL, "maximum priority level is "
+			 MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST));
+	if (attr->egress)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+			 NULL, "egress is not supported");
+	if (!attr->ingress)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+			 NULL, "only ingress is supported");
+fill:
+	proc = mlx4_flow_proc_item_list;
+	/* Go over pattern. */
+	for (item = pattern; item->type; ++item) {
+		const struct mlx4_flow_proc_item *next = NULL;
 		unsigned int i;
 		int err;
 
-		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+		if (item->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		if (item->type == MLX4_FLOW_ITEM_TYPE_INTERNAL) {
+			flow->internal = 1;
 			continue;
-		/*
-		 * The nic can support patterns with NULL eth spec only
-		 * if eth is a single item in a rule.
-		 */
-		if (!items->spec &&
-			items->type == RTE_FLOW_ITEM_TYPE_ETH) {
-			const struct rte_flow_item *next = items + 1;
-
-			if (next->type != RTE_FLOW_ITEM_TYPE_END) {
-				rte_flow_error_set(error, ENOTSUP,
-						   RTE_FLOW_ERROR_TYPE_ITEM,
-						   items,
-						   "the rule requires"
-						   " an Ethernet spec");
-				return -rte_errno;
-			}
 		}
-		for (i = 0;
-		     cur_item->items &&
-		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
-		     ++i) {
-			if (cur_item->items[i] == items->type) {
-				token = &mlx4_flow_items[items->type];
+		if (flow->promisc || flow->allmulti) {
+			msg = "mlx4 does not support additional matching"
+				" criteria combined with indiscriminate"
+				" matching on Ethernet headers";
+			goto exit_item_not_supported;
+		}
+		for (i = 0; proc->next_item && proc->next_item[i]; ++i) {
+			if (proc->next_item[i] == item->type) {
+				next = &mlx4_flow_proc_item_list[item->type];
 				break;
 			}
 		}
-		if (!token)
-			goto exit_item_not_supported;
-		cur_item = token;
-		err = cur_item->validate(items,
-					(const uint8_t *)cur_item->mask,
-					 cur_item->mask_sz);
-		if (err)
+		if (!next)
 			goto exit_item_not_supported;
-		if (flow->ibv_attr && cur_item->convert) {
-			err = cur_item->convert(items,
-						(cur_item->default_mask ?
-						 cur_item->default_mask :
-						 cur_item->mask),
-						 flow);
+		proc = next;
+		/*
+		 * Perform basic sanity checks only once, while handle is
+		 * not allocated.
+		 */
+		if (flow == &temp) {
+			err = mlx4_flow_item_check(item, proc, error);
 			if (err)
-				goto exit_item_not_supported;
+				return err;
 		}
-		flow->offset += cur_item->dst_sz;
+		if (proc->merge) {
+			err = proc->merge(flow, item, proc, error);
+			if (err)
+				return err;
+		}
+		flow->ibv_attr_size += proc->dst_sz;
 	}
-	/* Go over actions list */
-	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
-		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
-			continue;
-		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
-			action.drop = 1;
-		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
-			const struct rte_flow_action_queue *queue =
-				(const struct rte_flow_action_queue *)
-				actions->conf;
+	/* Go over actions list. */
+	for (action = actions; action->type; ++action) {
+		switch (action->type) {
+			const struct rte_flow_action_queue *queue;
+			const struct rte_flow_action_rss *rss;
+			const struct rte_eth_rss_conf *rss_conf;
+			unsigned int i;
 
-			if (!queue || (queue->index > (priv->rxqs_n - 1)))
+		case RTE_FLOW_ACTION_TYPE_VOID:
+			continue;
+		case RTE_FLOW_ACTION_TYPE_DROP:
+			flow->drop = 1;
+			break;
+		case RTE_FLOW_ACTION_TYPE_QUEUE:
+			if (flow->rss)
+				break;
+			queue = action->conf;
+			if (queue->index >= priv->dev->data->nb_rx_queues) {
+				msg = "queue target index beyond number of"
+					" configured Rx queues";
 				goto exit_action_not_supported;
-			action.queue = 1;
-			action.queues_n = 1;
-			action.queues[0] = queue->index;
-		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
-			int i;
-			int ierr;
-			const struct rte_flow_action_rss *rss =
-				(const struct rte_flow_action_rss *)
-				actions->conf;
-
-			if (!priv->hw_rss) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "RSS cannot be used with "
-					   "the current configuration");
-				return -rte_errno;
 			}
-			if (!priv->isolated) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "RSS cannot be used without "
-					   "isolated mode");
-				return -rte_errno;
+			flow->rss = mlx4_rss_get
+				(priv, 0, mlx4_rss_hash_key_default, 1,
+				 &queue->index);
+			if (!flow->rss) {
+				msg = "not enough resources for additional"
+					" single-queue RSS context";
+				goto exit_action_not_supported;
+			}
+			break;
+		case RTE_FLOW_ACTION_TYPE_RSS:
+			if (flow->rss)
+				break;
+			rss = action->conf;
+			/* Default RSS configuration if none is provided. */
+			rss_conf =
+				rss->rss_conf ?
+				rss->rss_conf :
+				&(struct rte_eth_rss_conf){
+					.rss_key = mlx4_rss_hash_key_default,
+					.rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
+					.rss_hf = (ETH_RSS_IPV4 |
+						   ETH_RSS_NONFRAG_IPV4_TCP |
+						   ETH_RSS_IPV6 |
+						   ETH_RSS_NONFRAG_IPV6_TCP),
+				};
+			/* Sanity checks. */
+			for (i = 0; i < rss->num; ++i)
+				if (rss->queue[i] >=
+				    priv->dev->data->nb_rx_queues)
+					break;
+			if (i != rss->num) {
+				msg = "queue index target beyond number of"
+					" configured Rx queues";
+				goto exit_action_not_supported;
 			}
 			if (!rte_is_power_of_2(rss->num)) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "the number of queues "
-					   "should be power of two");
-				return -rte_errno;
+				msg = "for RSS, mlx4 requires the number of"
+					" queues to be a power of two";
+				goto exit_action_not_supported;
 			}
-			if (priv->max_rss_tbl_sz < rss->num) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "the number of queues "
-					   "is too large");
-				return -rte_errno;
+			if (rss_conf->rss_key_len !=
+			    sizeof(flow->rss->key)) {
+				msg = "mlx4 supports exactly one RSS hash key"
+					" length: "
+					MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
+				goto exit_action_not_supported;
 			}
-			/* checking indexes array */
-			ierr = 0;
-			for (i = 0; i < rss->num; ++i) {
-				int j;
-				if (rss->queue[i] >= priv->rxqs_n)
-					ierr = 1;
-				/*
-				 * Prevent the user from specifying
-				 * the same queue twice in the RSS array.
-				 */
-				for (j = i + 1; j < rss->num && !ierr; ++j)
-					if (rss->queue[j] == rss->queue[i])
-						ierr = 1;
-				if (ierr) {
-					rte_flow_error_set(
-						error,
-						ENOTSUP,
-						RTE_FLOW_ERROR_TYPE_HANDLE,
-						NULL,
-						"RSS action only supports "
-						"unique queue indices "
-						"in a list");
-					return -rte_errno;
-				}
+			for (i = 1; i < rss->num; ++i)
+				if (rss->queue[i] - rss->queue[i - 1] != 1)
+					break;
+			if (i != rss->num) {
+				msg = "mlx4 requires RSS contexts to use"
+					" consecutive queue indices only";
+				goto exit_action_not_supported;
 			}
-			action.queue = 1;
-			action.queues_n = rss->num;
-			for (i = 0; i < rss->num; ++i)
-				action.queues[i] = rss->queue[i];
-		} else {
+			if (rss->queue[0] % rss->num) {
+				msg = "mlx4 requires the first queue of a RSS"
+					" context to be aligned on a multiple"
+					" of the context size";
+				goto exit_action_not_supported;
+			}
+			flow->rss = mlx4_rss_get
+				(priv, mlx4_conv_rss_hf(rss_conf->rss_hf),
+				 rss_conf->rss_key, rss->num, rss->queue);
+			if (!flow->rss) {
+				msg = "either invalid parameters or not enough"
+					" resources for additional multi-queue"
+					" RSS context";
+				goto exit_action_not_supported;
+			}
+			break;
+		default:
 			goto exit_action_not_supported;
 		}
 	}
-	if (!action.queue && !action.drop) {
-		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "no valid action");
-		return -rte_errno;
+	if (!flow->rss && !flow->drop)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "no valid action");
+	/* Validation ends here. */
+	if (!addr) {
+		if (flow->rss)
+			mlx4_rss_put(flow->rss);
+		return 0;
 	}
+	if (flow == &temp) {
+		/* Allocate proper handle based on collected data. */
+		const struct mlx4_malloc_vec vec[] = {
+			{
+				.align = alignof(struct rte_flow),
+				.size = sizeof(*flow),
+				.addr = (void **)&flow,
+			},
+			{
+				.align = alignof(struct ibv_flow_attr),
+				.size = temp.ibv_attr_size,
+				.addr = (void **)&temp.ibv_attr,
+			},
+		};
+
+		if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec)))
+			return rte_flow_error_set
+				(error, -rte_errno,
+				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				 "flow rule handle allocation failure");
+		/* Most fields will be updated by second pass. */
+		*flow = (struct rte_flow){
+			.ibv_attr = temp.ibv_attr,
+			.ibv_attr_size = sizeof(*flow->ibv_attr),
+			.rss = temp.rss,
+		};
+		*flow->ibv_attr = (struct ibv_flow_attr){
+			.type = IBV_FLOW_ATTR_NORMAL,
+			.size = sizeof(*flow->ibv_attr),
+			.priority = attr->priority,
+			.port = priv->port,
+		};
+		goto fill;
+	}
+	*addr = flow;
 	return 0;
 exit_item_not_supported:
-	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
-			   items, "item not supported");
-	return -rte_errno;
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				  item, msg ? msg : "item not supported");
 exit_action_not_supported:
-	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
-			   actions, "action not supported");
-	return -rte_errno;
+	return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				  action, msg ? msg : "action not supported");
 }
 
 /**
@@ -769,552 +874,691 @@ exit_action_not_supported:
  * @see rte_flow_validate()
  * @see rte_flow_ops
  */
-int
+static int
 mlx4_flow_validate(struct rte_eth_dev *dev,
 		   const struct rte_flow_attr *attr,
-		   const struct rte_flow_item items[],
+		   const struct rte_flow_item pattern[],
 		   const struct rte_flow_action actions[],
 		   struct rte_flow_error *error)
 {
 	struct priv *priv = dev->data->dev_private;
-	int ret;
-	struct mlx4_flow flow = { .offset = sizeof(struct ibv_flow_attr) };
-
-	priv_lock(priv);
-	ret = priv_flow_validate(priv, attr, items, actions, error, &flow);
-	priv_unlock(priv);
-	return ret;
-}
-
-/**
- * Destroy a drop queue.
- *
- * @param priv
- *   Pointer to private structure.
- */
-static void
-mlx4_flow_destroy_drop_queue(struct priv *priv)
-{
-	if (priv->flow_drop_queue) {
-		struct rte_flow_drop *fdq = priv->flow_drop_queue;
 
-		priv->flow_drop_queue = NULL;
-		claim_zero(ibv_destroy_qp(fdq->qp));
-		claim_zero(ibv_destroy_cq(fdq->cq));
-		rte_free(fdq);
-	}
+	return mlx4_flow_prepare(priv, attr, pattern, actions, error, NULL); /* NULL: validate only */
 }
 
 /**
- * Create a single drop queue for all drop flows.
+ * Get a drop flow rule resources instance.
  *
  * @param priv
  *   Pointer to private structure.
  *
  * @return
- *   0 on success, negative value otherwise.
+ *   Pointer to drop flow resources on success, NULL otherwise and rte_errno
+ *   is set.
  */
-static int
-mlx4_flow_create_drop_queue(struct priv *priv)
+static struct mlx4_drop *
+mlx4_drop_get(struct priv *priv)
 {
-	struct ibv_qp *qp;
-	struct ibv_cq *cq;
-	struct rte_flow_drop *fdq;
+	struct mlx4_drop *drop = priv->drop;
 
-	fdq = rte_calloc(__func__, 1, sizeof(*fdq), 0);
-	if (!fdq) {
-		ERROR("Cannot allocate memory for drop struct");
-		goto err;
-	}
-	cq = ibv_exp_create_cq(priv->ctx, 1, NULL, NULL, 0,
-			      &(struct ibv_exp_cq_init_attr){
-					.comp_mask = 0,
-			      });
-	if (!cq) {
-		ERROR("Cannot create drop CQ");
-		goto err_create_cq;
-	}
-	qp = ibv_exp_create_qp(priv->ctx,
-			      &(struct ibv_exp_qp_init_attr){
-					.send_cq = cq,
-					.recv_cq = cq,
-					.cap = {
-						.max_recv_wr = 1,
-						.max_recv_sge = 1,
-					},
-					.qp_type = IBV_QPT_RAW_PACKET,
-					.comp_mask =
-						IBV_EXP_QP_INIT_ATTR_PD |
-						IBV_EXP_QP_INIT_ATTR_PORT,
-					.pd = priv->pd,
-					.port_num = priv->port,
-			      });
-	if (!qp) {
-		ERROR("Cannot create drop QP");
-		goto err_create_qp;
+	if (drop) { /* Shared instance exists: add a reference. */
+		assert(drop->refcnt);
+		assert(drop->priv == priv);
+		++drop->refcnt;
+		return drop;
 	}
-	*fdq = (struct rte_flow_drop){
-		.qp = qp,
-		.cq = cq,
+	drop = rte_malloc(__func__, sizeof(*drop), 0);
+	if (!drop)
+		goto error;
+	*drop = (struct mlx4_drop){ /* Also zeroes cq and qp. */
+		.priv = priv,
+		.refcnt = 1,
 	};
-	priv->flow_drop_queue = fdq;
-	return 0;
-err_create_qp:
-	claim_zero(ibv_destroy_cq(cq));
-err_create_cq:
-	rte_free(fdq);
-err:
-	return -1;
+	drop->cq = ibv_create_cq(priv->ctx, 1, NULL, NULL, 0);
+	if (!drop->cq)
+		goto error;
+	drop->qp = ibv_create_qp(priv->pd,
+				 &(struct ibv_qp_init_attr){
+					.send_cq = drop->cq,
+					.recv_cq = drop->cq,
+					.qp_type = IBV_QPT_RAW_PACKET,
+				 });
+	if (!drop->qp)
+		goto error;
+	priv->drop = drop;
+	return drop;
+error:
+	if (drop && drop->qp) /* drop is NULL when rte_malloc() failed. */
+		claim_zero(ibv_destroy_qp(drop->qp));
+	if (drop && drop->cq)
+		claim_zero(ibv_destroy_cq(drop->cq));
+	if (drop)
+		rte_free(drop);
+	rte_errno = ENOMEM;
+	return NULL;
 }
 
 /**
- * Get RSS parent rxq structure for given queues.
+ * Give back a drop flow rule resources instance.
  *
- * Creates a new or returns an existed one.
- *
- * @param priv
- *   Pointer to private structure.
- * @param queues
- *   queues indices array, NULL in default RSS case.
- * @param children_n
- *   the size of queues array.
- *
- * @return
- *   Pointer to a parent rxq structure, NULL on failure.
+ * @param drop
+ *   Pointer to drop flow rule resources.
  */
-static struct rxq *
-priv_parent_get(struct priv *priv,
-		uint16_t queues[],
-		uint16_t children_n,
-		struct rte_flow_error *error)
+static void
+mlx4_drop_put(struct mlx4_drop *drop)
 {
-	unsigned int i;
-	struct rxq *parent;
-
-	for (parent = LIST_FIRST(&priv->parents);
-	     parent;
-	     parent = LIST_NEXT(parent, next)) {
-		unsigned int same = 0;
-		unsigned int overlap = 0;
-
-		/*
-		 * Find out whether an appropriate parent queue already exists
-		 * and can be reused, otherwise make sure there are no overlaps.
-		 */
-		for (i = 0; i < children_n; ++i) {
-			unsigned int j;
-
-			for (j = 0; j < parent->rss.queues_n; ++j) {
-				if (parent->rss.queues[j] != queues[i])
-					continue;
-				++overlap;
-				if (i == j)
-					++same;
-			}
-		}
-		if (same == children_n &&
-			children_n == parent->rss.queues_n)
-			return parent;
-		else if (overlap)
-			goto error;
-	}
-	/* Exclude the cases when some QPs were created without RSS */
-	for (i = 0; i < children_n; ++i) {
-		struct rxq *rxq = (*priv->rxqs)[queues[i]];
-		if (rxq->qp)
-			goto error;
-	}
-	parent = priv_parent_create(priv, queues, children_n);
-	if (!parent) {
-		rte_flow_error_set(error,
-				   ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "flow rule creation failure");
-		return NULL;
-	}
-	return parent;
-
-error:
-	rte_flow_error_set(error,
-			   EEXIST,
-			   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			   NULL,
-			   "sharing a queue between several"
-			   " RSS groups is not supported");
-	return NULL;
+	assert(drop->refcnt);
+	if (--drop->refcnt)
+		return; /* Other references remain. */
+	drop->priv->drop = NULL; /* Last reference: release everything. */
+	claim_zero(ibv_destroy_qp(drop->qp));
+	claim_zero(ibv_destroy_cq(drop->cq));
+	rte_free(drop);
 }
 
 /**
- * Complete flow rule creation.
+ * Toggle a configured flow rule.
  *
  * @param priv
  *   Pointer to private structure.
- * @param ibv_attr
- *   Verbs flow attributes.
- * @param action
- *   Target action structure.
+ * @param flow
+ *   Flow rule handle to toggle.
+ * @param enable
+ *   Whether associated Verbs flow must be created or removed.
  * @param[out] error
  *   Perform verbose error reporting if not NULL.
  *
  * @return
- *   A flow if the rule could be created.
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-static struct rte_flow *
-priv_flow_create_action_queue(struct priv *priv,
-			      struct ibv_flow_attr *ibv_attr,
-			      struct mlx4_flow_action *action,
-			      struct rte_flow_error *error)
+static int
+mlx4_flow_toggle(struct priv *priv,
+		 struct rte_flow *flow,
+		 int enable,
+		 struct rte_flow_error *error)
 {
-	struct ibv_qp *qp;
-	struct rte_flow *rte_flow;
-	struct rxq *rxq_parent = NULL;
+	struct ibv_qp *qp = NULL;
+	const char *msg;
+	int err;
 
-	assert(priv->pd);
-	assert(priv->ctx);
-	rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow), 0);
-	if (!rte_flow) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "cannot allocate flow memory");
-		return NULL;
+	if (!enable) {
+		if (!flow->ibv_flow)
+			return 0; /* Already disabled. */
+		claim_zero(ibv_destroy_flow(flow->ibv_flow));
+		flow->ibv_flow = NULL;
+		if (flow->drop)
+			mlx4_drop_put(priv->drop);
+		else if (flow->rss)
+			mlx4_rss_detach(flow->rss);
+		return 0;
 	}
-	if (action->drop) {
-		qp = priv->flow_drop_queue ? priv->flow_drop_queue->qp : NULL;
-	} else {
-		int ret;
+	assert(flow->ibv_attr);
+	if (!flow->internal &&
+	    !priv->isolated &&
+	    flow->ibv_attr->priority == MLX4_FLOW_PRIORITY_LAST) {
+		if (flow->ibv_flow) {
+			claim_zero(ibv_destroy_flow(flow->ibv_flow));
+			flow->ibv_flow = NULL;
+			if (flow->drop)
+				mlx4_drop_put(priv->drop);
+			else if (flow->rss)
+				mlx4_rss_detach(flow->rss);
+		}
+		err = EACCES;
+		msg = ("priority level "
+		       MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST)
+		       " is reserved when not in isolated mode");
+		goto error;
+	}
+	if (flow->rss) {
+		struct mlx4_rss *rss = flow->rss;
+		int missing = 0;
 		unsigned int i;
-		struct rxq *rxq = NULL;
 
-		if (action->queues_n > 1) {
-			rxq_parent = priv_parent_get(priv, action->queues,
-						     action->queues_n, error);
-			if (!rxq_parent)
-				goto error;
+		/* Stop at the first nonexistent target queue. */
+		for (i = 0; i != rss->queues; ++i)
+			if (rss->queue_id[i] >=
+			    priv->dev->data->nb_rx_queues ||
+			    !priv->dev->data->rx_queues[rss->queue_id[i]]) {
+				missing = 1;
+				break;
+			}
+		if (flow->ibv_flow) {
+			if (missing ^ !flow->drop)
+				return 0; /* Drop state matches queue state. */
+			/* Verbs flow needs updating. */
+			claim_zero(ibv_destroy_flow(flow->ibv_flow));
+			flow->ibv_flow = NULL;
+			if (flow->drop)
+				mlx4_drop_put(priv->drop);
+			else
+				mlx4_rss_detach(rss);
 		}
-		for (i = 0; i < action->queues_n; ++i) {
-			rxq = (*priv->rxqs)[action->queues[i]];
-			/*
-			 * In case of isolated mode we postpone
-			 * ibv receive queue creation till the first
-			 * rte_flow rule will be applied on that queue.
-			 */
-			if (!rxq->qp) {
-				assert(priv->isolated);
-				ret = rxq_create_qp(rxq, rxq->elts_n,
-						    0, 0, rxq_parent);
-				if (ret) {
-					rte_flow_error_set(
-						error,
-						ENOMEM,
-						RTE_FLOW_ERROR_TYPE_HANDLE,
-						NULL,
-						"flow rule creation failure");
-					goto error;
-				}
+		if (!missing) {
+			err = mlx4_rss_attach(rss);
+			if (err) {
+				err = -err;
+				msg = "cannot create indirection table or hash"
+					" QP to associate flow rule with";
+				goto error;
 			}
+			qp = rss->qp;
 		}
-		qp = action->queues_n > 1 ? rxq_parent->qp : rxq->qp;
-		rte_flow->qp = qp;
+		/* A missing target queue drops traffic implicitly. */
+		flow->drop = missing;
 	}
-	rte_flow->ibv_attr = ibv_attr;
-	if (!priv->started)
-		return rte_flow;
-	rte_flow->ibv_flow = ibv_create_flow(qp, rte_flow->ibv_attr);
-	if (!rte_flow->ibv_flow) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "flow rule creation failure");
-		goto error;
+	if (flow->ibv_flow) /* Already enabled: take no extra reference. */
+		return 0;
+	if (flow->drop) {
+		mlx4_drop_get(priv);
+		if (!priv->drop) {
+			err = rte_errno;
+			msg = "resources for drop flow rule cannot be created";
+			goto error;
+		}
+		qp = priv->drop->qp;
 	}
-	return rte_flow;
-
+	assert(qp);
+	flow->ibv_flow = ibv_create_flow(qp, flow->ibv_attr);
+	if (flow->ibv_flow)
+		return 0;
+	err = errno; /* Save before cleanup calls can overwrite it. */
+	if (flow->drop)
+		mlx4_drop_put(priv->drop);
+	else if (flow->rss)
+		mlx4_rss_detach(flow->rss);
+	msg = "flow rule rejected by device";
 error:
-	if (rxq_parent)
-		rxq_parent_cleanup(rxq_parent);
-	rte_free(rte_flow);
-	return NULL;
+	return rte_flow_error_set
+		(error, err, RTE_FLOW_ERROR_TYPE_HANDLE, flow, msg);
 }
 
 /**
- * Convert a flow.
- *
- * @param priv
- *   Pointer to private structure.
- * @param[in] attr
- *   Flow rule attributes.
- * @param[in] items
- *   Pattern specification (list terminated by the END pattern item).
- * @param[in] actions
- *   Associated actions (list terminated by the END action).
- * @param[out] error
- *   Perform verbose error reporting if not NULL.
+ * Create a flow.
  *
- * @return
- *   A flow on success, NULL otherwise.
+ * @see rte_flow_create()
+ * @see rte_flow_ops
  */
 static struct rte_flow *
-priv_flow_create(struct priv *priv,
+mlx4_flow_create(struct rte_eth_dev *dev,
 		 const struct rte_flow_attr *attr,
-		 const struct rte_flow_item items[],
+		 const struct rte_flow_item pattern[],
 		 const struct rte_flow_action actions[],
 		 struct rte_flow_error *error)
 {
-	struct rte_flow *rte_flow;
-	struct mlx4_flow_action action;
-	struct mlx4_flow flow = { .offset = sizeof(struct ibv_flow_attr), };
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow *flow;
 	int err;
 
-	err = priv_flow_validate(priv, attr, items, actions, error, &flow);
+	err = mlx4_flow_prepare(priv, attr, pattern, actions, error, &flow);
 	if (err)
 		return NULL;
-	flow.ibv_attr = rte_malloc(__func__, flow.offset, 0);
-	if (!flow.ibv_attr) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "cannot allocate ibv_attr memory");
-		return NULL;
-	}
-	flow.offset = sizeof(struct ibv_flow_attr);
-	*flow.ibv_attr = (struct ibv_flow_attr){
-		.comp_mask = 0,
-		.type = IBV_FLOW_ATTR_NORMAL,
-		.size = sizeof(struct ibv_flow_attr),
-		.priority = attr->priority,
-		.num_of_specs = 0,
-		.port = priv->port,
-		.flags = 0,
-	};
-	claim_zero(priv_flow_validate(priv, attr, items, actions,
-				      error, &flow));
-	action = (struct mlx4_flow_action){
-		.queue = 0,
-		.drop = 0,
-	};
-	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
-		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
-			continue;
-		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
-			action.queue = 1;
-			action.queues_n = 1;
-			action.queues[0] =
-				((const struct rte_flow_action_queue *)
-				 actions->conf)->index;
-		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
-			action.drop = 1;
-		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
-			unsigned int i;
-			const struct rte_flow_action_rss *rss =
-				(const struct rte_flow_action_rss *)
-				 actions->conf;
+	err = mlx4_flow_toggle(priv, flow, priv->started, error);
+	if (!err) {
+		struct rte_flow *curr = LIST_FIRST(&priv->flows);
 
-			action.queue = 1;
-			action.queues_n = rss->num;
-			for (i = 0; i < rss->num; ++i)
-				action.queues[i] = rss->queue[i];
+		/* New rules are inserted after internal ones. */
+		if (!curr || !curr->internal) {
+			LIST_INSERT_HEAD(&priv->flows, flow, next);
 		} else {
-			rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions, "unsupported action");
-			goto exit;
+			while (LIST_NEXT(curr, next) &&
+			       LIST_NEXT(curr, next)->internal)
+				curr = LIST_NEXT(curr, next);
+			LIST_INSERT_AFTER(curr, flow, next);
 		}
+		return flow;
 	}
-	rte_flow = priv_flow_create_action_queue(priv, flow.ibv_attr,
-						 &action, error);
-	if (rte_flow)
-		return rte_flow;
-exit:
-	rte_free(flow.ibv_attr);
+	if (flow->rss) /* Toggle failed: release what prepare acquired. */
+		mlx4_rss_put(flow->rss);
+	rte_flow_error_set(error, -err, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			   error ? error->message : NULL); /* NULL-safe */
+	rte_free(flow);
 	return NULL;
 }
 
 /**
- * Create a flow.
+ * Configure isolated mode.
  *
- * @see rte_flow_create()
- * @see rte_flow_ops
- */
-struct rte_flow *
-mlx4_flow_create(struct rte_eth_dev *dev,
-		 const struct rte_flow_attr *attr,
-		 const struct rte_flow_item items[],
-		 const struct rte_flow_action actions[],
-		 struct rte_flow_error *error)
-{
-	struct priv *priv = dev->data->dev_private;
-	struct rte_flow *flow;
-
-	priv_lock(priv);
-	flow = priv_flow_create(priv, attr, items, actions, error);
-	if (flow) {
-		LIST_INSERT_HEAD(&priv->flows, flow, next);
-		DEBUG("Flow created %p", (void *)flow);
-	}
-	priv_unlock(priv);
-	return flow;
-}
-
-/**
  * @see rte_flow_isolate()
- *
- * Must be done before calling dev_configure().
- *
- * @param dev
- *   Pointer to the ethernet device structure.
- * @param enable
- *   Nonzero to enter isolated mode, attempt to leave it otherwise.
- * @param[out] error
- *   Perform verbose error reporting if not NULL. PMDs initialize this
- *   structure in case of error only.
- *
- * @return
- *   0 on success, a negative value on error.
+ * @see rte_flow_ops
  */
-int
+static int
 mlx4_flow_isolate(struct rte_eth_dev *dev,
 		  int enable,
 		  struct rte_flow_error *error)
 {
 	struct priv *priv = dev->data->dev_private;
 
-	priv_lock(priv);
-	if (priv->rxqs) {
-		rte_flow_error_set(error, ENOTSUP,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				   NULL, "isolated mode must be set"
-				   " before configuring the device");
-		priv_unlock(priv);
+	if (!!enable == !!priv->isolated)
+		return 0; /* Requested mode is already active. */
+	priv->isolated = !!enable;
+	if (mlx4_flow_sync(priv, error)) {
+		priv->isolated = !enable; /* Roll back on sync failure. */
 		return -rte_errno;
 	}
-	priv->isolated = !!enable;
-	priv_unlock(priv);
 	return 0;
 }
 
 /**
- * Destroy a flow.
+ * Destroy a flow rule.
  *
- * @param priv
- *   Pointer to private structure.
- * @param[in] flow
- *   Flow to destroy.
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
  */
-static void
-priv_flow_destroy(struct priv *priv, struct rte_flow *flow)
+static int
+mlx4_flow_destroy(struct rte_eth_dev *dev,
+		  struct rte_flow *flow,
+		  struct rte_flow_error *error)
 {
-	(void)priv;
+	struct priv *priv = dev->data->dev_private;
+	int err = mlx4_flow_toggle(priv, flow, 0, error); /* Disable. */
+
+	if (err)
+		return err; /* Rule is left in the list untouched. */
 	LIST_REMOVE(flow, next);
-	if (flow->ibv_flow)
-		claim_zero(ibv_destroy_flow(flow->ibv_flow));
-	rte_free(flow->ibv_attr);
-	DEBUG("Flow destroyed %p", (void *)flow);
+	if (flow->rss)
+		mlx4_rss_put(flow->rss);
 	rte_free(flow);
+	return 0;
 }
 
 /**
- * Destroy a flow.
+ * Destroy user-configured flow rules.
  *
- * @see rte_flow_destroy()
+ * This function skips internal flow rules.
+ *
+ * @see rte_flow_flush()
  * @see rte_flow_ops
  */
-int
-mlx4_flow_destroy(struct rte_eth_dev *dev,
-		  struct rte_flow *flow,
-		  struct rte_flow_error *error)
+static int
+mlx4_flow_flush(struct rte_eth_dev *dev,
+		struct rte_flow_error *error)
 {
 	struct priv *priv = dev->data->dev_private;
+	struct rte_flow *flow = LIST_FIRST(&priv->flows);
+
+	while (flow) {
+		struct rte_flow *next = LIST_NEXT(flow, next);
 
-	(void)error;
-	priv_lock(priv);
-	priv_flow_destroy(priv, flow);
-	priv_unlock(priv);
+		if (!flow->internal)
+			mlx4_flow_destroy(dev, flow, error);
+		flow = next; /* Saved above: flow may have been freed. */
+	}
 	return 0;
 }
 
 /**
- * Destroy all flows.
+ * Helper function to determine the next configured VLAN filter.
  *
  * @param priv
  *   Pointer to private structure.
+ * @param vlan
+ *   VLAN ID to use as a starting point.
+ *
+ * @return
+ *   Next configured VLAN ID or a high value (>= 4096) if there is none.
  */
-static void
-priv_flow_flush(struct priv *priv)
+static uint16_t
+mlx4_flow_internal_next_vlan(struct priv *priv, uint16_t vlan)
+{
+	while (vlan < 4096) {
+		if (priv->dev->data->vlan_filter_conf.ids[vlan / 64] &
+		    (UINT64_C(1) << (vlan % 64))) /* One bit per VLAN ID. */
+			return vlan;
+		++vlan;
+	}
+	return vlan; /* >= 4096: no configured filter from start point. */
+}
+
+/**
+ * Generate internal flow rules.
+ *
+ * Various flow rules are created depending on the mode the device is in:
+ *
+ * 1. Promiscuous: port MAC + catch-all (VLAN filtering is ignored).
+ * 2. All multicast: port MAC/VLAN + catch-all multicast.
+ * 3. Otherwise: port MAC/VLAN + broadcast MAC/VLAN.
+ *
+ * About MAC flow rules:
+ *
+ * - MAC flow rules are generated from @p dev->data->mac_addrs
+ *   (@p priv->mac array).
+ * - An additional flow rule for Ethernet broadcasts is also generated.
+ * - All these are per-VLAN if @p dev->data->dev_conf.rxmode.hw_vlan_filter
+ *   is enabled and VLAN filters are configured.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 {
-	while (!LIST_EMPTY(&priv->flows)) {
-		struct rte_flow *flow;
+	struct rte_flow_attr attr = {
+		.priority = MLX4_FLOW_PRIORITY_LAST,
+		.ingress = 1,
+	};
+	struct rte_flow_item_eth eth_spec;
+	const struct rte_flow_item_eth eth_mask = {
+		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+	};
+	const struct rte_flow_item_eth eth_allmulti = {
+		.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
+	};
+	struct rte_flow_item_vlan vlan_spec;
+	const struct rte_flow_item_vlan vlan_mask = {
+		.tci = RTE_BE16(0x0fff),
+	};
+	struct rte_flow_item pattern[] = {
+		{
+			.type = MLX4_FLOW_ITEM_TYPE_INTERNAL,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+			.spec = &eth_spec,
+			.mask = &eth_mask,
+		},
+		{
+			/* Replaced with VLAN if filtering is enabled. */
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+	};
+	/*
+	 * Round number of queues down to their previous power of 2 to
+	 * comply with RSS context limitations. Extra queues silently do not
+	 * get RSS by default.
+	 */
+	uint32_t queues =
+		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
+	alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
+		[offsetof(struct rte_flow_action_rss, queue) +
+		 sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
+	struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+	struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_RSS,
+			.conf = rss_conf,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	struct ether_addr *rule_mac = &eth_spec.dst;
+	rte_be16_t *rule_vlan =
+		priv->dev->data->dev_conf.rxmode.hw_vlan_filter &&
+		!priv->dev->data->promiscuous ?
+		&vlan_spec.tci :
+		NULL;
+	int broadcast =
+		!priv->dev->data->promiscuous &&
+		!priv->dev->data->all_multicast;
+	uint16_t vlan = 0;
+	struct rte_flow *flow;
+	unsigned int i;
+	int err = 0;
 
-		flow = LIST_FIRST(&priv->flows);
-		priv_flow_destroy(priv, flow);
+	/* Nothing to be done if there are no Rx queues. */
+	if (!queues)
+		goto error; /* err is 0: only stale rules get cleaned up. */
+	/* Prepare default RSS configuration. */
+	*rss_conf = (struct rte_flow_action_rss){
+		.rss_conf = NULL, /* Rely on default fallback settings. */
+		.num = queues,
+	};
+	for (i = 0; i != queues; ++i)
+		rss_conf->queue[i] = i;
+	/*
+	 * Set up VLAN item if filtering is enabled and at least one VLAN
+	 * filter is configured.
+	 */
+	if (rule_vlan) {
+		vlan = mlx4_flow_internal_next_vlan(priv, 0);
+		if (vlan < 4096) {
+			pattern[2] = (struct rte_flow_item){
+				.type = RTE_FLOW_ITEM_TYPE_VLAN,
+				.spec = &vlan_spec,
+				.mask = &vlan_mask,
+			};
+next_vlan:
+			*rule_vlan = rte_cpu_to_be_16(vlan);
+		} else {
+			rule_vlan = NULL;
+		}
 	}
+	for (i = 0; i != RTE_DIM(priv->mac) + broadcast; ++i) {
+		const struct ether_addr *mac;
+
+		/* Broadcasts are handled by an extra iteration. */
+		if (i < RTE_DIM(priv->mac))
+			mac = &priv->mac[i];
+		else
+			mac = &eth_mask.dst;
+		if (is_zero_ether_addr(mac))
+			continue;
+		/* Check if MAC flow rule is already present. */
+		for (flow = LIST_FIRST(&priv->flows);
+		     flow && flow->internal;
+		     flow = LIST_NEXT(flow, next)) {
+			const struct ibv_flow_spec_eth *eth =
+				(const void *)((uintptr_t)flow->ibv_attr +
+					       sizeof(*flow->ibv_attr));
+			unsigned int j;
+
+			if (!flow->mac)
+				continue;
+			assert(flow->ibv_attr->type == IBV_FLOW_ATTR_NORMAL);
+			assert(flow->ibv_attr->num_of_specs == 1);
+			assert(eth->type == IBV_FLOW_SPEC_ETH);
+			assert(flow->rss);
+			if (rule_vlan &&
+			    (eth->val.vlan_tag != *rule_vlan ||
+			     eth->mask.vlan_tag != RTE_BE16(0x0fff)))
+				continue;
+			if (!rule_vlan && eth->mask.vlan_tag)
+				continue;
+			for (j = 0; j != sizeof(mac->addr_bytes); ++j)
+				if (eth->val.dst_mac[j] != mac->addr_bytes[j] ||
+				    eth->mask.dst_mac[j] != UINT8_C(0xff) ||
+				    eth->val.src_mac[j] != UINT8_C(0x00) ||
+				    eth->mask.src_mac[j] != UINT8_C(0x00))
+					break;
+			if (j != sizeof(mac->addr_bytes))
+				continue;
+			if (flow->rss->queues != queues ||
+			    memcmp(flow->rss->queue_id, rss_conf->queue,
+				   queues * sizeof(flow->rss->queue_id[0])))
+				continue;
+			break; /* Matching rule found. */
+		}
+		if (!flow || !flow->internal) {
+			/* Not found, create a new flow rule. */
+			memcpy(rule_mac, mac, sizeof(*mac));
+			flow = mlx4_flow_create(priv->dev, &attr, pattern,
+						actions, error);
+			if (!flow) {
+				err = -rte_errno;
+				goto error;
+			}
+		}
+		flow->select = 1; /* Spared by the cleanup loop below. */
+		flow->mac = 1;
+	}
+	if (rule_vlan) {
+		vlan = mlx4_flow_internal_next_vlan(priv, vlan + 1);
+		if (vlan < 4096)
+			goto next_vlan; /* Repeat MAC rules for this VLAN. */
+	}
+	/* Take care of promiscuous and all multicast flow rules. */
+	if (!broadcast) {
+		for (flow = LIST_FIRST(&priv->flows);
+		     flow && flow->internal;
+		     flow = LIST_NEXT(flow, next)) {
+			if (priv->dev->data->promiscuous) {
+				if (flow->promisc)
+					break;
+			} else {
+				assert(priv->dev->data->all_multicast);
+				if (flow->allmulti)
+					break;
+			}
+		}
+		if (flow && flow->internal) {
+			assert(flow->rss);
+			if (flow->rss->queues != queues ||
+			    memcmp(flow->rss->queue_id, rss_conf->queue,
+				   queues * sizeof(flow->rss->queue_id[0])))
+				flow = NULL; /* Queue set differs: recreate. */
+		}
+		if (!flow || !flow->internal) {
+			/* Not found, create a new flow rule. */
+			if (priv->dev->data->promiscuous) {
+				pattern[1].spec = NULL;
+				pattern[1].mask = NULL;
+			} else {
+				assert(priv->dev->data->all_multicast);
+				pattern[1].spec = &eth_allmulti;
+				pattern[1].mask = &eth_allmulti;
+			}
+			pattern[2] = pattern[3]; /* END replaces any VLAN item. */
+			flow = mlx4_flow_create(priv->dev, &attr, pattern,
+						actions, error);
+			if (!flow) {
+				err = -rte_errno;
+				goto error;
+			}
+		}
+		assert(flow->promisc || flow->allmulti);
+		flow->select = 1;
+	}
+error:
+	/* Clear selection and clean up stale internal flow rules. */
+	flow = LIST_FIRST(&priv->flows);
+	while (flow && flow->internal) {
+		struct rte_flow *next = LIST_NEXT(flow, next);
+
+		if (!flow->select)
+			claim_zero(mlx4_flow_destroy(priv->dev, flow, error));
+		else
+			flow->select = 0;
+		flow = next;
+	}
+	return err;
 }
 
 /**
- * Destroy all flows.
+ * Synchronize flow rules.
  *
- * @see rte_flow_flush()
- * @see rte_flow_ops
+ * This function synchronizes flow rules with the state of the device by
+ * taking into account isolated mode and whether target queues are
+ * configured.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx4_flow_flush(struct rte_eth_dev *dev,
-		struct rte_flow_error *error)
+mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error)
 {
-	struct priv *priv = dev->data->dev_private;
+	struct rte_flow *flow;
+	int ret;
 
-	(void)error;
-	priv_lock(priv);
-	priv_flow_flush(priv);
-	priv_unlock(priv);
+	/* Internal flow rules are guaranteed to come first in the list. */
+	if (priv->isolated) {
+		/*
+		 * Get rid of them in isolated mode, stop at the first
+		 * non-internal rule found.
+		 */
+		for (flow = LIST_FIRST(&priv->flows);
+		     flow && flow->internal;
+		     flow = LIST_FIRST(&priv->flows))
+			claim_zero(mlx4_flow_destroy(priv->dev, flow, error));
+	} else {
+		/* Refresh internal rules. */
+		ret = mlx4_flow_internal(priv, error);
+		if (ret)
+			return ret;
+	}
+	/* Toggle all rules left in the list according to priv->started. */
+	LIST_FOREACH(flow, &priv->flows, next) {
+		ret = mlx4_flow_toggle(priv, flow, priv->started, error);
+		if (ret)
+			return ret;
+	}
+	if (!priv->started)
+		assert(!priv->drop);
 	return 0;
 }
 
 /**
- * Remove all flows.
+ * Clean up all flow rules.
  *
- * Called by dev_stop() to remove all flows.
+ * Unlike mlx4_flow_flush(), this function destroys every rule left in
+ * priv->flows, whether internal or user-configured, until the list is empty.
  *
  * @param priv
  *   Pointer to private structure.
  */
 void
-mlx4_priv_flow_stop(struct priv *priv)
+mlx4_flow_clean(struct priv *priv)
 {
 	struct rte_flow *flow;
 
-	for (flow = LIST_FIRST(&priv->flows);
-	     flow;
-	     flow = LIST_NEXT(flow, next)) {
-		claim_zero(ibv_destroy_flow(flow->ibv_flow));
-		flow->ibv_flow = NULL;
-		DEBUG("Flow %p removed", (void *)flow);
-	}
-	mlx4_flow_destroy_drop_queue(priv);
+	while ((flow = LIST_FIRST(&priv->flows)))
+		mlx4_flow_destroy(priv->dev, flow, NULL);
+	assert(LIST_EMPTY(&priv->rss));
 }
 
+static const struct rte_flow_ops mlx4_flow_ops = {
+	.validate = mlx4_flow_validate,
+	.create = mlx4_flow_create,
+	.destroy = mlx4_flow_destroy,
+	.flush = mlx4_flow_flush,
+	.isolate = mlx4_flow_isolate,
+};
+
 /**
- * Add all flows.
+ * Manage filter operations.
  *
- * @param priv
- *   Pointer to private structure.
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Operation-specific pointer; receives &mlx4_flow_ops on generic GET.
  *
  * @return
- *   0 on success, a errno value otherwise and rte_errno is set.
+ *   0 on success, negative errno value otherwise and rte_errno is set.
  */
 int
-mlx4_priv_flow_start(struct priv *priv)
+mlx4_filter_ctrl(struct rte_eth_dev *dev,
+		 enum rte_filter_type filter_type,
+		 enum rte_filter_op filter_op,
+		 void *arg)
 {
-	int ret;
-	struct ibv_qp *qp;
-	struct rte_flow *flow;
-
-	ret = mlx4_flow_create_drop_queue(priv);
-	if (ret)
-		return -1;
-	for (flow = LIST_FIRST(&priv->flows);
-	     flow;
-	     flow = LIST_NEXT(flow, next)) {
-		qp = flow->qp ? flow->qp : priv->flow_drop_queue->qp;
-		flow->ibv_flow = ibv_create_flow(qp, flow->ibv_attr);
-		if (!flow->ibv_flow) {
-			DEBUG("Flow %p cannot be applied", (void *)flow);
-			rte_errno = EINVAL;
-			return rte_errno;
-		}
-		DEBUG("Flow %p applied", (void *)flow);
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			break;
+		*(const void **)arg = &mlx4_flow_ops;
+		return 0;
+	default:
+		ERROR("%p: filter type (%d) not supported",
+		      (void *)dev, filter_type);
+		break;
 	}
-	return 0;
+	rte_errno = ENOTSUP;
+	return -rte_errno;
 }
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index beabcf2d..651fd37b 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -2,7 +2,7 @@
  *   BSD LICENSE
  *
  *   Copyright 2017 6WIND S.A.
- *   Copyright 2017 Mellanox.
+ *   Copyright 2017 Mellanox
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -34,12 +34,10 @@
 #ifndef RTE_PMD_MLX4_FLOW_H_
 #define RTE_PMD_MLX4_FLOW_H_
 
-#include <stddef.h>
 #include <stdint.h>
 #include <sys/queue.h>
 
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+/* Verbs headers do not support -pedantic. */
 #ifdef PEDANTIC
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
@@ -48,61 +46,40 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_eth_ctrl.h>
+#include <rte_ethdev.h>
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 #include <rte_byteorder.h>
 
-#include "mlx4.h"
+/** Last and lowest priority level for a flow rule. */
+#define MLX4_FLOW_PRIORITY_LAST UINT32_C(0xfff)
 
+/** Meta pattern item used to distinguish internal rules. */
+#define MLX4_FLOW_ITEM_TYPE_INTERNAL ((enum rte_flow_item_type)-1)
+
+/** PMD-specific (mlx4) definition of a flow rule handle. */
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
 	struct ibv_flow *ibv_flow; /**< Verbs flow. */
 	struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */
-	struct ibv_qp *qp; /**< Verbs queue pair. */
+	uint32_t ibv_attr_size; /**< Size of Verbs attributes. */
+	uint32_t select:1; /**< Temporary mark: rule is kept by list cleanup. */
+	uint32_t internal:1; /**< Internal flow rule outside isolated mode. */
+	uint32_t mac:1; /**< Rule associated with a configured MAC address. */
+	uint32_t promisc:1; /**< This rule matches everything. */
+	uint32_t allmulti:1; /**< This rule matches all multicast traffic. */
+	uint32_t drop:1; /**< This rule drops packets. */
+	struct mlx4_rss *rss; /**< Rx target. */
 };
 
-int
-mlx4_flow_validate(struct rte_eth_dev *dev,
-		   const struct rte_flow_attr *attr,
-		   const struct rte_flow_item items[],
-		   const struct rte_flow_action actions[],
-		   struct rte_flow_error *error);
-
-struct rte_flow *
-mlx4_flow_create(struct rte_eth_dev *dev,
-		 const struct rte_flow_attr *attr,
-		 const struct rte_flow_item items[],
-		 const struct rte_flow_action actions[],
-		 struct rte_flow_error *error);
-
-int
-mlx4_flow_destroy(struct rte_eth_dev *dev,
-		  struct rte_flow *flow,
-		  struct rte_flow_error *error);
-
-int
-mlx4_flow_flush(struct rte_eth_dev *dev,
-		struct rte_flow_error *error);
-
-/** Structure to pass to the conversion function. */
-struct mlx4_flow {
-	struct ibv_flow_attr *ibv_attr; /**< Verbs attribute. */
-	unsigned int offset; /**< Offset in bytes in the ibv_attr buffer. */
-};
-
-int
-mlx4_flow_isolate(struct rte_eth_dev *dev,
-		  int enable,
-		  struct rte_flow_error *error);
-
-struct mlx4_flow_action {
-	uint32_t drop:1; /**< Target is a drop queue. */
-	uint32_t queue:1; /**< Target is a receive queue. */
-	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queue indices to use. */
-	uint16_t queues_n; /**< Number of entries in queue[] */
-};
+/* mlx4_flow.c */
 
-int mlx4_priv_flow_start(struct priv *priv);
-void mlx4_priv_flow_stop(struct priv *priv);
+int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
+void mlx4_flow_clean(struct priv *priv);
+int mlx4_filter_ctrl(struct rte_eth_dev *dev,
+		     enum rte_filter_type filter_type,
+		     enum rte_filter_op filter_op,
+		     void *arg);
 
 #endif /* RTE_PMD_MLX4_FLOW_H_ */
diff --git a/drivers/net/mlx4/mlx4_intr.c b/drivers/net/mlx4/mlx4_intr.c
new file mode 100644
index 00000000..b17d109a
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_intr.c
@@ -0,0 +1,397 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Interrupts handling for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_alarm.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_io.h>
+#include <rte_interrupts.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+static int mlx4_link_status_check(struct priv *priv);
+
+/**
+ * Release the resources backing the Rx interrupt vector.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+static void
+mlx4_rx_intr_vec_disable(struct priv *priv)
+{
+	struct rte_intr_handle *const handle = &priv->intr_handle;
+
+	rte_intr_free_epoll_fd(handle);
+	free(handle->intr_vec);
+	handle->intr_vec = NULL;
+	handle->nb_efd = 0;
+}
+
+/**
+ * Allocate queue vector and fill epoll fd list for Rx interrupts.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_rx_intr_vec_enable(struct priv *priv)
+{
+	unsigned int i;
+	unsigned int rxqs_n = priv->dev->data->nb_rx_queues;
+	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+	unsigned int count = 0;
+	struct rte_intr_handle *intr_handle = &priv->intr_handle;
+
+	mlx4_rx_intr_vec_disable(priv);
+	intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
+	if (n && intr_handle->intr_vec == NULL) {
+		rte_errno = ENOMEM;
+		ERROR("failed to allocate memory for interrupt vector,"
+		      " Rx interrupts will not be supported");
+		return -rte_errno;
+	}
+	for (i = 0; i != n; ++i) {
+		struct rxq *rxq = priv->dev->data->rx_queues[i];
+
+		/* Skip queues that cannot request interrupts. */
+		if (!rxq || !rxq->channel) {
+			/* Use invalid intr_vec[] index to disable entry. */
+			intr_handle->intr_vec[i] =
+				RTE_INTR_VEC_RXTX_OFFSET +
+				RTE_MAX_RXTX_INTR_VEC_ID;
+			continue;
+		}
+		if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
+			rte_errno = E2BIG;
+			ERROR("too many Rx queues for interrupt vector size"
+			      " (%d), Rx interrupts cannot be enabled",
+			      RTE_MAX_RXTX_INTR_VEC_ID);
+			mlx4_rx_intr_vec_disable(priv);
+			return -rte_errno;
+		}
+		intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
+		intr_handle->efds[count] = rxq->channel->fd;
+		count++;
+	}
+	if (!count)
+		mlx4_rx_intr_vec_disable(priv);
+	else
+		intr_handle->nb_efd = count;
+	return 0;
+}
+
+/**
+ * Alarm callback running a previously scheduled link status check.
+ *
+ * The LSC callback is only processed when LSC interrupts are requested.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+static void
+mlx4_link_status_alarm(struct priv *priv)
+{
+	struct rte_eth_dev *dev = priv->dev;
+
+	assert(priv->intr_alarm == 1);
+	priv->intr_alarm = 0;
+	if (!dev->data->dev_conf.intr_conf.lsc)
+		return;
+	if (mlx4_link_status_check(priv))
+		return;
+	_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL, NULL);
+}
+
+/**
+ * Refresh link status and verify that it is consistent.
+ *
+ * An inconsistent status schedules another check unless one is pending.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 when link status is consistent, negative errno value otherwise
+ *   (rte_errno is set to EINPROGRESS while a new check is pending).
+ */
+static int
+mlx4_link_status_check(struct priv *priv)
+{
+	const struct rte_eth_link *status = &priv->dev->data->dev_link;
+	int rc = mlx4_link_update(priv->dev, 0);
+
+	if (rc)
+		return rc;
+	/* Speed and status are consistent when both are set or both unset. */
+	if (!status->link_speed == !status->link_status)
+		return 0;
+	if (!priv->intr_alarm) {
+		/* Inconsistent status, check again later. */
+		rc = rte_eal_alarm_set(MLX4_INTR_ALARM_TIMEOUT,
+				       (void (*)(void *))
+				       mlx4_link_status_alarm,
+				       priv);
+		if (rc)
+			return rc;
+		priv->intr_alarm = 1;
+	}
+	rte_errno = EINPROGRESS;
+	return -rte_errno;
+}
+
+/**
+ * Handle interrupts from the NIC.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+static void
+mlx4_interrupt_handler(struct priv *priv)
+{
+	enum { LSC, RMV, };
+	static const enum rte_eth_event_type type[] = {
+		[LSC] = RTE_ETH_EVENT_INTR_LSC,
+		[RMV] = RTE_ETH_EVENT_INTR_RMV,
+	};
+	uint32_t caught[RTE_DIM(type)] = { 0 };
+	struct ibv_async_event event;
+	const struct rte_intr_conf *const intr_conf =
+		&priv->dev->data->dev_conf.intr_conf;
+	unsigned int i;
+
+	/* Read all pending events and acknowledge each of them. */
+	while (!ibv_get_async_event(priv->ctx, &event)) {
+		switch (event.event_type) {
+		case IBV_EVENT_PORT_ACTIVE:
+		case IBV_EVENT_PORT_ERR:
+			if (intr_conf->lsc && !mlx4_link_status_check(priv))
+				++caught[LSC];
+			break;
+		case IBV_EVENT_DEVICE_FATAL:
+			if (intr_conf->rmv)
+				++caught[RMV];
+			break;
+		default:
+			DEBUG("event type %d on physical port %d not handled",
+			      event.event_type, event.element.port_num);
+		}
+		ibv_ack_async_event(&event);
+	}
+	for (i = 0; i != RTE_DIM(caught); ++i)
+		if (caught[i])
+			_rte_eth_dev_callback_process(priv->dev, type[i],
+						      NULL, NULL);
+}
+
+/**
+ * Arm MLX4 CQ for notification.
+ *
+ * @param rxq
+ *   Pointer to receive queue structure.
+ * @param solicited
+ *   Nonzero to use MLX4_CQ_DB_REQ_NOT_SOL, zero for MLX4_CQ_DB_REQ_NOT.
+ */
+static void
+mlx4_arm_cq(struct rxq *rxq, int solicited)
+{
+	struct mlx4_cq *cq = &rxq->mcq;
+	uint64_t doorbell;
+	uint32_t sn = cq->arm_sn & MLX4_CQ_DB_GEQ_N_MASK;
+	uint32_t ci = cq->cons_index & MLX4_CQ_DB_CI_MASK;
+	uint32_t cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT;
+
+	*cq->arm_db = rte_cpu_to_be_32(sn << 28 | cmd | ci);
+	/*
+	 * Make sure that the doorbell record in host memory is
+	 * written before ringing the doorbell via PCI MMIO.
+	 */
+	rte_wmb();
+	doorbell = sn << 28 | cmd | cq->cqn;
+	doorbell <<= 32;
+	doorbell |= ci;
+	rte_write64(rte_cpu_to_be_64(doorbell), cq->cq_db_reg);
+}
+
+/**
+ * Remove the interrupt handler along with related resources.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   Always 0, rte_errno is left untouched.
+ */
+int
+mlx4_intr_uninstall(struct priv *priv)
+{
+	struct rte_intr_handle *const handle = &priv->intr_handle;
+	const int saved_errno = rte_errno; /* Restored before returning. */
+
+	if (handle->fd != -1) {
+		rte_intr_callback_unregister(handle, (void (*)(void *))
+					     mlx4_interrupt_handler, priv);
+		handle->fd = -1;
+	}
+	/* Drop any pending link status re-check as well. */
+	rte_eal_alarm_cancel((void (*)(void *))mlx4_link_status_alarm, priv);
+	priv->intr_alarm = 0;
+	mlx4_rx_intr_vec_disable(priv);
+	rte_errno = saved_errno;
+	return 0;
+}
+
+/**
+ * Set up the interrupt handler according to device configuration.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_intr_install(struct priv *priv)
+{
+	const struct rte_intr_conf *const conf =
+		&priv->dev->data->dev_conf.intr_conf;
+	int ret;
+
+	mlx4_intr_uninstall(priv);
+	if (conf->rxq && mlx4_rx_intr_vec_enable(priv) < 0)
+		goto error;
+	/* The asynchronous events handler only serves LSC and RMV. */
+	if (!(conf->lsc | conf->rmv))
+		return 0;
+	priv->intr_handle.fd = priv->ctx->async_fd;
+	ret = rte_intr_callback_register(&priv->intr_handle,
+					 (void (*)(void *))
+					 mlx4_interrupt_handler,
+					 priv);
+	if (ret >= 0)
+		return 0;
+	rte_errno = -ret;
+error:
+	mlx4_intr_uninstall(priv);
+	return -rte_errno;
+}
+
+/**
+ * DPDK callback disabling interrupts on a Rx queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Rx queue index.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct rxq *queue = dev->data->rx_queues[idx];
+	struct ibv_cq *event_cq;
+	void *event_ctx;
+	int err = EINVAL;
+
+	/* Fetch the pending event, it must come from this queue's CQ. */
+	if (queue && queue->channel &&
+	    !ibv_get_cq_event(queue->cq->channel, &event_cq, &event_ctx) &&
+	    event_cq == queue->cq)
+		err = 0;
+	if (!err) {
+		/* Increment arm_sn, then acknowledge that single event. */
+		queue->mcq.arm_sn++;
+		ibv_ack_cq_events(queue->cq, 1);
+		return 0;
+	}
+	/* Every failure above is reported as EINVAL. */
+	rte_errno = err;
+	WARN("unable to disable interrupt on rx queue %d",
+	     idx);
+	return -err;
+}
+
+/**
+ * DPDK callback enabling interrupts on a Rx queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Rx queue index.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct rxq *queue = dev->data->rx_queues[idx];
+
+	if (queue && queue->channel) {
+		/* solicited == 0 selects MLX4_CQ_DB_REQ_NOT in mlx4_arm_cq(). */
+		mlx4_arm_cq(queue, 0);
+		return 0;
+	}
+	/* Queue is missing or has no completion channel. */
+	rte_errno = EINVAL;
+	WARN("unable to arm interrupt on rx queue %d", idx);
+	return -EINVAL;
+}
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
new file mode 100644
index 00000000..2a3e2695
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Memory management functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#include <rte_mempool.h>
+#include <rte_spinlock.h>
+
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+struct mlx4_check_mempool_data {
+	int ret; /**< 0 while chunks are contiguous, -1 otherwise. */
+	char *start; /**< Lowest address of the area found so far. */
+	char *end; /**< Address right past the area found so far. */
+};
+
+/**
+ * Per-chunk callback used by mlx4_check_mempool() during iteration.
+ *
+ * @param[in] mp
+ *   Pointer to memory pool (unused).
+ * @param[in, out] opaque
+ *   Pointer to the state shared with mlx4_check_mempool().
+ * @param[in] memhdr
+ *   Pointer to mempool chunk header.
+ * @param mem_idx
+ *   Index provided by the iterator (unused).
+ */
+static void
+mlx4_check_mempool_cb(struct rte_mempool *mp, void *opaque,
+		      struct rte_mempool_memhdr *memhdr,
+		      unsigned int mem_idx)
+{
+	struct mlx4_check_mempool_data *ctx = opaque;
+	char *chunk = memhdr->addr;
+
+	(void)mp;
+	(void)mem_idx;
+	/*
+	 * Grow the tracked [start, end) area with each chunk adjacent to it.
+	 */
+	if (ctx->ret != 0) {
+		/* It already failed, skip the next chunks. */
+	} else if (ctx->start == NULL && ctx->end == NULL) {
+		/* It is the first chunk. */
+		ctx->start = chunk;
+		ctx->end = chunk + memhdr->len;
+	} else if (ctx->end == chunk) {
+		/* Chunk starts right where the area ends. */
+		ctx->end += memhdr->len;
+	} else if (ctx->start == chunk + memhdr->len) {
+		/* Chunk ends right where the area starts. */
+		ctx->start -= memhdr->len;
+	} else {
+		/* Error, mempool is not virtually contiguous. */
+		ctx->ret = -1;
+	}
+}
+
+/**
+ * Tell whether a mempool is usable, i.e. virtually contiguous.
+ *
+ * @param[in] mp
+ *   Pointer to memory pool.
+ * @param[out] start
+ *   Where to store the start address of the mempool virtual memory area.
+ * @param[out] end
+ *   Where to store the end address of the mempool virtual memory area.
+ *
+ * @return
+ *   0 on success (mempool is virtually contiguous), -1 on error.
+ */
+static int
+mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, uintptr_t *end)
+{
+	struct mlx4_check_mempool_data area = { .ret = 0 };
+
+	/* The callback merges adjacent chunks into a single area. */
+	rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &area);
+	*start = (uintptr_t)area.start;
+	*end = (uintptr_t)area.end;
+	return area.ret;
+}
+
+/**
+ * Obtain a memory region from a memory pool.
+ *
+ * If a matching memory region already exists, it is returned with its
+ * reference count incremented, otherwise a new one is registered.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param mp
+ *   Pointer to memory pool.
+ *
+ * @return
+ *   Memory region pointer, NULL in case of error and rte_errno is set.
+ */
+struct mlx4_mr *
+mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	uintptr_t start;
+	uintptr_t end;
+	unsigned int i;
+	struct mlx4_mr *mr;
+
+	if (mlx4_check_mempool(mp, &start, &end) != 0) {
+		rte_errno = EINVAL;
+		ERROR("mempool %p: not virtually contiguous",
+			(void *)mp);
+		return NULL;
+	}
+	DEBUG("mempool %p area start=%p end=%p size=%zu",
+	      (void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	/* Round start and end to page boundary if found in memory segments. */
+	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+		uintptr_t addr = (uintptr_t)ms[i].addr;
+		size_t len = ms[i].len;
+		uintptr_t align = ms[i].hugepage_sz; /* May exceed 32 bits. */
+
+		if ((start > addr) && (start < addr + len))
+			start = RTE_ALIGN_FLOOR(start, align);
+		if ((end > addr) && (end < addr + len))
+			end = RTE_ALIGN_CEIL(end, align);
+	}
+	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+	      (void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	rte_spinlock_lock(&priv->mr_lock);
+	LIST_FOREACH(mr, &priv->mr, next)
+		if (mp == mr->mp && start >= mr->start && end <= mr->end)
+			break;
+	if (mr) {
+		++mr->refcnt;
+		goto release;
+	}
+	mr = rte_malloc(__func__, sizeof(*mr), 0);
+	if (!mr) {
+		rte_errno = ENOMEM;
+		goto release;
+	}
+	*mr = (struct mlx4_mr){
+		.start = start,
+		.end = end,
+		.refcnt = 1,
+		.priv = priv,
+		.mr = ibv_reg_mr(priv->pd, (void *)start, end - start,
+				 IBV_ACCESS_LOCAL_WRITE),
+		.mp = mp,
+	};
+	if (mr->mr) {
+		mr->lkey = mr->mr->lkey;
+		LIST_INSERT_HEAD(&priv->mr, mr, next);
+	} else {
+		rte_free(mr);
+		mr = NULL;
+		rte_errno = errno ? errno : EINVAL;
+	}
+release:
+	rte_spinlock_unlock(&priv->mr_lock);
+	return mr;
+}
+
+/**
+ * Drop a reference on a memory region.
+ *
+ * The reference count is decremented and the memory region is destroyed
+ * once it reaches 0.
+ *
+ * Note to avoid race conditions given this function may be used from the
+ * data plane, it's extremely important that each user holds its own
+ * reference.
+ *
+ * @param mr
+ *   Memory region to release.
+ */
+void
+mlx4_mr_put(struct mlx4_mr *mr)
+{
+	struct priv *owner = mr->priv;
+
+	rte_spinlock_lock(&owner->mr_lock);
+	assert(mr->refcnt);
+	if (--mr->refcnt == 0) {
+		/* Last reference is gone, unregister and free. */
+		LIST_REMOVE(mr, next);
+		claim_zero(ibv_dereg_mr(mr->mr));
+		rte_free(mr);
+	}
+	rte_spinlock_unlock(&owner->mr_lock);
+}
+
+/**
+ * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
+ * If mp2mr[] is full, remove an entry first.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be added.
+ * @param[in] i
+ *   Index in txq->mp2mr[] where to store the new association.
+ *
+ * @return
+ *   Added mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
+{
+	struct mlx4_mr *mr;
+
+	/* Add a new entry, register MR first. */
+	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+	      (void *)txq, mp->name, (void *)mp);
+	mr = mlx4_mr_get(txq->priv, mp);
+	if (unlikely(mr == NULL)) {
+		DEBUG("%p: unable to configure MR, mlx4_mr_get() failed",
+		      (void *)txq);
+		return (uint32_t)-1;
+	}
+	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+		/* Table is full, remove oldest entry. */
+		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+		      (void *)txq);
+		--i;
+		mlx4_mr_put(txq->mp2mr[0].mr);
+		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+	}
+	/* Store the new entry. */
+	txq->mp2mr[i].mp = mp;
+	txq->mp2mr[i].mr = mr;
+	txq->mp2mr[i].lkey = mr->lkey;
+	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
+	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+	return txq->mp2mr[i].lkey;
+}
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
new file mode 100644
index 00000000..fcc7c129
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -0,0 +1,177 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MLX4_PRM_H_
+#define MLX4_PRM_H_
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+/* ConnectX-3 Tx queue basic block (TXBB): 1 << 6 = 64 bytes. */
+#define MLX4_TXBB_SHIFT 6
+#define MLX4_TXBB_SIZE (1 << MLX4_TXBB_SHIFT)
+
+/*
+ * Typical TSO descriptor with 16 gather entries is 352 bytes.
+ * The maximum WQE size of 512 bytes corresponds to 8 TXBBs.
+ */
+#define MLX4_MAX_WQE_SIZE 512
+#define MLX4_MAX_WQE_TXBBS (MLX4_MAX_WQE_SIZE / MLX4_TXBB_SIZE)
+
+/*
+ * Send queue stamping/invalidating information.
+ * The stamp stride is expressed in bytes, MLX4_SQ_STAMP_DWORDS is the same
+ * stride expressed in 32-bit words (64 / 4 = 16).
+ */
+#define MLX4_SQ_STAMP_STRIDE 64
+#define MLX4_SQ_STAMP_DWORDS (MLX4_SQ_STAMP_STRIDE / 4)
+#define MLX4_SQ_STAMP_SHIFT 31
+#define MLX4_SQ_STAMP_VAL 0x7fffffff
+
+/* Work queue element (WQE) flags. */
+#define MLX4_BIT_WQE_OWN 0x80000000
+#define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
+#define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+
+/*
+ * Convert a size in bytes to a number of TXBBs: the size is first aligned
+ * on MLX4_TXBB_SIZE through RTE_ALIGN(), then divided by the TXBB size.
+ * NOTE(review): RTE_ALIGN() is expected to round up and is not provided by
+ * the headers included directly by this file - confirm users also include
+ * rte_common.h.
+ */
+#define MLX4_SIZE_TO_TXBBS(size) \
+	(RTE_ALIGN((size), (MLX4_TXBB_SIZE)) >> (MLX4_TXBB_SHIFT))
+
+/*
+ * CQE checksum flags.
+ *
+ * Values are computed as unsigned and cast to int since enumeration
+ * constants have type int and bit 31 does not fit in a positive int.
+ */
+enum {
+	MLX4_CQE_L2_TUNNEL_IPV4 = (int)(1u << 25),
+	MLX4_CQE_L2_TUNNEL_L4_CSUM = (int)(1u << 26),
+	MLX4_CQE_L2_TUNNEL = (int)(1u << 27),
+	MLX4_CQE_L2_VLAN_MASK = (int)(3u << 29),
+	MLX4_CQE_L2_TUNNEL_IPOK = (int)(1u << 31),
+};
+
+/* CQE status flags. */
+#define MLX4_CQE_STATUS_IPV4 (1 << 22)
+#define MLX4_CQE_STATUS_IPV4F (1 << 23)
+#define MLX4_CQE_STATUS_IPV6 (1 << 24)
+#define MLX4_CQE_STATUS_IPV4OPT (1 << 25)
+#define MLX4_CQE_STATUS_TCP (1 << 26)
+#define MLX4_CQE_STATUS_UDP (1 << 27)
+/* Union of all packet type related CQE status flags defined above. */
+#define MLX4_CQE_STATUS_PTYPE_MASK \
+	(MLX4_CQE_STATUS_IPV4 | \
+	 MLX4_CQE_STATUS_IPV4F | \
+	 MLX4_CQE_STATUS_IPV6 | \
+	 MLX4_CQE_STATUS_IPV4OPT | \
+	 MLX4_CQE_STATUS_TCP | \
+	 MLX4_CQE_STATUS_UDP)
+
+/* Send queue information. */
+struct mlx4_sq {
+	volatile uint8_t *buf; /**< SQ buffer. */
+	volatile uint8_t *eob; /**< End of SQ buffer. */
+	uint32_t head; /**< SQ head counter in units of TXBBs. */
+	uint32_t tail; /**< SQ tail counter in units of TXBBs. */
+	uint32_t txbb_cnt;
+	/**< Number of TXBBs (WQEBBs) in the queue (should be a power of 2). */
+	uint32_t txbb_cnt_mask;
+	/**< txbb_cnt mask (relies on txbb_cnt being a power of 2). */
+	uint32_t headroom_txbbs; /**< Num of TXBBs that should be kept free. */
+	volatile uint32_t *db; /**< Pointer to the doorbell. */
+	uint32_t doorbell_qpn; /**< QP number to write to the doorbell. */
+};
+
+/* Address of the n-th TXBB in the SQ buffer (n is not wrapped here). */
+#define mlx4_get_send_wqe(sq, n) ((sq)->buf + ((n) * (MLX4_TXBB_SIZE)))
+
+/* Completion queue events, numbers and masks. */
+#define MLX4_CQ_DB_GEQ_N_MASK 0x3
+/* Byte offset of the CQ doorbell register from the CQ UAR base. */
+#define MLX4_CQ_DOORBELL 0x20
+#define MLX4_CQ_DB_CI_MASK 0xffffff
+
+/* Completion queue information. */
+struct mlx4_cq {
+	volatile void *cq_uar; /**< CQ user access region. */
+	volatile void *cq_db_reg; /**< CQ doorbell register. */
+	volatile uint32_t *set_ci_db; /**< Pointer to the CQ doorbell. */
+	volatile uint32_t *arm_db; /**< Arming Rx events doorbell. */
+	volatile uint8_t *buf; /**< Pointer to the completion queue buffer. */
+	uint32_t cqe_cnt; /**< Number of entries in the queue. */
+	uint32_t cqe_64:1;
+	/**< CQ entry size is 64 bytes (1) instead of 32 bytes (0). */
+	uint32_t cons_index; /**< Last queue entry that was handled. */
+	uint32_t cqn; /**< CQ number. */
+	int arm_sn; /**< Rx event counter. */
+};
+
+/**
+ * Locate a completion queue entry (CQE) inside a CQ buffer.
+ *
+ * The returned address is:
+ *
+ *   cq->buf + (index modulo cqe_cnt) * cqe_size + cqe_offset
+ *
+ * with cqe_size being 32 or 64 bytes and cqe_offset being 0 for 32-byte
+ * entries or 32 for 64-byte entries. The modulo is implemented with a mask
+ * and therefore relies on cqe_cnt being a power of two.
+ *
+ * @param cq
+ *   CQ to retrieve entry from.
+ * @param index
+ *   Entry index.
+ *
+ * @return
+ *   Pointer to CQE entry.
+ */
+static inline volatile struct mlx4_cqe *
+mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index)
+{
+	volatile uint8_t *entry = cq->buf;
+
+	/* Skip to the selected slot (32 << cqe_64 bytes per slot). */
+	entry += (index & (cq->cqe_cnt - 1)) << (5 + cq->cqe_64);
+	/* 64-byte entries are accessed through their second half. */
+	entry += cq->cqe_64 << 5;
+	return (volatile struct mlx4_cqe *)entry;
+}
+
+/**
+ * Move a flag from one bit position to another.
+ *
+ * The bits selected by @p from are extracted from @p val and scaled by the
+ * ratio between @p from and @p to, which relocates a single-bit flag from
+ * the position of @p from to the position of @p to.
+ *
+ * @param val
+ *   Input value.
+ * @param from
+ *   Flag to retrieve from input value (must not be zero).
+ * @param to
+ *   Flag to set in output value (must not be zero).
+ *
+ * @return
+ *   Output value with transposed flag enabled if present on input.
+ */
+static inline uint64_t
+mlx4_transpose(uint64_t val, uint64_t from, uint64_t to)
+{
+	const uint64_t flag = val & from;
+
+	/* Shift right through a division, left through a multiplication. */
+	if (from >= to)
+		return flag / (from / to);
+	return flag * (to / from);
+}
+
+#endif /* MLX4_PRM_H_ */
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
new file mode 100644
index 00000000..8b97a894
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -0,0 +1,873 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Rx queues configuration for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_flow.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Historical RSS hash key.
+ *
+ * This used to be the default for mlx4 in Linux before v3.19 switched to
+ * generating random hash keys through netdev_rss_key_fill().
+ *
+ * It is used in this PMD for consistency with past DPDK releases but can
+ * now be overridden through user configuration.
+ *
+ * The initializer below provides 40 bytes; any remaining bytes of the array
+ * (should MLX4_RSS_HASH_KEY_SIZE be larger) are implicitly zero.
+ *
+ * Note: this is not const to work around API quirks.
+ */
+uint8_t
+mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
+	0x2c, 0xc6, 0x81, 0xd1,
+	0x5b, 0xdb, 0xf4, 0xf7,
+	0xfc, 0xa2, 0x83, 0x19,
+	0xdb, 0x1a, 0x3e, 0x94,
+	0x6b, 0x9e, 0x38, 0xd9,
+	0x2c, 0x9c, 0x03, 0xd1,
+	0xad, 0x99, 0x44, 0xa7,
+	0xd9, 0x56, 0x3d, 0x59,
+	0x06, 0x3c, 0x25, 0xf3,
+	0xfc, 0x1f, 0xdc, 0x2a,
+};
+
+/**
+ * Look up or create a RSS context with the requested properties.
+ *
+ * Called when creating a flow rule targeting one or several Rx queues.
+ *
+ * An existing context is reused (and its reference count incremented) when
+ * its hash fields, hash key, number of queues and queue list all match.
+ * Otherwise a new context holding a single reference is allocated and
+ * inserted at the head of the list of RSS contexts of the device.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param fields
+ *   Fields for RSS processing (Verbs format).
+ * @param[in] key
+ *   Hash key to use (whose size is exactly MLX4_RSS_HASH_KEY_SIZE).
+ * @param queues
+ *   Number of target queues.
+ * @param[in] queue_id
+ *   Target queues.
+ *
+ * @return
+ *   Pointer to RSS context on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx4_rss *
+mlx4_rss_get(struct priv *priv, uint64_t fields,
+	     uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+	     uint16_t queues, const uint16_t queue_id[])
+{
+	const size_t ids_len = sizeof(queue_id[0]) * queues;
+	struct mlx4_rss *ctx;
+
+	/* Reuse a matching context if there is one. */
+	LIST_FOREACH(ctx, &priv->rss, next) {
+		if (ctx->fields != fields || ctx->queues != queues)
+			continue;
+		if (memcmp(key, ctx->key, MLX4_RSS_HASH_KEY_SIZE) ||
+		    memcmp(queue_id, ctx->queue_id, ids_len))
+			continue;
+		++ctx->refcnt;
+		return ctx;
+	}
+	/* The queue list is stored right after the fixed part. */
+	ctx = rte_malloc(__func__,
+			 offsetof(struct mlx4_rss, queue_id) + ids_len, 0);
+	if (ctx == NULL) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	*ctx = (struct mlx4_rss){
+		.priv = priv,
+		.refcnt = 1,
+		.usecnt = 0,
+		.qp = NULL,
+		.ind = NULL,
+		.fields = fields,
+		.queues = queues,
+	};
+	memcpy(ctx->key, key, MLX4_RSS_HASH_KEY_SIZE);
+	memcpy(ctx->queue_id, queue_id, ids_len);
+	LIST_INSERT_HEAD(&priv->rss, ctx, next);
+	return ctx;
+}
+
+/**
+ * Drop a reference on a RSS context.
+ *
+ * Called when destroying a flow rule targeting one or several Rx queues.
+ *
+ * The context is unlinked and freed once its reference count reaches 0.
+ * It must have no users left at this point, that is, every call to
+ * mlx4_rss_attach() must have been balanced by a call to
+ * mlx4_rss_detach().
+ *
+ * @param rss
+ *   RSS context to release.
+ */
+void
+mlx4_rss_put(struct mlx4_rss *rss)
+{
+	assert(rss->refcnt);
+	if (!--rss->refcnt) {
+		/* No user may remain attached when the context dies. */
+		assert(!rss->usecnt);
+		assert(!rss->qp);
+		assert(!rss->ind);
+		LIST_REMOVE(rss, next);
+		rte_free(rss);
+	}
+}
+
+/**
+ * Attach a user to a RSS context instance.
+ *
+ * Used when the RSS QP and indirection table objects must be instantiated,
+ * that is, when a flow rule must be enabled.
+ *
+ * This function increments the usage count of the context. Only the first
+ * user triggers the creation of Verbs objects; on failure, all resources
+ * acquired so far are released and the usage count is restored.
+ *
+ * @param rss
+ *   RSS context to attach to.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rss_attach(struct mlx4_rss *rss)
+{
+	assert(rss->refcnt);
+	/* Objects already exist for subsequent users, nothing else to do. */
+	if (rss->usecnt++) {
+		assert(rss->qp);
+		assert(rss->ind);
+		return 0;
+	}
+
+	/* One WQ pointer per target queue, in rss->queue_id[] order. */
+	struct ibv_wq *ind_tbl[rss->queues];
+	struct priv *priv = rss->priv;
+	const char *msg;
+	/* Number of Rx queues attached so far, used for rollback on error. */
+	unsigned int i = 0;
+	/* Positive errno value while inside this function. */
+	int ret;
+
+	if (!rte_is_power_of_2(RTE_DIM(ind_tbl))) {
+		ret = EINVAL;
+		msg = "number of RSS queues must be a power of two";
+		goto error;
+	}
+	for (i = 0; i != RTE_DIM(ind_tbl); ++i) {
+		uint16_t id = rss->queue_id[i];
+		struct rxq *rxq = NULL;
+
+		/* Out of range and unconfigured queues both yield NULL. */
+		if (id < priv->dev->data->nb_rx_queues)
+			rxq = priv->dev->data->rx_queues[id];
+		if (!rxq) {
+			ret = EINVAL;
+			msg = "RSS target queue is not configured";
+			goto error;
+		}
+		ret = mlx4_rxq_attach(rxq);
+		if (ret) {
+			/* mlx4_rxq_attach() returns a negative errno value. */
+			ret = -ret;
+			msg = "unable to attach RSS target queue";
+			goto error;
+		}
+		ind_tbl[i] = rxq->wq;
+	}
+	rss->ind = ibv_create_rwq_ind_table
+		(priv->ctx,
+		 &(struct ibv_rwq_ind_table_init_attr){
+			.log_ind_tbl_size = rte_log2_u32(RTE_DIM(ind_tbl)),
+			.ind_tbl = ind_tbl,
+			.comp_mask = 0,
+		 });
+	if (!rss->ind) {
+		/* Fall back on EINVAL in case errno was left unset. */
+		ret = errno ? errno : EINVAL;
+		msg = "RSS indirection table creation failure";
+		goto error;
+	}
+	rss->qp = ibv_create_qp_ex
+		(priv->ctx,
+		 &(struct ibv_qp_init_attr_ex){
+			.comp_mask = (IBV_QP_INIT_ATTR_PD |
+				      IBV_QP_INIT_ATTR_RX_HASH |
+				      IBV_QP_INIT_ATTR_IND_TABLE),
+			.qp_type = IBV_QPT_RAW_PACKET,
+			.pd = priv->pd,
+			.rwq_ind_tbl = rss->ind,
+			.rx_hash_conf = {
+				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+				.rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE,
+				.rx_hash_key = rss->key,
+				.rx_hash_fields_mask = rss->fields,
+			},
+		 });
+	if (!rss->qp) {
+		ret = errno ? errno : EINVAL;
+		msg = "RSS hash QP creation failure";
+		goto error;
+	}
+	/* The hash QP is brought to INIT then RTR state before use. */
+	ret = ibv_modify_qp
+		(rss->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_INIT,
+			.port_num = priv->port,
+		 },
+		 IBV_QP_STATE | IBV_QP_PORT);
+	if (ret) {
+		msg = "failed to switch RSS hash QP to INIT state";
+		goto error;
+	}
+	ret = ibv_modify_qp
+		(rss->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_RTR,
+		 },
+		 IBV_QP_STATE);
+	if (ret) {
+		msg = "failed to switch RSS hash QP to RTR state";
+		goto error;
+	}
+	return 0;
+error:
+	/* Release resources in reverse order of their creation. */
+	if (rss->qp) {
+		claim_zero(ibv_destroy_qp(rss->qp));
+		rss->qp = NULL;
+	}
+	if (rss->ind) {
+		claim_zero(ibv_destroy_rwq_ind_table(rss->ind));
+		rss->ind = NULL;
+	}
+	/* Only the first i queues were successfully attached. */
+	while (i--)
+		mlx4_rxq_detach(priv->dev->data->rx_queues[rss->queue_id[i]]);
+	ERROR("mlx4: %s", msg);
+	/* Undo the usage count increment performed on entry. */
+	--rss->usecnt;
+	rte_errno = ret;
+	return -ret;
+}
+
+/**
+ * Detach a user from a RSS context instance.
+ *
+ * Called when a flow rule gets disabled (as opposed to destroyed).
+ *
+ * The usage count of the context is decremented. When the last user goes
+ * away, the hash QP and the indirection table are destroyed and every
+ * target Rx queue is detached in turn.
+ *
+ * @param rss
+ *   RSS context to detach from.
+ */
+void
+mlx4_rss_detach(struct mlx4_rss *rss)
+{
+	struct priv *priv = rss->priv;
+	unsigned int n = 0;
+
+	assert(rss->refcnt);
+	assert(rss->qp);
+	assert(rss->ind);
+	--rss->usecnt;
+	if (rss->usecnt)
+		return;
+	/* Last user: tear down Verbs objects, QP first. */
+	claim_zero(ibv_destroy_qp(rss->qp));
+	rss->qp = NULL;
+	claim_zero(ibv_destroy_rwq_ind_table(rss->ind));
+	rss->ind = NULL;
+	while (n != rss->queues) {
+		mlx4_rxq_detach(priv->dev->data->rx_queues[rss->queue_id[n]]);
+		++n;
+	}
+}
+
+/**
+ * Initialize common RSS context resources.
+ *
+ * Because ConnectX-3 hardware limitations require a fixed order in the
+ * indirection table, WQs must be allocated sequentially to be part of a
+ * common RSS context.
+ *
+ * Since a newly created WQ cannot be moved to a different context, this
+ * function allocates them all at once, one for each configured Rx queue,
+ * as well as all related resources (CQs and mbufs).
+ *
+ * This must therefore be done before creating any Rx flow rules relying on
+ * indirection tables.
+ *
+ * On failure, Rx queues attached by this function are detached again.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rss_init(struct priv *priv)
+{
+	struct rte_eth_dev *dev = priv->dev;
+	/* The WQ range size is provided to the device as a log2 value. */
+	uint8_t log2_range = rte_log2_u32(dev->data->nb_rx_queues);
+	uint32_t wq_num_prev = 0;
+	const char *msg;
+	unsigned int i;
+	/* Positive errno value while inside this function. */
+	int ret;
+
+	/* Prepare range for RSS contexts before creating the first WQ. */
+	ret = mlx4dv_set_context_attr(priv->ctx,
+				      MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ,
+				      &log2_range);
+	if (ret) {
+		ERROR("cannot set up range size for RSS context to %u"
+		      " (for %u Rx queues), error: %s",
+		      1 << log2_range, dev->data->nb_rx_queues, strerror(ret));
+		rte_errno = ret;
+		return -ret;
+	}
+	for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) {
+		struct rxq *rxq = priv->dev->data->rx_queues[i];
+		struct ibv_cq *cq;
+		struct ibv_wq *wq;
+		uint32_t wq_num;
+
+		/* Attach the configured Rx queues. */
+		if (rxq) {
+			assert(!rxq->usecnt);
+			ret = mlx4_rxq_attach(rxq);
+			if (!ret) {
+				wq_num = rxq->wq->wq_num;
+				/* Skip placeholder creation for real queues. */
+				goto wq_num_check;
+			}
+			/* mlx4_rxq_attach() returns a negative errno value. */
+			ret = -ret;
+			msg = "unable to create Rx queue resources";
+			goto error;
+		}
+		/*
+		 * WQs are temporarily allocated for unconfigured Rx queues
+		 * to maintain proper index alignment in indirection table
+		 * by skipping unused WQ numbers.
+		 *
+		 * The reason this works at all even though these WQs are
+		 * immediately destroyed is that WQNs are allocated
+		 * sequentially and are guaranteed to never be reused in the
+		 * same context by the underlying implementation.
+		 */
+		cq = ibv_create_cq(priv->ctx, 1, NULL, NULL, 0);
+		if (!cq) {
+			ret = ENOMEM;
+			msg = "placeholder CQ creation failure";
+			goto error;
+		}
+		wq = ibv_create_wq
+			(priv->ctx,
+			 &(struct ibv_wq_init_attr){
+				.wq_type = IBV_WQT_RQ,
+				.max_wr = 1,
+				.max_sge = 1,
+				.pd = priv->pd,
+				.cq = cq,
+			 });
+		if (wq) {
+			/* Only the WQ number matters, the WQ itself does not. */
+			wq_num = wq->wq_num;
+			claim_zero(ibv_destroy_wq(wq));
+		} else {
+			wq_num = 0; /* Shut up GCC 4.8 warnings. */
+		}
+		/* The placeholder CQ is destroyed whether WQ creation worked. */
+		claim_zero(ibv_destroy_cq(cq));
+		/* wq is only tested against NULL here, never dereferenced. */
+		if (!wq) {
+			ret = ENOMEM;
+			msg = "placeholder WQ creation failure";
+			goto error;
+		}
+wq_num_check:
+		/*
+		 * While guaranteed by the implementation, make sure WQ
+		 * numbers are really sequential (as the saying goes,
+		 * trust, but verify).
+		 *
+		 * The first queue (i == 0) has no predecessor to compare to.
+		 */
+		if (i && wq_num - wq_num_prev != 1) {
+			/*
+			 * The current queue was attached during this
+			 * iteration and is not covered by the rollback loop
+			 * below, which only handles indices lower than i.
+			 */
+			if (rxq)
+				mlx4_rxq_detach(rxq);
+			ret = ERANGE;
+			msg = "WQ numbers are not sequential";
+			goto error;
+		}
+		wq_num_prev = wq_num;
+	}
+	return 0;
+error:
+	ERROR("cannot initialize common RSS resources (queue %u): %s: %s",
+	      i, msg, strerror(ret));
+	/* Detach all configured queues attached by previous iterations. */
+	while (i--) {
+		struct rxq *rxq = priv->dev->data->rx_queues[i];
+
+		if (rxq)
+			mlx4_rxq_detach(rxq);
+	}
+	rte_errno = ret;
+	return -ret;
+}
+
+/**
+ * Release common RSS context resources.
+ *
+ * This is the counterpart of mlx4_rss_init(): every configured Rx queue is
+ * detached once. It must be called after removing all flow rules relying
+ * on indirection tables, so that each queue is left with a single user.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+void
+mlx4_rss_deinit(struct priv *priv)
+{
+	unsigned int n;
+
+	for (n = 0; n != priv->dev->data->nb_rx_queues; ++n) {
+		struct rxq *rxq = priv->dev->data->rx_queues[n];
+
+		/* Unconfigured queues hold no resources. */
+		if (!rxq)
+			continue;
+		assert(rxq->usecnt == 1);
+		mlx4_rxq_detach(rxq);
+	}
+}
+
+/**
+ * Attach a user to a Rx queue.
+ *
+ * Used when the resources of an Rx queue must be instantiated for it to
+ * become in a usable state.
+ *
+ * This function increments the usage count of the Rx queue. Only the first
+ * user triggers the creation of the CQ and WQ as well as the allocation of
+ * mbufs; on failure, these resources are released and the usage count is
+ * restored so that a later attempt starts from a clean state.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rxq_attach(struct rxq *rxq)
+{
+	/* Resources already exist for subsequent users, nothing else to do. */
+	if (rxq->usecnt++) {
+		assert(rxq->cq);
+		assert(rxq->wq);
+		assert(rxq->wqes);
+		assert(rxq->rq_db);
+		return 0;
+	}
+
+	struct priv *priv = rxq->priv;
+	/* elts_n and sges_n are stored as log2 values. */
+	const uint32_t elts_n = 1 << rxq->elts_n;
+	const uint32_t sges_n = 1 << rxq->sges_n;
+	struct rte_mbuf *(*elts)[elts_n] = rxq->elts;
+	struct mlx4dv_obj mlxdv;
+	struct mlx4dv_rwq dv_rwq;
+	struct mlx4dv_cq dv_cq = { .comp_mask = MLX4DV_CQ_MASK_UAR, };
+	const char *msg;
+	struct ibv_cq *cq = NULL;
+	struct ibv_wq *wq = NULL;
+	volatile struct mlx4_wqe_data_seg (*wqes)[];
+	unsigned int i;
+	/* Positive errno value while inside this function. */
+	int ret;
+
+	assert(rte_is_power_of_2(elts_n));
+	/* One completion per packet, a packet using up to sges_n segments. */
+	cq = ibv_create_cq(priv->ctx, elts_n / sges_n, NULL, rxq->channel, 0);
+	if (!cq) {
+		ret = ENOMEM;
+		msg = "CQ creation failure";
+		goto error;
+	}
+	wq = ibv_create_wq
+		(priv->ctx,
+		 &(struct ibv_wq_init_attr){
+			.wq_type = IBV_WQT_RQ,
+			.max_wr = elts_n / sges_n,
+			.max_sge = sges_n,
+			.pd = priv->pd,
+			.cq = cq,
+		 });
+	if (!wq) {
+		/* Fall back on EINVAL in case errno was left unset. */
+		ret = errno ? errno : EINVAL;
+		msg = "WQ creation failure";
+		goto error;
+	}
+	ret = ibv_modify_wq
+		(wq,
+		 &(struct ibv_wq_attr){
+			.attr_mask = IBV_WQ_ATTR_STATE,
+			.wq_state = IBV_WQS_RDY,
+		 });
+	if (ret) {
+		msg = "WQ state change to IBV_WQS_RDY failed";
+		goto error;
+	}
+	/* Retrieve device queue information. */
+	mlxdv.cq.in = cq;
+	mlxdv.cq.out = &dv_cq;
+	mlxdv.rwq.in = wq;
+	mlxdv.rwq.out = &dv_rwq;
+	ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_RWQ | MLX4DV_OBJ_CQ);
+	if (ret) {
+		msg = "failed to obtain device information from WQ/CQ objects";
+		goto error;
+	}
+	/* Receive descriptors start at rq.offset inside the WQ buffer. */
+	wqes = (volatile struct mlx4_wqe_data_seg (*)[])
+		((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset);
+	for (i = 0; i != RTE_DIM(*elts); ++i) {
+		volatile struct mlx4_wqe_data_seg *scat = &(*wqes)[i];
+		struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
+
+		if (buf == NULL) {
+			/* Release mbufs allocated by previous iterations. */
+			while (i--) {
+				rte_pktmbuf_free_seg((*elts)[i]);
+				(*elts)[i] = NULL;
+			}
+			ret = ENOMEM;
+			msg = "cannot allocate mbuf";
+			goto error;
+		}
+		/* Headroom is reserved by rte_pktmbuf_alloc(). */
+		assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
+		/* Buffer is supposed to be empty. */
+		assert(rte_pktmbuf_data_len(buf) == 0);
+		assert(rte_pktmbuf_pkt_len(buf) == 0);
+		/* Only the first segment keeps headroom. */
+		if (i % sges_n)
+			buf->data_off = 0;
+		buf->port = rxq->port_id;
+		buf->data_len = rte_pktmbuf_tailroom(buf);
+		buf->pkt_len = rte_pktmbuf_tailroom(buf);
+		buf->nb_segs = 1;
+		/* Descriptor fields are written in big endian. */
+		*scat = (struct mlx4_wqe_data_seg){
+			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
+								  uintptr_t)),
+			.byte_count = rte_cpu_to_be_32(buf->data_len),
+			.lkey = rte_cpu_to_be_32(rxq->mr->lkey),
+		};
+		(*elts)[i] = buf;
+	}
+	DEBUG("%p: allocated and configured %u segments (max %u packets)",
+	      (void *)rxq, elts_n, elts_n / sges_n);
+	/* Publish resources in the queue structure only once all is set. */
+	rxq->cq = cq;
+	rxq->wq = wq;
+	rxq->wqes = wqes;
+	rxq->rq_db = dv_rwq.rdb;
+	rxq->mcq.buf = dv_cq.buf.buf;
+	rxq->mcq.cqe_cnt = dv_cq.cqe_cnt;
+	rxq->mcq.set_ci_db = dv_cq.set_ci_db;
+	rxq->mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0;
+	rxq->mcq.arm_db = dv_cq.arm_db;
+	rxq->mcq.arm_sn = dv_cq.arm_sn;
+	rxq->mcq.cqn = dv_cq.cqn;
+	rxq->mcq.cq_uar = dv_cq.cq_uar;
+	rxq->mcq.cq_db_reg = (uint8_t *)dv_cq.cq_uar + MLX4_CQ_DOORBELL;
+	/* Update doorbell counter. */
+	rxq->rq_ci = elts_n / sges_n;
+	/* Descriptors must be written before the doorbell record. */
+	rte_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	return 0;
+error:
+	if (wq)
+		claim_zero(ibv_destroy_wq(wq));
+	if (cq)
+		claim_zero(ibv_destroy_cq(cq));
+	/*
+	 * Undo the usage count increment performed on entry, otherwise the
+	 * next attach attempt would assume resources are already present.
+	 */
+	--rxq->usecnt;
+	rte_errno = ret;
+	ERROR("error while attaching Rx queue %p: %s: %s",
+	      (void *)rxq, msg, strerror(ret));
+	return -ret;
+}
+
+/**
+ * Detach a user from a Rx queue.
+ *
+ * One usage reference is dropped. Once none remains, queue state is reset,
+ * the WQ and CQ are destroyed and every mbuf still referenced by the
+ * elements array is freed.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ */
+void
+mlx4_rxq_detach(struct rxq *rxq)
+{
+	struct rte_mbuf *(*elts)[1 << rxq->elts_n] = rxq->elts;
+	unsigned int n;
+
+	--rxq->usecnt;
+	if (rxq->usecnt)
+		return;
+	/* Forget about device-provided information before destroying it. */
+	rxq->rq_ci = 0;
+	memset(&rxq->mcq, 0, sizeof(rxq->mcq));
+	rxq->rq_db = NULL;
+	rxq->wqes = NULL;
+	claim_zero(ibv_destroy_wq(rxq->wq));
+	rxq->wq = NULL;
+	claim_zero(ibv_destroy_cq(rxq->cq));
+	rxq->cq = NULL;
+	DEBUG("%p: freeing Rx queue elements", (void *)rxq);
+	for (n = 0; n != RTE_DIM(*elts); ++n) {
+		struct rte_mbuf *seg = (*elts)[n];
+
+		if (seg == NULL)
+			continue;
+		rte_pktmbuf_free_seg(seg);
+		(*elts)[n] = NULL;
+	}
+}
+
+/**
+ * DPDK callback to configure a Rx queue.
+ *
+ * The queue structure and its array of mbuf pointers are allocated here,
+ * along with the memory region covering @p mp and, when Rx interrupts are
+ * requested, a completion channel. Verbs queue objects (CQ/WQ) are not
+ * created by this function.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Rx queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue. Rounded up to the next
+ *   power of two if necessary.
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Thresholds parameters.
+ * @param mp
+ *   Memory pool for buffer allocations.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+		    unsigned int socket, const struct rte_eth_rxconf *conf,
+		    struct rte_mempool *mp)
+{
+	struct priv *priv = dev->data->dev_private;
+	uint32_t mb_len = rte_pktmbuf_data_room_size(mp);
+	/*
+	 * Pointer to an array whose dimension is desc rounded up to a power
+	 * of two; RTE_DIM(*elts) and sizeof(*elts) below rely on this type.
+	 */
+	struct rte_mbuf *(*elts)[rte_align32pow2(desc)];
+	struct rxq *rxq;
+	/* Queue structure and elements array are allocated together. */
+	struct mlx4_malloc_vec vec[] = {
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = sizeof(*rxq),
+			.addr = (void **)&rxq,
+		},
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = sizeof(*elts),
+			.addr = (void **)&elts,
+		},
+	};
+	int ret;
+
+	(void)conf; /* Thresholds configuration (ignored). */
+	DEBUG("%p: configuring queue %u for %u descriptors",
+	      (void *)dev, idx, desc);
+	if (idx >= dev->data->nb_rx_queues) {
+		rte_errno = EOVERFLOW;
+		ERROR("%p: queue index out of range (%u >= %u)",
+		      (void *)dev, idx, dev->data->nb_rx_queues);
+		return -rte_errno;
+	}
+	rxq = dev->data->rx_queues[idx];
+	if (rxq) {
+		rte_errno = EEXIST;
+		ERROR("%p: Rx queue %u already configured, release it first",
+		      (void *)dev, idx);
+		return -rte_errno;
+	}
+	if (!desc) {
+		rte_errno = EINVAL;
+		ERROR("%p: invalid number of Rx descriptors", (void *)dev);
+		return -rte_errno;
+	}
+	if (desc != RTE_DIM(*elts)) {
+		desc = RTE_DIM(*elts);
+		WARN("%p: increased number of descriptors in Rx queue %u"
+		     " to the next power of two (%u)",
+		     (void *)dev, idx, desc);
+	}
+	/*
+	 * Allocate and initialize Rx queue.
+	 *
+	 * rxq is known to be NULL at this point (checked above), hence it
+	 * remains NULL unless the helper stores an address through vec[].
+	 * NOTE(review): rte_errno is presumably set by the helper on
+	 * failure - confirm in mlx4_utils.c.
+	 */
+	mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket);
+	if (!rxq) {
+		ERROR("%p: unable to allocate queue index %u",
+		      (void *)dev, idx);
+		return -rte_errno;
+	}
+	*rxq = (struct rxq){
+		.priv = priv,
+		.mp = mp,
+		.port_id = dev->data->port_id,
+		.sges_n = 0,
+		/* Stored as a log2 value, desc is a power of two here. */
+		.elts_n = rte_log2_u32(desc),
+		.elts = elts,
+		/* Toggle Rx checksum offload if hardware supports it. */
+		.csum = (priv->hw_csum &&
+			 dev->data->dev_conf.rxmode.hw_ip_checksum),
+		.csum_l2tun = (priv->hw_csum_l2tun &&
+			       dev->data->dev_conf.rxmode.hw_ip_checksum),
+		.stats = {
+			.idx = idx,
+		},
+		.socket = socket,
+	};
+	/* Enable scattered packets support for this queue if necessary. */
+	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
+	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
+		/* A single mbuf is large enough, sges_n remains 0. */
+		;
+	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
+		uint32_t size =
+			RTE_PKTMBUF_HEADROOM +
+			dev->data->dev_conf.rxmode.max_rx_pkt_len;
+		uint32_t sges_n;
+
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 *
+		 * The result is kept as a log2 value.
+		 */
+		sges_n = rte_log2_u32((size / mb_len) + !!(size % mb_len));
+		rxq->sges_n = sges_n;
+		/* Make sure sges_n did not overflow. */
+		size = mb_len * (1 << rxq->sges_n);
+		size -= RTE_PKTMBUF_HEADROOM;
+		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+			rte_errno = EOVERFLOW;
+			ERROR("%p: too many SGEs (%u) needed to handle"
+			      " requested maximum packet size %u",
+			      (void *)dev,
+			      1 << sges_n,
+			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
+			goto error;
+		}
+	} else {
+		/* Not fatal: the queue is configured with one SGE. */
+		WARN("%p: the requested maximum Rx packet size (%u) is"
+		     " larger than a single mbuf (%u) and scattered"
+		     " mode has not been requested",
+		     (void *)dev,
+		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
+		     mb_len - RTE_PKTMBUF_HEADROOM);
+	}
+	DEBUG("%p: maximum number of segments per packet: %u",
+	      (void *)dev, 1 << rxq->sges_n);
+	if (desc % (1 << rxq->sges_n)) {
+		rte_errno = EINVAL;
+		ERROR("%p: number of Rx queue descriptors (%u) is not a"
+		      " multiple of maximum segments per packet (%u)",
+		      (void *)dev,
+		      desc,
+		      1 << rxq->sges_n);
+		goto error;
+	}
+	/* Use the entire Rx mempool as the memory region. */
+	rxq->mr = mlx4_mr_get(priv, mp);
+	if (!rxq->mr) {
+		ERROR("%p: MR creation failure: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	/* A completion channel is only needed when Rx interrupts are on. */
+	if (dev->data->dev_conf.intr_conf.rxq) {
+		rxq->channel = ibv_create_comp_channel(priv->ctx);
+		if (rxq->channel == NULL) {
+			rte_errno = ENOMEM;
+			ERROR("%p: Rx interrupt completion channel creation"
+			      " failure: %s",
+			      (void *)dev, strerror(rte_errno));
+			goto error;
+		}
+		if (mlx4_fd_set_non_blocking(rxq->channel->fd) < 0) {
+			ERROR("%p: unable to make Rx interrupt completion"
+			      " channel non-blocking: %s",
+			      (void *)dev, strerror(rte_errno));
+			goto error;
+		}
+	}
+	DEBUG("%p: adding Rx queue %p to list", (void *)dev, (void *)rxq);
+	dev->data->rx_queues[idx] = rxq;
+	return 0;
+error:
+	dev->data->rx_queues[idx] = NULL;
+	/* Preserve rte_errno across the release call. */
+	ret = rte_errno;
+	mlx4_rx_queue_release(rxq);
+	rte_errno = ret;
+	assert(rte_errno > 0);
+	return -rte_errno;
+}
+
+/**
+ * DPDK callback to release a Rx queue.
+ *
+ * The queue is removed from the device's Rx queue array if present, then
+ * its completion channel and memory region reference are dropped before
+ * the queue structure itself is freed. Verbs queue resources must already
+ * have been released through mlx4_rxq_detach(). NULL is a no-op.
+ *
+ * @param dpdk_rxq
+ *   Generic Rx queue pointer.
+ */
+void
+mlx4_rx_queue_release(void *dpdk_rxq)
+{
+	struct rxq *rxq = (struct rxq *)dpdk_rxq;
+	struct priv *priv;
+	unsigned int n;
+
+	if (rxq == NULL)
+		return;
+	priv = rxq->priv;
+	/* Clear the slot referencing this queue, if any. */
+	for (n = 0; n != priv->dev->data->nb_rx_queues; ++n) {
+		if (priv->dev->data->rx_queues[n] != rxq)
+			continue;
+		DEBUG("%p: removing Rx queue %p from list",
+		      (void *)priv->dev, (void *)rxq);
+		priv->dev->data->rx_queues[n] = NULL;
+		break;
+	}
+	/* The queue must not have any user left. */
+	assert(!rxq->cq);
+	assert(!rxq->wq);
+	assert(!rxq->wqes);
+	assert(!rxq->rq_db);
+	if (rxq->channel)
+		claim_zero(ibv_destroy_comp_channel(rxq->channel));
+	if (rxq->mr)
+		mlx4_mr_put(rxq->mr);
+	rte_free(rxq);
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
new file mode 100644
index 00000000..3985e06d
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -0,0 +1,1071 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Data plane functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_io.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx4.h"
+#include "mlx4_prm.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+#define WQE_ONE_DATA_SEG_SIZE \
+	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
+
+/**
+ * Pointer-value pair used by mlx4_tx_burst_segs() to defer writing the first
+ * DWORD (32 bits) of a TXBB, i.e. the byte count of its leading data segment.
+ */
+struct pv {
+	volatile struct mlx4_wqe_data_seg *dseg; /**< Segment to update. */
+	uint32_t val; /**< Byte count value to store in dseg. */
+};
+
+/** A table to translate Rx completion flags to packet type. */
+uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
+	/*
+	 * The index to the array should have:
+	 *  bit[7] - MLX4_CQE_L2_TUNNEL
+	 *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
+	 *  bit[5] - MLX4_CQE_STATUS_UDP
+	 *  bit[4] - MLX4_CQE_STATUS_TCP
+	 *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
+	 *  bit[2] - MLX4_CQE_STATUS_IPV6
+	 *  bit[1] - MLX4_CQE_STATUS_IPV4F
+	 *  bit[0] - MLX4_CQE_STATUS_IPV4
+	 * giving a total of up to 256 entries.
+	 */
+	[0x00] = RTE_PTYPE_L2_ETHER,
+	[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+	[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_FRAG,
+	[0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_FRAG,
+	[0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+	[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT,
+	[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_FRAG,
+	[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_TCP,
+	[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_TCP,
+	[0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_TCP,
+	[0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_TCP,
+	[0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_TCP,
+	[0x1a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_TCP,
+	[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_UDP,
+	[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_UDP,
+	[0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_L4_UDP,
+	[0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_UDP,
+	[0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_UDP,
+	[0x2a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		     RTE_PTYPE_L4_UDP,
+	/* Tunneled - L3 IPV6 */
+	[0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+	[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+	[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+	[0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT,
+	[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT,
+	[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG,
+	/* Tunneled - L3 IPV6, TCP */
+	[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0x93] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0x9a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_TCP,
+	/* Tunneled - L3 IPV6, UDP (bit[6] clear, outer L3 is IPv6) */
+	[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xa3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xaa] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_UDP,
+	/* Tunneled - L3 IPV4 */
+	[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+	[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+	[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	[0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+	[0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT,
+	[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT,
+	[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_FRAG,
+	/* Tunneled - L3 IPV4, TCP */
+	[0xd0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT |
+		     RTE_PTYPE_INNER_L4_TCP,
+	[0xda] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_TCP,
+	/* Tunneled - L3 IPV4, UDP */
+	[0xe0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xe3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L4_UDP,
+	[0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP,
+	[0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP,
+	[0xea] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+		     RTE_PTYPE_INNER_L4_UDP,
+};
+
+/**
+ * Stamp a WQE so it won't be reused by the HW.
+ *
+ * Routine is used when freeing a WQE used by the chip or when building a
+ * WQ entry has failed, leaving partial information on the queue.
+ *
+ * @param sq
+ *   Pointer to the SQ structure.
+ * @param index
+ *   Index of the freed WQE.
+ *   The number of TXBBs to stamp is not a parameter: it is derived from
+ *   the fence_size field of the WQE control segment, which must therefore
+ *   hold a valid size before this function is called.
+ * @param owner
+ *   The value of the WQE owner bit to use in the stamp.
+ *
+ * @return
+ *   The number of Tx basic blocks (TXBB) the WQE contained.
+ */
+static int
+mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
+{
+	uint32_t stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
+					  (!!owner << MLX4_SQ_STAMP_SHIFT));
+	volatile uint8_t *wqe = mlx4_get_send_wqe(sq,
+						(index & sq->txbb_cnt_mask));
+	volatile uint32_t *ptr = (volatile uint32_t *)wqe;
+	int i;
+	int txbbs_size;
+	int num_txbbs;
+
+	/* Extract the size from the control segment of the WQE. */
+	num_txbbs = MLX4_SIZE_TO_TXBBS((((volatile struct mlx4_wqe_ctrl_seg *)
+					 wqe)->fence_size & 0x3f) << 4);
+	txbbs_size = num_txbbs * MLX4_TXBB_SIZE;
+	/* Optimize the common case when there is no wrap-around. */
+	if (wqe + txbbs_size <= sq->eob) {
+		/* Stamp the freed descriptor. */
+		for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += MLX4_SQ_STAMP_DWORDS;
+		}
+	} else {
+		/* Same, wrapping to sq->buf and toggling stamp bit 31. */
+		for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += MLX4_SQ_STAMP_DWORDS;
+			if ((volatile uint8_t *)ptr >= sq->eob) {
+				ptr = (volatile uint32_t *)sq->buf;
+				stamp ^= RTE_BE32(0x80000000);
+			}
+		}
+	}
+	return num_txbbs;
+}
+
+/**
+ * Manage Tx completions.
+ *
+ * When sending a burst, mlx4_tx_burst() posts several WRs.
+ * To improve performance, a completion event is only required once every
+ * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
+ * for other WRs, but this information would not be used anyway.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param elts_n
+ *   Number of entries in the txq->elts ring, used to wrap elts_tail.
+ * @param sq
+ *   Pointer to the SQ structure whose WQEs are being released.
+ *
+ * @return
+ *   Always 0.
+ */
+static int
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
+				  struct mlx4_sq *sq)
+{
+	unsigned int elts_comp = txq->elts_comp;
+	unsigned int elts_tail = txq->elts_tail;
+	struct mlx4_cq *cq = &txq->mcq;
+	volatile struct mlx4_cqe *cqe;
+	uint32_t cons_index = cq->cons_index;
+	uint16_t new_index;
+	uint16_t nr_txbbs = 0;
+	int pkts = 0;
+
+	/* Walk all reported CQEs and release the WQEs each one covers. */
+	do {
+		cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
+		if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+		    !!(cons_index & cq->cqe_cnt)))
+			break;
+		/* Read the CQE contents only after the ownership bit. */
+		rte_io_rmb();
+#ifndef NDEBUG
+		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+			     MLX4_CQE_OPCODE_ERROR)) {
+			volatile struct mlx4_err_cqe *cqe_err =
+				(volatile struct mlx4_err_cqe *)cqe;
+			ERROR("%p CQE error - vendor syndrome: 0x%x"
+			      " syndrome: 0x%x\n",
+			      (void *)txq, cqe_err->vendor_err,
+			      cqe_err->syndrome);
+		}
+#endif /* NDEBUG */
+		/* Get WQE index reported in the CQE. */
+		new_index =
+			rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;
+		do {
+			/* Free next descriptor. */
+			nr_txbbs +=
+				mlx4_txq_stamp_freed_wqe(sq,
+				     (sq->tail + nr_txbbs) & sq->txbb_cnt_mask,
+				     !!((sq->tail + nr_txbbs) & sq->txbb_cnt));
+			pkts++;
+		} while (((sq->tail + nr_txbbs) & sq->txbb_cnt_mask) !=
+			 new_index);
+		cons_index++;
+	} while (1);
+	/* Nothing completed: leave CQ, SQ and ring state untouched. */
+	if (unlikely(pkts == 0))
+		return 0;
+	/* Update CQ. */
+	cq->cons_index = cons_index;
+	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
+	sq->tail = sq->tail + nr_txbbs;
+	/* Update the list of packets posted for transmission. */
+	elts_comp -= pkts;
+	assert(elts_comp <= txq->elts_comp);
+	/*
+	 * Assume completion status is successful as nothing can be done about
+	 * it anyway.
+	 */
+	elts_tail += pkts;
+	if (elts_tail >= elts_n)
+		elts_tail -= elts_n;
+	txq->elts_tail = elts_tail;
+	txq->elts_comp = elts_comp;
+	return 0;
+}
+
+/**
+ * Look up the memory pool (MP) backing a mbuf. For an indirect mbuf, the
+ * pool of the mbuf returned by rte_mbuf_from_indirect() is used instead.
+ *
+ * @param buf
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Memory pool where data is located for given mbuf.
+ */
+static struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+	if (likely(!RTE_MBUF_INDIRECT(buf)))
+		return buf->pool;
+	return rte_mbuf_from_indirect(buf)->pool;
+}
+
+/**
+ * Build the WQE of a multi-segment packet in the send queue.
+ *
+ * @param buf
+ *   Pointer to the first mbuf segment of the packet.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[out] pctrl
+ *   Receives the address of the WQE control segment once room is found.
+ *
+ * @return
+ *   Number of TXBBs taken by the WQE, -1 on failure.
+ */
+static int
+mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
+		   volatile struct mlx4_wqe_ctrl_seg **pctrl)
+{
+	int wqe_real_size;
+	int nr_txbbs;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
+	struct mlx4_sq *sq = &txq->msq;
+	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl;
+	volatile struct mlx4_wqe_data_seg *dseg;
+	struct rte_mbuf *sbuf;
+	uint32_t lkey;
+	uintptr_t addr;
+	uint32_t byte_count;
+	int pv_counter = 0;
+
+	/* Calculate the needed work queue entry size for this packet. */
+	wqe_real_size = sizeof(volatile struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(volatile struct mlx4_wqe_data_seg);
+	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+	/* Check for room in the send queue and for a legal WQE size. */
+	if (((sq->head - sq->tail) + nr_txbbs +
+				sq->headroom_txbbs) >= sq->txbb_cnt ||
+			nr_txbbs > MLX4_MAX_WQE_TXBBS) {
+		return -1;
+	}
+	/* Get the control and data entries of the WQE. */
+	ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
+			mlx4_get_send_wqe(sq, head_idx);
+	dseg = (volatile struct mlx4_wqe_data_seg *)
+			((uintptr_t)ctrl + sizeof(struct mlx4_wqe_ctrl_seg));
+	*pctrl = ctrl;
+	/* Fill the data segments with buffer information. */
+	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+		rte_prefetch0((volatile void *)addr);
+		/* Handle WQE wraparound. */
+		if (dseg >= (volatile struct mlx4_wqe_data_seg *)sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)sq->buf;
+		dseg->addr = rte_cpu_to_be_64(addr);
+		/* Memory region key (big endian) for this memory pool. */
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+		dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+		/* Check that a MR was found for this memory pool. */
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+					(void *)txq);
+			/*
+			 * Restamp entry in case of failure, size must be set
+			 * first. Ownership is given to the SW, not the HW.
+			 */
+			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+			mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
+			return -1;
+		}
+#endif /* NDEBUG */
+		if (likely(sbuf->data_len)) {
+			byte_count = rte_cpu_to_be_32(sbuf->data_len);
+		} else {
+			/* Zero length segment: inline segment, no data. */
+			byte_count = RTE_BE32(0x80000000);
+		}
+		/*
+		 * Write the byte count unless this data segment begins a
+		 * TXBB, then postpone it until the control segment update.
+		 */
+		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+#if RTE_CACHE_LINE_SIZE < 64
+			/*
+			 * Need a barrier here before writing the byte_count
+			 * fields to make sure that all the data is visible
+			 * before the byte_count field is set.
+			 * Otherwise, if the segment begins a new cacheline,
+			 * the HCA prefetcher could grab the 64-byte chunk and
+			 * get a valid (!= 0xffffffff) byte count but stale
+			 * data, and end up sending the wrong data.
+			 */
+			rte_io_wmb();
+#endif /* RTE_CACHE_LINE_SIZE */
+			dseg->byte_count = byte_count;
+		} else {
+			/* Segment begins a TXBB: postpone byte_count. */
+			pv[pv_counter].dseg = dseg;
+			pv[pv_counter++].val = byte_count;
+		}
+	}
+	/* Write the first DWORD of each TXBB saved earlier. */
+	if (pv_counter) {
+		/* Need a barrier here before writing the byte_count. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+	}
+	/* Fill the control parameters for this packet. */
+	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+	return nr_txbbs;
+}
+
+/**
+ * DPDK callback for Tx.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	unsigned int elts_head = txq->elts_head;
+	const unsigned int elts_n = txq->elts_n;
+	unsigned int bytes_sent = 0;
+	unsigned int i;
+	unsigned int max;
+	struct mlx4_sq *sq = &txq->msq;
+	int nr_txbbs;
+
+	assert(txq->elts_comp_cd != 0);
+	if (likely(txq->elts_comp != 0))
+		mlx4_txq_complete(txq, elts_n, sq);
+	max = (elts_n - (elts_head - txq->elts_tail));
+	if (max > elts_n)
+		max -= elts_n;
+	assert(max >= 1);
+	assert(max <= elts_n);
+	/* Always leave one free entry in the ring. */
+	--max;
+	if (max > pkts_n)
+		max = pkts_n;
+	for (i = 0; (i != max); ++i) {
+		struct rte_mbuf *buf = pkts[i];
+		unsigned int elts_head_next =
+			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
+		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
+		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		uint32_t owner_opcode = MLX4_OPCODE_SEND;
+		volatile struct mlx4_wqe_ctrl_seg *ctrl;
+		volatile struct mlx4_wqe_data_seg *dseg;
+		union {
+			uint32_t flags;
+			uint16_t flags16[2];
+		} srcrb;
+		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+		uint32_t lkey;
+		uintptr_t addr;
+
+		/* Clean up old buffer. */
+		if (likely(elt->buf != NULL)) {
+			struct rte_mbuf *tmp = elt->buf;
+
+#ifndef NDEBUG
+			/* Poisoning. */
+			memset(elt, 0x66, sizeof(*elt));
+#endif
+			/* Faster than rte_pktmbuf_free(). */
+			do {
+				struct rte_mbuf *next = tmp->next;
+
+				rte_pktmbuf_free_seg(tmp);
+				tmp = next;
+			} while (tmp != NULL);
+		}
+		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+		if (buf->nb_segs == 1) {
+			/*
+			 * Check that there is room for this WQE in the send
+			 * queue and that the WQE size is legal.
+			 */
+			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs) >=
+			     sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) {
+				elt->buf = NULL;
+				break;
+			}
+			/* Get the control and data entries of the WQE. */
+			ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
+					mlx4_get_send_wqe(sq, head_idx);
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)ctrl +
+					sizeof(struct mlx4_wqe_ctrl_seg));
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			rte_prefetch0((volatile void *)addr);
+			/* Handle WQE wraparound. */
+			if (dseg >=
+				(volatile struct mlx4_wqe_data_seg *)sq->eob)
+				dseg = (volatile struct mlx4_wqe_data_seg *)
+						sq->buf;
+			dseg->addr = rte_cpu_to_be_64(addr);
+			/* Memory region key (big endian). */
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+			dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+			if (unlikely(dseg->lkey ==
+				rte_cpu_to_be_32((uint32_t)-1))) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR association",
+				      (void *)txq);
+				/*
+				 * Restamp entry in case of failure.
+				 * Make sure that size is written correctly.
+				 * Note that we give ownership to the SW,
+				 * not the HW.
+				 */
+				ctrl->fence_size =
+					(WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+				mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					     (sq->head & sq->txbb_cnt) ? 0 : 1);
+				elt->buf = NULL;
+				break;
+			}
+#endif /* NDEBUG */
+			/* Never TXBB aligned, no compiler barrier needed. */
+			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+			/* Fill the control parameters for this packet. */
+			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+			nr_txbbs = 1;
+		} else {
+			nr_txbbs = mlx4_tx_burst_segs(buf, txq, &ctrl);
+			if (nr_txbbs < 0) {
+				elt->buf = NULL;
+				break;
+			}
+		}
+		/*
+		 * For raw Ethernet, SOLICIT indicates that no ICRC should be
+		 * calculated. CQ_UPDATE is only set when the countdown expires.
+		 */
+		txq->elts_comp_cd -= nr_txbbs;
+		if (unlikely(txq->elts_comp_cd <= 0)) {
+			txq->elts_comp_cd = txq->elts_comp_cd_init;
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+					       MLX4_WQE_CTRL_CQ_UPDATE);
+		} else {
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
+		}
+		/* Enable HW checksum offload if requested. */
+		if (txq->csum &&
+		    (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+			const uint64_t is_tunneled = (buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN));
+
+			if (is_tunneled && txq->csum_l2tun) {
+				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+						MLX4_WQE_CTRL_IL4_HDR_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					srcrb.flags |=
+					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+			} else {
+				srcrb.flags |=
+					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+						MLX4_WQE_CTRL_TCP_UDP_CSUM);
+			}
+		}
+		if (txq->lb) {
+			/*
+			 * Copy destination MAC address to the WQE, this allows
+			 * loopback in eSwitch, so that VFs and PF can
+			 * communicate with each other.
+			 */
+			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+					      sizeof(uint16_t)));
+		} else {
+			ctrl->imm = 0;
+		}
+		ctrl->srcrb_flags = srcrb.flags;
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		rte_io_wmb();
+		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+					      ((sq->head & sq->txbb_cnt) ?
+						       MLX4_BIT_WQE_OWN : 0));
+		sq->head += nr_txbbs;
+		elt->buf = buf;
+		bytes_sent += buf->pkt_len;
+		elts_head = elts_head_next;
+	}
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Increment send statistics counters. */
+	txq->stats.opackets += i;
+	txq->stats.obytes += bytes_sent;
+	/* Make sure that descriptors are written before doorbell record. */
+	rte_wmb();
+	/* Ring QP doorbell. */
+	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
+	txq->elts_head = elts_head;
+	txq->elts_comp += i;
+	return i;
+}
+
+/**
+ * Convert the flags of a Rx completion into a mbuf packet type.
+ *
+ * @param[in] cqe
+ *   Pointer to CQE.
+ *
+ * @return
+ *   Packet type for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe)
+{
+	const uint32_t vlan_qpn = rte_be_to_cpu_32(cqe->vlan_my_qpn);
+	const uint32_t status = rte_be_to_cpu_32(cqe->status);
+	uint8_t tunnel = 0;
+	uint8_t idx;
+
+	/*
+	 * Layout of the mlx4_ptype_table[] index built below:
+	 *  bit[7] - MLX4_CQE_L2_TUNNEL
+	 *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
+	 *  bit[5] - MLX4_CQE_STATUS_UDP
+	 *  bit[4] - MLX4_CQE_STATUS_TCP
+	 *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
+	 *  bit[2] - MLX4_CQE_STATUS_IPV6
+	 *  bit[1] - MLX4_CQE_STATUS_IPV4F
+	 *  bit[0] - MLX4_CQE_STATUS_IPV4
+	 */
+	/* Tunnel bits are only used when the VLAN mask bits are clear. */
+	if ((vlan_qpn & MLX4_CQE_L2_TUNNEL) &&
+	    !(vlan_qpn & MLX4_CQE_L2_VLAN_MASK))
+		tunnel = ((vlan_qpn & MLX4_CQE_L2_TUNNEL) >> 20) |
+			 ((vlan_qpn & MLX4_CQE_L2_TUNNEL_IPV4) >> 19);
+	/* Remaining bits come from the packet type part of the status. */
+	idx = tunnel | ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22);
+	return mlx4_ptype_table[idx];
+}
+
+/**
+ * Convert Rx completion flags into mbuf offload flags.
+ *
+ * @param flags
+ *   Rx completion flags returned by mlx4_cqe_flags().
+ * @param csum
+ *   Whether Rx checksums are enabled.
+ * @param csum_l2tun
+ *   Whether Rx L2 tunnel checksums are enabled.
+ *
+ * @return
+ *   Offload flags (ol_flags) in mbuf format.
+ */
+static inline uint32_t
+rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun)
+{
+	uint32_t ol_flags = 0;
+
+	if (csum) {
+		ol_flags |= mlx4_transpose(flags,
+					   MLX4_CQE_STATUS_IP_HDR_CSUM_OK,
+					   PKT_RX_IP_CKSUM_GOOD);
+		ol_flags |= mlx4_transpose(flags,
+					   MLX4_CQE_STATUS_TCP_UDP_CSUM_OK,
+					   PKT_RX_L4_CKSUM_GOOD);
+	}
+	if (csum_l2tun && (flags & MLX4_CQE_L2_TUNNEL)) {
+		ol_flags |= mlx4_transpose(flags,
+					   MLX4_CQE_L2_TUNNEL_IPOK,
+					   PKT_RX_IP_CKSUM_GOOD);
+		ol_flags |= mlx4_transpose(flags,
+					   MLX4_CQE_L2_TUNNEL_L4_CSUM,
+					   PKT_RX_L4_CKSUM_GOOD);
+	}
+	return ol_flags;
+}
+
+/**
+ * Gather the checksum related bits of a CQE into a single value.
+ *
+ * @param cqe
+ *   Pointer to CQE structure.
+ * @param csum
+ *   Whether Rx checksums are enabled.
+ * @param csum_l2tun
+ *   Whether Rx L2 tunnel checksums are enabled.
+ *
+ * @return
+ *   CQE checksum information.
+ */
+static inline uint32_t
+mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
+{
+	const uint32_t l2tun_mask = MLX4_CQE_L2_TUNNEL |
+				    MLX4_CQE_L2_TUNNEL_IPOK |
+				    MLX4_CQE_L2_TUNNEL_L4_CSUM |
+				    MLX4_CQE_L2_TUNNEL_IPV4;
+	uint32_t flags = 0;
+
+	/*
+	 * The status and vlan_my_qpn bits of interest sit at different
+	 * positions in their respective CQE fields, hence both sets can
+	 * share one 32-bit variable.
+	 */
+	if (csum)
+		flags = rte_be_to_cpu_32(cqe->status) &
+			MLX4_CQE_STATUS_IPV4_CSUM_OK;
+	if (csum_l2tun)
+		flags |= rte_be_to_cpu_32(cqe->vlan_my_qpn) & l2tun_mask;
+	return flags;
+}
+
+/**
+ * Poll one CQE from CQ.
+ *
+ * @param rxq
+ *   Pointer to the receive queue structure.
+ * @param[out] out
+ *   CQE found at the current consumer index (set in all cases).
+ *
+ * @return
+ *   Byte count found in the CQE, 0 in case there is no completion.
+ */
+static unsigned int
+mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
+{
+	int ret = 0;
+	volatile struct mlx4_cqe *cqe = NULL;
+	struct mlx4_cq *cq = &rxq->mcq;
+
+	cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+	    !!(cq->cons_index & cq->cqe_cnt))
+		goto out;
+	/*
+	 * The owner bit agrees with (cons_index & cqe_cnt): a completion is
+	 * available and the consumer index is advanced past it below.
+	 */
+	/* Read CQ entry contents only after checking the ownership bit. */
+	rte_rmb();
+	assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+	assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+	       MLX4_CQE_OPCODE_ERROR);
+	ret = rte_be_to_cpu_32(cqe->byte_cnt);
+	++cq->cons_index;
+out:
+	*out = cqe;
+	return ret;
+}
+
+/**
+ * DPDK callback for Rx with scattered packets support.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct rxq *rxq = dpdk_rxq;
+	const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
+	const uint16_t sges_n = rxq->sges_n;
+	struct rte_mbuf *pkt = NULL;
+	struct rte_mbuf *seg = NULL;
+	unsigned int i = 0;
+	uint32_t rq_ci = rxq->rq_ci << sges_n;
+	int len = 0;
+
+	while (pkts_n) {
+		volatile struct mlx4_cqe *cqe;
+		uint32_t idx = rq_ci & wr_cnt;
+		struct rte_mbuf *rep = (*rxq->elts)[idx];
+		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
+
+		/* Update the 'next' pointer of the previous segment. */
+		if (pkt)
+			seg->next = rep;
+		seg = rep;
+		rte_prefetch0(seg);
+		rte_prefetch0(scat);
+		rep = rte_mbuf_raw_alloc(rxq->mp);
+		if (unlikely(rep == NULL)) {
+			++rxq->stats.rx_nombuf;
+			if (!pkt) {
+				/* No packet in progress, bail out silently. */
+				break;
+			}
+			/* Free the segments chained so far, up to seg. */
+			while (pkt != seg) {
+				assert(pkt != (*rxq->elts)[idx]);
+				rep = pkt->next;
+				pkt->next = NULL;
+				pkt->nb_segs = 1;
+				rte_mbuf_raw_free(pkt);
+				pkt = rep;
+			}
+			break;
+		}
+		if (!pkt) {
+			/* Looking for the new packet. */
+			len = mlx4_cq_poll_one(rxq, &cqe);
+			if (!len) {
+				rte_mbuf_raw_free(rep);
+				break;
+			}
+			if (unlikely(len < 0)) {
+				/* Rx error, packet is likely too large. */
+				rte_mbuf_raw_free(rep);
+				++rxq->stats.idropped;
+				goto skip;
+			}
+			pkt = seg;
+			/* Update packet information. */
+			pkt->packet_type = rxq_cq_to_pkt_type(cqe);
+			pkt->ol_flags = 0;
+			pkt->pkt_len = len;
+			if (rxq->csum | rxq->csum_l2tun) {
+				uint32_t flags =
+					mlx4_cqe_flags(cqe,
+						       rxq->csum,
+						       rxq->csum_l2tun);
+
+				pkt->ol_flags =
+					rxq_cq_to_ol_flags(flags,
+							   rxq->csum,
+							   rxq->csum_l2tun);
+			}
+		}
+		/* The replacement inherits data_len/data_off from seg. */
+		rep->nb_segs = 1;
+		rep->port = rxq->port_id;
+		rep->data_len = seg->data_len;
+		rep->data_off = seg->data_off;
+		(*rxq->elts)[idx] = rep;
+		/*
+		 * Fill NIC descriptor with the new buffer. The lkey and size
+		 * of the buffers are already known, only the buffer address
+		 * changes.
+		 */
+		scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+		/* Data left beyond this segment: move on to the next one. */
+		if (len > seg->data_len) {
+			len -= seg->data_len;
+			++pkt->nb_segs;
+			++rq_ci;
+			continue;
+		}
+		/* The last segment. */
+		seg->data_len = len;
+		/* Increment bytes counter. */
+		rxq->stats.ibytes += pkt->pkt_len;
+		/* Return packet. */
+		*(pkts++) = pkt;
+		pkt = NULL;
+		--pkts_n;
+		++i;
+skip:
+		/* Align consumer index to the next stride. */
+		rq_ci >>= sges_n;
+		++rq_ci;
+		rq_ci <<= sges_n;
+	}
+	if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
+		return 0;
+	/* Update the consumer index. */
+	rxq->rq_ci = rq_ci >> sges_n;
+	rte_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	*rxq->mcq.set_ci_db =
+		rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK);
+	/* Increment packets counter. */
+	rxq->stats.ipackets += i;
+	return i;
+}
+
+/**
+ * Dummy DPDK callback for Tx.
+ *
+ * Stand-in for the real Tx callback, installed temporarily during unsafe
+ * control operations on the queue, or in case of error.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure (unused).
+ * @param[in] pkts
+ *   Packets to transmit (unused).
+ * @param pkts_n
+ *   Number of packets in array (unused).
+ *
+ * @return
+ *   Always 0, no packet is ever transmitted.
+ */
+uint16_t
+mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	RTE_SET_USED(dpdk_txq);
+	RTE_SET_USED(pkts);
+	RTE_SET_USED(pkts_n);
+	return 0;
+}
+
+/**
+ * Dummy DPDK callback for Rx.
+ *
+ * Stand-in for the real Rx callback, installed temporarily during unsafe
+ * control operations on the queue, or in case of error.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure (unused).
+ * @param[out] pkts
+ *   Array to store received packets (left untouched).
+ * @param pkts_n
+ *   Maximum number of packets in array (unused).
+ *
+ * @return
+ *   Always 0, no packet is ever received.
+ */
+uint16_t
+mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	RTE_SET_USED(dpdk_rxq);
+	RTE_SET_USED(pkts);
+	RTE_SET_USED(pkts_n);
+	return 0;
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
new file mode 100644
index 00000000..4acad801
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -0,0 +1,214 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MLX4_RXTX_H_
+#define MLX4_RXTX_H_
+
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_prm.h"
+
+/** Rx queue counters. */
+struct mlx4_rxq_stats {
+	unsigned int idx; /**< Mapping index. */
+	uint64_t ipackets; /**< Total of successfully received packets. */
+	uint64_t ibytes; /**< Total of successfully received bytes. */
+	uint64_t idropped; /**< Total of packets dropped when Rx ring full. */
+	uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
+};
+
+/** Rx queue descriptor. */
+struct rxq {
+	struct priv *priv; /**< Back pointer to private data. */
+	struct rte_mempool *mp; /**< Memory pool for allocations. */
+	struct mlx4_mr *mr; /**< Memory region. */
+	struct ibv_cq *cq; /**< Completion queue. */
+	struct ibv_wq *wq; /**< Work queue. */
+	struct ibv_comp_channel *channel; /**< Rx completion channel. */
+	uint16_t rq_ci; /**< Saved RQ consumer index. */
+	uint16_t port_id; /**< Port ID for incoming packets. */
+	uint16_t sges_n; /**< Number of segments per packet (log2 value). */
+	uint16_t elts_n; /**< Mbuf queue size (log2 value). */
+	struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+	volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */
+	volatile uint32_t *rq_db; /**< RQ doorbell record. */
+	uint32_t csum:1; /**< Enable checksum offloading. */
+	uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+	struct mlx4_cq mcq;  /**< Info for directly manipulating the CQ. */
+	struct mlx4_rxq_stats stats; /**< Rx queue counters. */
+	unsigned int socket; /**< CPU socket ID for allocations. */
+	uint32_t usecnt; /**< Number of users relying on queue resources. */
+	uint8_t data[]; /**< Remaining queue resources. */
+};
+
+/** Shared flow target for Rx queues. */
+struct mlx4_rss {
+	LIST_ENTRY(mlx4_rss) next; /**< Next entry in list. */
+	struct priv *priv; /**< Back pointer to private data. */
+	uint32_t refcnt; /**< Reference count for this object. */
+	uint32_t usecnt; /**< Number of users relying on @p qp and @p ind. */
+	struct ibv_qp *qp; /**< Queue pair. */
+	struct ibv_rwq_ind_table *ind; /**< Indirection table. */
+	uint64_t fields; /**< Fields for RSS processing (Verbs format). */
+	uint8_t key[MLX4_RSS_HASH_KEY_SIZE]; /**< Hash key to use. */
+	uint16_t queues; /**< Number of target queues. */
+	uint16_t queue_id[]; /**< Target queues. */
+};
+
+/** Tx element. */
+struct txq_elt {
+	struct rte_mbuf *buf; /**< Buffer. */
+};
+
+/** Tx queue counters. */
+struct mlx4_txq_stats {
+	unsigned int idx; /**< Mapping index. */
+	uint64_t opackets; /**< Total of successfully sent packets. */
+	uint64_t obytes; /**< Total of successfully sent bytes. */
+	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+};
+
+/** Tx queue descriptor. */
+struct txq {
+	struct mlx4_sq msq; /**< Info for directly manipulating the SQ. */
+	struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
+	unsigned int elts_head; /**< Current index in (*elts)[]. */
+	unsigned int elts_tail; /**< First element awaiting completion. */
+	unsigned int elts_comp; /**< Number of packets awaiting completion. */
+	int elts_comp_cd; /**< Countdown for next completion. */
+	unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
+	unsigned int elts_n; /**< (*elts)[] length. */
+	struct txq_elt (*elts)[]; /**< Tx elements. */
+	struct mlx4_txq_stats stats; /**< Tx queue counters. */
+	uint32_t max_inline; /**< Max inline send size. */
+	uint32_t csum:1; /**< Enable checksum offloading. */
+	uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+	uint32_t lb:1; /**< Whether packets should be looped back by eSwitch. */
+	uint8_t *bounce_buf;
+	/**< Memory used for storing the first DWORD of data TXBBs. */
+	struct {
+		const struct rte_mempool *mp; /**< Cached memory pool. */
+		struct mlx4_mr *mr; /**< Memory region (for mp). */
+		uint32_t lkey; /**< mr->lkey copy. */
+	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
+	struct priv *priv; /**< Back pointer to private data. */
+	unsigned int socket; /**< CPU socket ID for allocations. */
+	struct ibv_cq *cq; /**< Completion queue. */
+	struct ibv_qp *qp; /**< Queue pair. */
+	uint8_t data[]; /**< Remaining queue resources. */
+};
+
+/* mlx4_rxq.c */
+
+extern uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
+int mlx4_rss_init(struct priv *priv);
+void mlx4_rss_deinit(struct priv *priv);
+struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
+			      uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+			      uint16_t queues, const uint16_t queue_id[]);
+void mlx4_rss_put(struct mlx4_rss *rss);
+int mlx4_rss_attach(struct mlx4_rss *rss);
+void mlx4_rss_detach(struct mlx4_rss *rss);
+int mlx4_rxq_attach(struct rxq *rxq);
+void mlx4_rxq_detach(struct rxq *rxq);
+int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+			uint16_t desc, unsigned int socket,
+			const struct rte_eth_rxconf *conf,
+			struct rte_mempool *mp);
+void mlx4_rx_queue_release(void *dpdk_rxq);
+
+/* mlx4_rxtx.c */
+
+uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
+		       uint16_t pkts_n);
+uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
+		       uint16_t pkts_n);
+uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+
+/* mlx4_txq.c */
+
+int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+			uint16_t desc, unsigned int socket,
+			const struct rte_eth_txconf *conf);
+void mlx4_tx_queue_release(void *dpdk_txq);
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Call mlx4_txq_add_mr() if MP is not registered yet.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static inline uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* First unused entry: MP is unknown, add a new MR. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			/* MP found, return its cached lkey. */
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	return mlx4_txq_add_mr(txq, mp, i);
+}
+
+#endif /* MLX4_RXTX_H_ */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
new file mode 100644
index 00000000..7882a4d0
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -0,0 +1,414 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Tx queues configuration for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_autoconf.h"
+#include "mlx4_prm.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Free mbufs still held between elts_tail and elts_head, then empty the ring.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ */
+static void
+mlx4_txq_free_elts(struct txq *txq)
+{
+	unsigned int elts_head = txq->elts_head;
+	unsigned int elts_tail = txq->elts_tail;
+	struct txq_elt (*elts)[txq->elts_n] = txq->elts;
+
+	DEBUG("%p: freeing WRs", (void *)txq);
+	while (elts_tail != elts_head) {
+		struct txq_elt *elt = &(*elts)[elts_tail];
+
+		assert(elt->buf != NULL);
+		rte_pktmbuf_free(elt->buf);
+		elt->buf = NULL;
+		if (++elts_tail == RTE_DIM(*elts))
+			elts_tail = 0; /* Wrap around. */
+	}
+	txq->elts_tail = txq->elts_head; /* Ring is now empty. */
+}
+
+struct txq_mp2mr_mbuf_check_data {
+	int ret; /**< Set to -1 when an object does not look like a mbuf. */
+};
+
+/**
+ * Callback function for rte_mempool_obj_iter() to check whether a given
+ * mempool object looks like a mbuf.
+ *
+ * @param[in] mp
+ *   The mempool pointer.
+ * @param[in, out] arg
+ *   Context data (struct txq_mp2mr_mbuf_check_data). Its ret field is
+ *   set to -1 when the check fails, left untouched otherwise.
+ * @param[in] obj
+ *   Object address.
+ * @param index
+ *   Object index, unused.
+ */
+static void
+mlx4_txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
+			  uint32_t index)
+{
+	struct txq_mp2mr_mbuf_check_data *data = arg;
+	struct rte_mbuf *buf = obj;
+
+	(void)index;
+	/*
+	 * Check whether mbuf structure fits element size and whether mempool
+	 * pointer is valid.
+	 */
+	if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
+		data->ret = -1;
+}
+
+/**
+ * Iterator function for rte_mempool_walk() to register existing mempools and
+ * fill the MP to MR cache of a Tx queue.
+ *
+ * @param[in] mp
+ *   Memory Pool to register.
+ * @param arg
+ *   Pointer to Tx queue structure.
+ */
+static void
+mlx4_txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
+{
+	struct txq *txq = arg;
+	struct txq_mp2mr_mbuf_check_data data = {
+		.ret = 0,
+	};
+
+	/* Skip mempools that are empty or do not appear to hold mbufs. */
+	if (rte_mempool_obj_iter(mp, mlx4_txq_mp2mr_mbuf_check, &data) == 0 ||
+			data.ret == -1)
+		return;
+	mlx4_txq_mp2mr(txq, mp);
+}
+
+/**
+ * Fill txq->msq and txq->mcq with data needed to access the queue directly.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param mlxdv
+ *   Device queue description (qp.out and cq.out must already be filled).
+ */
+static void
+mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	struct mlx4_cq *cq = &txq->mcq;
+	struct mlx4dv_qp *dqp = mlxdv->qp.out;
+	struct mlx4dv_cq *dcq = mlxdv->cq.out;
+	uint32_t sq_size = (uint32_t)dqp->rq.offset - (uint32_t)dqp->sq.offset;
+
+	sq->buf = (uint8_t *)dqp->buf.buf + dqp->sq.offset;
+	/* Total length, including headroom and spare WQEs. */
+	sq->eob = sq->buf + sq_size;
+	sq->head = 0;
+	sq->tail = 0;
+	sq->txbb_cnt =
+		(dqp->sq.wqe_cnt << dqp->sq.wqe_shift) >> MLX4_TXBB_SHIFT;
+	sq->txbb_cnt_mask = sq->txbb_cnt - 1; /* Assumes power of 2. */
+	sq->db = dqp->sdb;
+	sq->doorbell_qpn = dqp->doorbell_qpn;
+	sq->headroom_txbbs =
+		(2048 + (1 << dqp->sq.wqe_shift)) >> MLX4_TXBB_SHIFT;
+	cq->buf = dcq->buf.buf;
+	cq->cqe_cnt = dcq->cqe_cnt;
+	cq->set_ci_db = dcq->set_ci_db;
+	cq->cqe_64 = (dcq->cqe_size & 64) ? 1 : 0; /* Set for 64-byte CQEs. */
+}
+
+/**
+ * DPDK callback to configure a Tx queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Tx queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue.
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Thresholds parameters (currently ignored).
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+		    unsigned int socket, const struct rte_eth_txconf *conf)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx4dv_obj mlxdv;
+	struct mlx4dv_qp dv_qp;
+	struct mlx4dv_cq dv_cq;
+	struct txq_elt (*elts)[desc];
+	struct ibv_qp_init_attr qp_init_attr;
+	struct txq *txq;
+	uint8_t *bounce_buf;
+	struct mlx4_malloc_vec vec[] = {
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = sizeof(*txq),
+			.addr = (void **)&txq,
+		},
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = sizeof(*elts),
+			.addr = (void **)&elts,
+		},
+		{
+			.align = RTE_CACHE_LINE_SIZE,
+			.size = MLX4_MAX_WQE_SIZE,
+			.addr = (void **)&bounce_buf,
+		},
+	};
+	int ret;
+
+	(void)conf; /* Thresholds configuration (ignored). */
+	DEBUG("%p: configuring queue %u for %u descriptors",
+	      (void *)dev, idx, desc);
+	if (idx >= dev->data->nb_tx_queues) {
+		rte_errno = EOVERFLOW;
+		ERROR("%p: queue index out of range (%u >= %u)",
+		      (void *)dev, idx, dev->data->nb_tx_queues);
+		return -rte_errno;
+	}
+	txq = dev->data->tx_queues[idx];
+	if (txq) {
+		rte_errno = EEXIST;
+		DEBUG("%p: Tx queue %u already configured, release it first",
+		      (void *)dev, idx);
+		return -rte_errno;
+	}
+	if (!desc) {
+		rte_errno = EINVAL;
+		ERROR("%p: invalid number of Tx descriptors", (void *)dev);
+		return -rte_errno;
+	}
+	/* Allocate queue, elements and bounce buffer in one go. */
+	mlx4_zmallocv_socket("TXQ", vec, RTE_DIM(vec), socket);
+	if (!txq) {
+		ERROR("%p: unable to allocate queue index %u",
+		      (void *)dev, idx);
+		return -rte_errno;
+	}
+	*txq = (struct txq){
+		.priv = priv,
+		.stats = {
+			.idx = idx,
+		},
+		.socket = socket,
+		.elts_n = desc,
+		.elts = elts,
+		.elts_head = 0,
+		.elts_tail = 0,
+		.elts_comp = 0,
+		/*
+		 * Request completion every MLX4_PMD_TX_PER_COMP_REQ packets or
+		 * at least 4 times per ring (NOTE: desc / 4 is 0 if desc < 4).
+		 */
+		.elts_comp_cd =
+			RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
+		.elts_comp_cd_init =
+			RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
+		.csum = priv->hw_csum,
+		.csum_l2tun = priv->hw_csum_l2tun,
+		/* Enable Tx loopback for VF devices. */
+		.lb = !!priv->vf,
+		.bounce_buf = bounce_buf,
+	};
+	txq->cq = ibv_create_cq(priv->ctx, desc, NULL, NULL, 0);
+	if (!txq->cq) {
+		rte_errno = ENOMEM;
+		ERROR("%p: CQ creation failure: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	qp_init_attr = (struct ibv_qp_init_attr){
+		.send_cq = txq->cq,
+		.recv_cq = txq->cq,
+		.cap = {
+			.max_send_wr =
+				RTE_MIN(priv->device_attr.max_qp_wr, desc),
+			.max_send_sge = 1,
+			.max_inline_data = MLX4_PMD_MAX_INLINE,
+		},
+		.qp_type = IBV_QPT_RAW_PACKET,
+		/* No completion events must occur by default. */
+		.sq_sig_all = 0,
+	};
+	txq->qp = ibv_create_qp(priv->pd, &qp_init_attr);
+	if (!txq->qp) {
+		rte_errno = errno ? errno : EINVAL;
+		ERROR("%p: QP creation failure: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	txq->max_inline = qp_init_attr.cap.max_inline_data;
+	ret = ibv_modify_qp
+		(txq->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_INIT,
+			.port_num = priv->port,
+		 },
+		 IBV_QP_STATE | IBV_QP_PORT);
+	if (ret) {
+		rte_errno = ret;
+		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	ret = ibv_modify_qp
+		(txq->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_RTR,
+		 },
+		 IBV_QP_STATE);
+	if (ret) {
+		rte_errno = ret;
+		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	ret = ibv_modify_qp
+		(txq->qp,
+		 &(struct ibv_qp_attr){
+			.qp_state = IBV_QPS_RTS,
+		 },
+		 IBV_QP_STATE);
+	if (ret) {
+		rte_errno = ret;
+		ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
+	/* Retrieve device queue information. */
+	mlxdv.cq.in = txq->cq;
+	mlxdv.cq.out = &dv_cq;
+	mlxdv.qp.in = txq->qp;
+	mlxdv.qp.out = &dv_qp;
+	ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ);
+	if (ret) {
+		rte_errno = EINVAL;
+		ERROR("%p: failed to obtain information needed for"
+		      " accessing the device queues", (void *)dev);
+		goto error;
+	}
+	mlx4_txq_fill_dv_obj_info(txq, &mlxdv);
+	/* Pre-register known mempools. */
+	rte_mempool_walk(mlx4_txq_mp2mr_iter, txq);
+	DEBUG("%p: adding Tx queue %p to list", (void *)dev, (void *)txq);
+	dev->data->tx_queues[idx] = txq;
+	return 0;
+error:
+	dev->data->tx_queues[idx] = NULL;
+	ret = rte_errno; /* Save error code across queue release. */
+	mlx4_tx_queue_release(txq);
+	rte_errno = ret; /* Report the original failure to the caller. */
+	assert(rte_errno > 0);
+	return -rte_errno;
+}
+
+/**
+ * DPDK callback to release a Tx queue.
+ *
+ * @param dpdk_txq
+ *   Generic Tx queue pointer, NULL is ignored.
+ */
+void
+mlx4_tx_queue_release(void *dpdk_txq)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	struct priv *priv;
+	unsigned int i;
+
+	if (txq == NULL)
+		return;
+	priv = txq->priv;
+	for (i = 0; i != priv->dev->data->nb_tx_queues; ++i)
+		if (priv->dev->data->tx_queues[i] == txq) {
+			DEBUG("%p: removing Tx queue %p from list",
+			      (void *)priv->dev, (void *)txq);
+			priv->dev->data->tx_queues[i] = NULL;
+			break;
+		}
+	mlx4_txq_free_elts(txq);
+	if (txq->qp)
+		claim_zero(ibv_destroy_qp(txq->qp));
+	if (txq->cq)
+		claim_zero(ibv_destroy_cq(txq->cq));
+	for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) {
+		if (!txq->mp2mr[i].mp)
+			break; /* Stop at first unused entry. */
+		assert(txq->mp2mr[i].mr);
+		mlx4_mr_put(txq->mp2mr[i].mr);
+	}
+	rte_free(txq);
+}
diff --git a/drivers/net/mlx4/mlx4_utils.c b/drivers/net/mlx4/mlx4_utils.c
new file mode 100644
index 00000000..f18c7145
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_utils.c
@@ -0,0 +1,217 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Utility functions used by the mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_memory.h>
+
+#include "mlx4_utils.h"
+
+/**
+ * Make a file descriptor non-blocking while preserving its other flags.
+ *
+ * @param fd
+ *   File descriptor to alter.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_fd_set_non_blocking(int fd)
+{
+	int ret = fcntl(fd, F_GETFL); /* Current file status flags. */
+
+	if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
+		return 0;
+	assert(errno); /* Either fcntl() call failed. */
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
+ * Internal helper to allocate memory once for several disparate objects.
+ *
+ * The most restrictive alignment constraint for standard objects is assumed
+ * to be sizeof(double) and is used as a default value.
+ *
+ * C11 code would include stdalign.h and use alignof(max_align_t) however
+ * we'll stick with C99 for the time being.
+ */
+static inline size_t
+mlx4_mallocv_inline(const char *type, const struct mlx4_malloc_vec *vec,
+		    unsigned int cnt, int zero, int socket)
+{
+	unsigned int i;
+	size_t size; /* Running total, padding included. */
+	size_t least; /* Largest alignment requested so far. */
+	uint8_t *data = NULL; /* Base address, NULL until allocated. */
+	int fill = !vec[0].addr; /* Offsets-only mode; assumes cnt > 0. */
+
+fill:
+	size = 0;
+	least = 0;
+	for (i = 0; i < cnt; ++i) {
+		size_t align = (uintptr_t)vec[i].align;
+
+		if (!align) {
+			align = sizeof(double);
+		} else if (!rte_is_power_of_2(align)) {
+			rte_errno = EINVAL;
+			goto error;
+		}
+		if (least < align)
+			least = align;
+		align = RTE_ALIGN_CEIL(size, align); /* Object offset. */
+		size = align + vec[i].size;
+		if (fill && vec[i].addr)
+			*vec[i].addr = data + align;
+	}
+	if (fill)
+		return size; /* Second pass or offsets-only mode. */
+	if (!zero)
+		data = rte_malloc_socket(type, size, least, socket);
+	else
+		data = rte_zmalloc_socket(type, size, least, socket);
+	if (data) {
+		fill = 1;
+		goto fill; /* Second pass stores the addresses. */
+	}
+	rte_errno = ENOMEM;
+error:
+	for (i = 0; i != cnt; ++i)
+		if (vec[i].addr)
+			*vec[i].addr = NULL;
+	return 0;
+}
+
+/**
+ * Allocate memory once for several disparate objects.
+ *
+ * This function adds iovec-like semantics (e.g. readv()) to rte_malloc().
+ * Memory is allocated once for several contiguous objects of nonuniform
+ * sizes and alignment constraints.
+ *
+ * Each entry of @p vec describes the size, alignment constraint and
+ * provides a buffer address where the resulting object pointer must be
+ * stored.
+ *
+ * The buffer of the first entry is guaranteed to point to the beginning of
+ * the allocated region and is safe to use with rte_free().
+ *
+ * NULL buffers are silently ignored.
+ *
+ * Providing a NULL buffer in the first entry prevents this function from
+ * allocating any memory but has otherwise no effect on its behavior. In
+ * this case, the contents of remaining non-NULL buffers are updated with
+ * addresses relative to zero (i.e. offsets that would have been used during
+ * the allocation).
+ *
+ * @param[in] type
+ *   A string identifying the type of allocated objects (useful for debug
+ *   purposes, such as identifying the cause of a memory leak). Can be NULL.
+ * @param[in, out] vec
+ *   Description of objects to allocate memory for.
+ * @param cnt
+ *   Number of entries in @p vec.
+ *
+ * @return
+ *   Size in bytes of the allocated region including any padding. In case of
+ *   error, rte_errno is set, 0 is returned and NULL is stored in the
+ *   non-NULL buffers pointed by @p vec.
+ *
+ * @see struct mlx4_malloc_vec
+ * @see rte_malloc()
+ */
+size_t
+mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec,
+	     unsigned int cnt)
+{
+	return mlx4_mallocv_inline(type, vec, cnt, 0, SOCKET_ID_ANY);
+}
+
+/**
+ * Combines the semantics of mlx4_mallocv() with those of rte_zmalloc().
+ *
+ * @see mlx4_mallocv()
+ * @see rte_zmalloc()
+ */
+size_t
+mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec,
+	      unsigned int cnt)
+{
+	return mlx4_mallocv_inline(type, vec, cnt, 1, SOCKET_ID_ANY);
+}
+
+/**
+ * Socket-aware version of mlx4_mallocv().
+ *
+ * This function takes one additional parameter.
+ *
+ * @param socket
+ *   NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this
+ *   function will behave the same as mlx4_mallocv().
+ *
+ * @see mlx4_mallocv()
+ * @see rte_malloc_socket()
+ */
+size_t
+mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+		    unsigned int cnt, int socket)
+{
+	return mlx4_mallocv_inline(type, vec, cnt, 0, socket);
+}
+
+/**
+ * Combines the semantics of mlx4_mallocv_socket() with those of
+ * rte_zmalloc_socket().
+ *
+ * @see mlx4_mallocv_socket()
+ * @see rte_zmalloc_socket()
+ */
+size_t
+mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+		     unsigned int cnt, int socket)
+{
+	return mlx4_mallocv_inline(type, vec, cnt, 1, socket);
+}
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
new file mode 100644
index 00000000..dc529c9c
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -0,0 +1,133 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MLX4_UTILS_H_
+#define MLX4_UTILS_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "mlx4.h"
+
+#ifndef NDEBUG
+
+/*
+ * When debugging is enabled (NDEBUG not defined), file, line and function
+ * information replace the driver name (MLX4_DRIVER_NAME) in log messages.
+ */
+
+/** Return the file name part of a path. */
+static inline const char *
+pmd_drv_log_basename(const char *s)
+{
+	const char *n = s;
+
+	while (*n)
+		if (*(n++) == '/')
+			s = n;
+	return s;
+}
+
+#define PMD_DRV_LOG(level, ...) \
+	RTE_LOG(level, PMD, \
+		RTE_FMT("%s:%u: %s(): " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \
+			pmd_drv_log_basename(__FILE__), \
+			__LINE__, \
+			__func__, \
+			RTE_FMT_TAIL(__VA_ARGS__,)))
+#define DEBUG(...) PMD_DRV_LOG(DEBUG, __VA_ARGS__)
+#ifndef MLX4_PMD_DEBUG_BROKEN_VERBS
+#define claim_zero(...) assert((__VA_ARGS__) == 0)
+#else /* MLX4_PMD_DEBUG_BROKEN_VERBS */
+#define claim_zero(...) \
+	(void)(((__VA_ARGS__) == 0) || \
+		DEBUG("Assertion `(" # __VA_ARGS__ ") == 0' failed (IGNORED)."))
+#endif /* MLX4_PMD_DEBUG_BROKEN_VERBS */
+
+#else /* NDEBUG */
+
+/*
+ * Like assert(), DEBUG() becomes a no-op and claim_zero() does not perform
+ * any check when debugging is disabled.
+ */
+
+#define PMD_DRV_LOG(level, ...) \
+	RTE_LOG(level, PMD, \
+		RTE_FMT(MLX4_DRIVER_NAME ": " \
+			RTE_FMT_HEAD(__VA_ARGS__,) "\n", \
+		RTE_FMT_TAIL(__VA_ARGS__,)))
+#define DEBUG(...) (void)0
+#define claim_zero(...) (__VA_ARGS__)
+
+#endif /* NDEBUG */
+
+#define INFO(...) PMD_DRV_LOG(INFO, __VA_ARGS__)
+#define WARN(...) PMD_DRV_LOG(WARNING, __VA_ARGS__)
+#define ERROR(...) PMD_DRV_LOG(ERR, __VA_ARGS__)
+
+/** Allocate a buffer on the stack and fill it with a printf format string. */
+#define MKSTR(name, ...) \
+	char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \
+	\
+	snprintf(name, sizeof(name), __VA_ARGS__)
+
+/** Generate a string out of the provided arguments. */
+#define MLX4_STR(...) # __VA_ARGS__
+
+/** Similar to MLX4_STR() with enclosed macros expanded first. */
+#define MLX4_STR_EXPAND(...) MLX4_STR(__VA_ARGS__)
+
+/** Object description used with mlx4_mallocv() and similar functions. */
+struct mlx4_malloc_vec {
+	size_t align; /**< Alignment constraint (power of 2), 0 if unknown. */
+	size_t size; /**< Object size. */
+	void **addr; /**< Storage for allocation address. */
+};
+
+/* mlx4_utils.c */
+
+int mlx4_fd_set_non_blocking(int fd);
+size_t mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec,
+		    unsigned int cnt);
+size_t mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec,
+		     unsigned int cnt);
+size_t mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+			   unsigned int cnt, int socket);
+size_t mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+			    unsigned int cnt, int socket);
+
+#endif /* MLX4_UTILS_H_ */