New upstream version 17.11-rc3

Change-Id: I6a5baa40612fe0c20f30b5fa773a6cbbac63a685 Signed-off-by: Luca Boccassi <luca.boccassi@gmail.com>
author: Luca Boccassi <luca.boccassi@gmail.com> 2017-11-08 14:15:11 +0000
committer: Luca Boccassi <luca.boccassi@gmail.com> 2017-11-08 14:45:54 +0000
commit: 055c52583a2794da8ba1e85a48cce3832372b12f (patch)
tree: 8ceb1cb78fbb46a0f341f8ee24feb3c6b5540013 /drivers/net/mlx5
parent: f239aed5e674965691846e8ce3f187dd47523689 (diff)
25 files changed, 7167 insertions, 5522 deletions
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 8736de5d..a3984eb9 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -39,8 +39,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxq.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txq.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx.c
-ifeq ($(CONFIG_RTE_ARCH_X86_64),y)
-SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx_vec_sse.c
+ifneq ($(filter y,$(CONFIG_RTE_ARCH_X86_64) \
+		  $(CONFIG_RTE_ARCH_ARM64)),)
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx_vec.c
 endif
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_trigger.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_ethdev.c
@@ -49,9 +50,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxmode.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
-SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_fdir.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
 
 # Basic CFLAGS.
 CFLAGS += -O3
@@ -63,7 +64,10 @@ CFLAGS += -D_DEFAULT_SOURCE
 CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -Wno-strict-prototypes
-LDLIBS += -libverbs
+LDLIBS += -libverbs -lmlx5
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_pci
 
 # A few warnings cannot be avoided in external headers.
 CFLAGS += -Wno-error=cast-qual
@@ -104,24 +108,24 @@ mlx5_autoconf.h.new: FORCE
 mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
 	$Q sh -- '$<' '$@' \
-		HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP \
-		infiniband/verbs_exp.h \
-		enum IBV_EXP_FLOW_SPEC_ACTION_DROP \
+		HAVE_IBV_DEVICE_VXLAN_SUPPORT \
+		infiniband/verbs.h \
+		enum IBV_DEVICE_VXLAN_SUPPORT \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
-		HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
-		infiniband/verbs_exp.h \
-		enum IBV_EXP_CQ_COMPRESSED_CQE \
+		HAVE_IBV_WQ_FLAG_RX_END_PADDING \
+		infiniband/verbs.h \
+		enum IBV_WQ_FLAG_RX_END_PADDING \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
-		HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE \
-		infiniband/mlx5_hw.h \
-		enum MLX5_ETH_VLAN_INLINE_HEADER_SIZE \
+		HAVE_IBV_MLX5_MOD_MPW \
+		infiniband/mlx5dv.h \
+		enum MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
-		HAVE_VERBS_MLX5_OPCODE_TSO \
-		infiniband/mlx5_hw.h \
-		enum MLX5_OPCODE_TSO \
+		HAVE_IBV_MLX5_MOD_CQE_128B_COMP \
+		infiniband/mlx5dv.h \
+		enum MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
 		HAVE_ETHTOOL_LINK_MODE_25G \
@@ -139,9 +143,9 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
-		HAVE_UPDATE_CQ_CI \
-		infiniband/mlx5_hw.h \
-		func ibv_mlx5_exp_update_cq_ci \
+		HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT \
+		infiniband/verbs.h \
+		enum IBV_FLOW_SPEC_ACTION_COUNT \
 		$(AUTOCONF_OUTPUT)
 
 # Create mlx5_autoconf.h or update it in case it differs from the new one.
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index b7e50463..0548d17a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -50,19 +50,13 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_malloc.h>
 #include <rte_ethdev.h>
 #include <rte_ethdev_pci.h>
 #include <rte_pci.h>
+#include <rte_bus_pci.h>
 #include <rte_common.h>
 #include <rte_kvargs.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -103,6 +97,15 @@
 /* Default PMD specific parameter value. */
 #define MLX5_ARG_UNSET (-1)
 
+#ifndef HAVE_IBV_MLX5_MOD_MPW
+#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
+#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
+#endif
+
+#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
+#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
+#endif
+
 struct mlx5_args {
 	int cqe_comp;
 	int txq_inline;
@@ -134,6 +137,52 @@ mlx5_getenv_int(const char *name)
 }
 
 /**
+ * Verbs callback to allocate a memory. This function should allocate the space
+ * according to the size provided residing inside a huge page.
+ * Please note that all allocation must respect the alignment from libmlx5
+ * (i.e. currently sysconf(_SC_PAGESIZE)).
+ *
+ * @param[in] size
+ *   The size in bytes of the memory to allocate.
+ * @param[in] data
+ *   A pointer to the callback data.
+ *
+ * @return
+ *   a pointer to the allocate space.
+ */
+static void *
+mlx5_alloc_verbs_buf(size_t size, void *data)
+{
+	struct priv *priv = data;
+	void *ret;
+	size_t alignment = sysconf(_SC_PAGESIZE);
+
+	assert(data != NULL);
+	assert(!mlx5_is_secondary());
+	ret = rte_malloc_socket(__func__, size, alignment,
+				priv->dev->device->numa_node);
+	DEBUG("Extern alloc size: %lu, align: %lu: %p", size, alignment, ret);
+	return ret;
+}
+
+/**
+ * Verbs callback to free a memory.
+ *
+ * @param[in] ptr
+ *   A pointer to the memory to free.
+ * @param[in] data
+ *   A pointer to the callback data.
+ */
+static void
+mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
+{
+	assert(data != NULL);
+	assert(!mlx5_is_secondary());
+	DEBUG("Extern free request: %p", ptr);
+	rte_free(ptr);
+}
+
+/**
  * DPDK callback to close the device.
  *
  * Destroy all queues and objects, free memory.
@@ -146,6 +195,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 {
 	struct priv *priv = mlx5_get_priv(dev);
 	unsigned int i;
+	int ret;
 
 	priv_lock(priv);
 	DEBUG("%p: closing device \"%s\"",
@@ -153,48 +203,23 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	      ((priv->ctx != NULL) ? priv->ctx->device->name : ""));
 	/* In case mlx5_dev_stop() has not been called. */
 	priv_dev_interrupt_handler_uninstall(priv, dev);
-	priv_special_flow_disable_all(priv);
-	priv_mac_addrs_disable(priv);
-	priv_destroy_hash_rxqs(priv);
-
-	/* Remove flow director elements. */
-	priv_fdir_disable(priv);
-	priv_fdir_delete_filters_list(priv);
-
+	priv_dev_traffic_disable(priv, dev);
 	/* Prevent crashes when queues are still in use. */
 	dev->rx_pkt_burst = removed_rx_burst;
 	dev->tx_pkt_burst = removed_tx_burst;
 	if (priv->rxqs != NULL) {
 		/* XXX race condition if mlx5_rx_burst() is still running. */
 		usleep(1000);
-		for (i = 0; (i != priv->rxqs_n); ++i) {
-			struct rxq *rxq = (*priv->rxqs)[i];
-			struct rxq_ctrl *rxq_ctrl;
-
-			if (rxq == NULL)
-				continue;
-			rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-			(*priv->rxqs)[i] = NULL;
-			rxq_cleanup(rxq_ctrl);
-			rte_free(rxq_ctrl);
-		}
+		for (i = 0; (i != priv->rxqs_n); ++i)
+			mlx5_priv_rxq_release(priv, i);
 		priv->rxqs_n = 0;
 		priv->rxqs = NULL;
 	}
 	if (priv->txqs != NULL) {
 		/* XXX race condition if mlx5_tx_burst() is still running. */
 		usleep(1000);
-		for (i = 0; (i != priv->txqs_n); ++i) {
-			struct txq *txq = (*priv->txqs)[i];
-			struct txq_ctrl *txq_ctrl;
-
-			if (txq == NULL)
-				continue;
-			txq_ctrl = container_of(txq, struct txq_ctrl, txq);
-			(*priv->txqs)[i] = NULL;
-			txq_cleanup(txq_ctrl);
-			rte_free(txq_ctrl);
-		}
+		for (i = 0; (i != priv->txqs_n); ++i)
+			mlx5_priv_txq_release(priv, i);
 		priv->txqs_n = 0;
 		priv->txqs = NULL;
 	}
@@ -204,18 +229,40 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 		claim_zero(ibv_close_device(priv->ctx));
 	} else
 		assert(priv->ctx == NULL);
-	if (priv->rss_conf != NULL) {
-		for (i = 0; (i != hash_rxq_init_n); ++i)
-			rte_free((*priv->rss_conf)[i]);
-		rte_free(priv->rss_conf);
-	}
+	if (priv->rss_conf.rss_key != NULL)
+		rte_free(priv->rss_conf.rss_key);
 	if (priv->reta_idx != NULL)
 		rte_free(priv->reta_idx);
+	priv_socket_uninit(priv);
+	ret = mlx5_priv_hrxq_ibv_verify(priv);
+	if (ret)
+		WARN("%p: some Hash Rx queue still remain", (void *)priv);
+	ret = mlx5_priv_ind_table_ibv_verify(priv);
+	if (ret)
+		WARN("%p: some Indirection table still remain", (void *)priv);
+	ret = mlx5_priv_rxq_ibv_verify(priv);
+	if (ret)
+		WARN("%p: some Verbs Rx queue still remain", (void *)priv);
+	ret = mlx5_priv_rxq_verify(priv);
+	if (ret)
+		WARN("%p: some Rx Queues still remain", (void *)priv);
+	ret = mlx5_priv_txq_ibv_verify(priv);
+	if (ret)
+		WARN("%p: some Verbs Tx queue still remain", (void *)priv);
+	ret = mlx5_priv_txq_verify(priv);
+	if (ret)
+		WARN("%p: some Tx Queues still remain", (void *)priv);
+	ret = priv_flow_verify(priv);
+	if (ret)
+		WARN("%p: some flows still remain", (void *)priv);
+	ret = priv_mr_verify(priv);
+	if (ret)
+		WARN("%p: some Memory Region still remain", (void *)priv);
 	priv_unlock(priv);
 	memset(priv, 0, sizeof(*priv));
 }
 
-static const struct eth_dev_ops mlx5_dev_ops = {
+const struct eth_dev_ops mlx5_dev_ops = {
 	.dev_configure = mlx5_dev_configure,
 	.dev_start = mlx5_dev_start,
 	.dev_stop = mlx5_dev_stop,
@@ -254,10 +301,55 @@ static const struct eth_dev_ops mlx5_dev_ops = {
 	.filter_ctrl = mlx5_dev_filter_ctrl,
 	.rx_descriptor_status = mlx5_rx_descriptor_status,
 	.tx_descriptor_status = mlx5_tx_descriptor_status,
-#ifdef HAVE_UPDATE_CQ_CI
 	.rx_queue_intr_enable = mlx5_rx_intr_enable,
 	.rx_queue_intr_disable = mlx5_rx_intr_disable,
-#endif
+};
+
+static const struct eth_dev_ops mlx5_dev_sec_ops = {
+	.stats_get = mlx5_stats_get,
+	.stats_reset = mlx5_stats_reset,
+	.xstats_get = mlx5_xstats_get,
+	.xstats_reset = mlx5_xstats_reset,
+	.xstats_get_names = mlx5_xstats_get_names,
+	.dev_infos_get = mlx5_dev_infos_get,
+	.rx_descriptor_status = mlx5_rx_descriptor_status,
+	.tx_descriptor_status = mlx5_tx_descriptor_status,
+};
+
+/* Available operators in flow isolated mode. */
+const struct eth_dev_ops mlx5_dev_ops_isolate = {
+	.dev_configure = mlx5_dev_configure,
+	.dev_start = mlx5_dev_start,
+	.dev_stop = mlx5_dev_stop,
+	.dev_set_link_down = mlx5_set_link_down,
+	.dev_set_link_up = mlx5_set_link_up,
+	.dev_close = mlx5_dev_close,
+	.link_update = mlx5_link_update,
+	.stats_get = mlx5_stats_get,
+	.stats_reset = mlx5_stats_reset,
+	.xstats_get = mlx5_xstats_get,
+	.xstats_reset = mlx5_xstats_reset,
+	.xstats_get_names = mlx5_xstats_get_names,
+	.dev_infos_get = mlx5_dev_infos_get,
+	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
+	.vlan_filter_set = mlx5_vlan_filter_set,
+	.rx_queue_setup = mlx5_rx_queue_setup,
+	.tx_queue_setup = mlx5_tx_queue_setup,
+	.rx_queue_release = mlx5_rx_queue_release,
+	.tx_queue_release = mlx5_tx_queue_release,
+	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
+	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
+	.mac_addr_remove = mlx5_mac_addr_remove,
+	.mac_addr_add = mlx5_mac_addr_add,
+	.mac_addr_set = mlx5_mac_addr_set,
+	.mtu_set = mlx5_dev_set_mtu,
+	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
+	.vlan_offload_set = mlx5_vlan_offload_set,
+	.filter_ctrl = mlx5_dev_filter_ctrl,
+	.rx_descriptor_status = mlx5_rx_descriptor_status,
+	.tx_descriptor_status = mlx5_tx_descriptor_status,
+	.rx_queue_intr_enable = mlx5_rx_intr_enable,
+	.rx_queue_intr_disable = mlx5_rx_intr_disable,
 };
 
 static struct {
@@ -449,12 +541,17 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 	struct ibv_device *ibv_dev;
 	int err = 0;
 	struct ibv_context *attr_ctx = NULL;
-	struct ibv_device_attr device_attr;
+	struct ibv_device_attr_ex device_attr;
 	unsigned int sriov;
 	unsigned int mps;
-	unsigned int tunnel_en;
+	unsigned int cqe_comp;
+	unsigned int tunnel_en = 0;
 	int idx;
 	int i;
+	struct mlx5dv_context attrs_out;
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+	struct ibv_counter_set_description cs_desc;
+#endif
 
 	(void)pci_drv;
 	assert(pci_drv == &mlx5_driver);
@@ -500,34 +597,24 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
 		      (pci_dev->id.device_id ==
 		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		/*
-		 * Multi-packet send is supported by ConnectX-4 Lx PF as well
-		 * as all ConnectX-5 devices.
-		 */
 		switch (pci_dev->id.device_id) {
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
 			tunnel_en = 1;
-			mps = MLX5_MPW_DISABLED;
 			break;
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
-			mps = MLX5_MPW;
-			break;
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
 			tunnel_en = 1;
-			mps = MLX5_MPW_ENHANCED;
 			break;
 		default:
-			mps = MLX5_MPW_DISABLED;
+			break;
 		}
 		INFO("PCI information matches, using device \"%s\""
-		     " (SR-IOV: %s, %sMPS: %s)",
+		     " (SR-IOV: %s)",
 		     list[i]->name,
-		     sriov ? "true" : "false",
-		     mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
-		     mps != MLX5_MPW_DISABLED ? "true" : "false");
+		     sriov ? "true" : "false");
 		attr_ctx = ibv_open_device(list[i]);
 		err = errno;
 		break;
@@ -548,11 +635,33 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 	ibv_dev = list[i];
 
 	DEBUG("device opened");
-	if (ibv_query_device(attr_ctx, &device_attr))
+	/*
+	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
+	 * as all ConnectX-5 devices.
+	 */
+	mlx5dv_query_device(attr_ctx, &attrs_out);
+	if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+		if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+			DEBUG("Enhanced MPW is supported");
+			mps = MLX5_MPW_ENHANCED;
+		} else {
+			DEBUG("MPW is supported");
+			mps = MLX5_MPW;
+		}
+	} else {
+		DEBUG("MPW isn't supported");
+		mps = MLX5_MPW_DISABLED;
+	}
+	if (RTE_CACHE_LINE_SIZE == 128 &&
+	    !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
+		cqe_comp = 0;
+	else
+		cqe_comp = 1;
+	if (ibv_query_device_ex(attr_ctx, NULL, &device_attr))
 		goto error;
-	INFO("%u port(s) detected", device_attr.phys_port_cnt);
+	INFO("%u port(s) detected", device_attr.orig_attr.phys_port_cnt);
 
-	for (i = 0; i < device_attr.phys_port_cnt; i++) {
+	for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
 		uint32_t port = i + 1; /* ports are indexed from one */
 		uint32_t test = (1 << i);
 		struct ibv_context *ctx = NULL;
@@ -560,9 +669,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
 		struct rte_eth_dev *eth_dev;
-		struct ibv_exp_device_attr exp_device_attr;
+		struct ibv_device_attr_ex device_attr_ex;
 		struct ether_addr mac;
 		uint16_t num_vfs = 0;
+		struct ibv_device_attr_ex device_attr;
 		struct mlx5_args args = {
 			.cqe_comp = MLX5_ARG_UNSET,
 			.txq_inline = MLX5_ARG_UNSET,
@@ -575,20 +685,49 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			.rx_vec_en = MLX5_ARG_UNSET,
 		};
 
-		exp_device_attr.comp_mask =
-			IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
-			IBV_EXP_DEVICE_ATTR_RX_HASH |
-			IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS |
-			IBV_EXP_DEVICE_ATTR_RX_PAD_END_ALIGN |
-			IBV_EXP_DEVICE_ATTR_TSO_CAPS |
-			0;
+		mlx5_dev[idx].ports |= test;
+
+		if (mlx5_is_secondary()) {
+			/* from rte_ethdev.c */
+			char name[RTE_ETH_NAME_MAX_LEN];
+
+			snprintf(name, sizeof(name), "%s port %u",
+				 ibv_get_device_name(ibv_dev), port);
+			eth_dev = rte_eth_dev_attach_secondary(name);
+			if (eth_dev == NULL) {
+				ERROR("can not attach rte ethdev");
+				err = ENOMEM;
+				goto error;
+			}
+			eth_dev->device = &pci_dev->device;
+			eth_dev->dev_ops = &mlx5_dev_sec_ops;
+			priv = eth_dev->data->dev_private;
+			/* Receive command fd from primary process */
+			err = priv_socket_connect(priv);
+			if (err < 0) {
+				err = -err;
+				goto error;
+			}
+			/* Remap UAR for Tx queues. */
+			err = priv_tx_uar_remap(priv, err);
+			if (err < 0) {
+				err = -err;
+				goto error;
+			}
+			priv_dev_select_rx_function(priv, eth_dev);
+			priv_dev_select_tx_function(priv, eth_dev);
+			continue;
+		}
 
 		DEBUG("using port %u (%08" PRIx32 ")", port, test);
 
 		ctx = ibv_open_device(ibv_dev);
-		if (ctx == NULL)
+		if (ctx == NULL) {
+			err = ENODEV;
 			goto port_error;
+		}
 
+		ibv_query_device_ex(ctx, NULL, &device_attr);
 		/* Check port status. */
 		err = ibv_query_port(ctx, port, &port_attr);
 		if (err) {
@@ -599,6 +738,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
 			ERROR("port %d is not configured in Ethernet mode",
 			      port);
+			err = EINVAL;
 			goto port_error;
 		}
 
@@ -628,12 +768,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		}
 
 		priv->ctx = ctx;
+		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
+			sizeof(priv->ibdev_path));
 		priv->device_attr = device_attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
 		priv->mps = mps; /* Enable MPW by default if supported. */
-		priv->cqe_comp = 1; /* Enable compression by default. */
+		priv->cqe_comp = cqe_comp;
 		priv->tunnel_en = tunnel_en;
 		/* Enable vector by default if supported. */
 		priv->tx_vec_en = 1;
@@ -645,25 +787,33 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			goto port_error;
 		}
 		mlx5_args_assign(priv, &args);
-		if (ibv_exp_query_device(ctx, &exp_device_attr)) {
-			ERROR("ibv_exp_query_device() failed");
+		if (ibv_query_device_ex(ctx, NULL, &device_attr_ex)) {
+			ERROR("ibv_query_device_ex() failed");
 			goto port_error;
 		}
 
 		priv->hw_csum =
-			((exp_device_attr.exp_device_cap_flags &
-			  IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT) &&
-			 (exp_device_attr.exp_device_cap_flags &
-			  IBV_EXP_DEVICE_RX_CSUM_IP_PKT));
+			!!(device_attr_ex.device_cap_flags_ex &
+			   IBV_DEVICE_RAW_IP_CSUM);
 		DEBUG("checksum offloading is %ssupported",
 		      (priv->hw_csum ? "" : "not "));
 
+#ifdef HAVE_IBV_DEVICE_VXLAN_SUPPORT
 		priv->hw_csum_l2tun = !!(exp_device_attr.exp_device_cap_flags &
-					 IBV_EXP_DEVICE_VXLAN_SUPPORT);
+					 IBV_DEVICE_VXLAN_SUPPORT);
+#endif
 		DEBUG("L2 tunnel checksum offloads are %ssupported",
 		      (priv->hw_csum_l2tun ? "" : "not "));
 
-		priv->ind_table_max_size = exp_device_attr.rx_hash_caps.max_rwq_indirection_table_size;
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+		priv->counter_set_supported = !!(device_attr.max_counter_sets);
+		ibv_describe_counter_set(ctx, 0, &cs_desc);
+		DEBUG("counter type = %d, num of cs = %ld, attributes = %d",
+		      cs_desc.counter_type, cs_desc.num_of_cs,
+		      cs_desc.attributes);
+#endif
+		priv->ind_table_max_size =
+			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (priv->ind_table_max_size >
@@ -671,29 +821,32 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			priv->ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DEBUG("maximum RX indirection table size is %u",
 		      priv->ind_table_max_size);
-		priv->hw_vlan_strip = !!(exp_device_attr.wq_vlan_offloads_cap &
-					 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP);
+		priv->hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
+					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DEBUG("VLAN stripping is %ssupported",
 		      (priv->hw_vlan_strip ? "" : "not "));
 
-		priv->hw_fcs_strip = !!(exp_device_attr.exp_device_cap_flags &
-					IBV_EXP_DEVICE_SCATTER_FCS);
+		priv->hw_fcs_strip =
+				!!(device_attr_ex.orig_attr.device_cap_flags &
+				IBV_WQ_FLAGS_SCATTER_FCS);
 		DEBUG("FCS stripping configuration is %ssupported",
 		      (priv->hw_fcs_strip ? "" : "not "));
 
-		priv->hw_padding = !!exp_device_attr.rx_pad_end_addr_align;
+#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
+		priv->hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
+#endif
 		DEBUG("hardware RX end alignment padding is %ssupported",
 		      (priv->hw_padding ? "" : "not "));
 
 		priv_get_num_vfs(priv, &num_vfs);
 		priv->sriov = (num_vfs || sriov);
 		priv->tso = ((priv->tso) &&
-			    (exp_device_attr.tso_caps.max_tso > 0) &&
-			    (exp_device_attr.tso_caps.supported_qpts &
-			    (1 << IBV_QPT_RAW_ETH)));
+			    (device_attr_ex.tso_caps.max_tso > 0) &&
+			    (device_attr_ex.tso_caps.supported_qpts &
+			    (1 << IBV_QPT_RAW_PACKET)));
 		if (priv->tso)
 			priv->max_tso_payload_sz =
-				exp_device_attr.tso_caps.max_tso;
+				device_attr_ex.tso_caps.max_tso;
 		if (priv->mps && !mps) {
 			ERROR("multi-packet send not supported on this device"
 			      " (" MLX5_TXQ_MPW_EN ")");
@@ -718,23 +871,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 				priv->txq_inline = MLX5_WQE_SIZE_MAX -
 						   MLX5_WQE_SIZE;
 		}
-		/* Allocate and register default RSS hash keys. */
-		priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
-					    sizeof((*priv->rss_conf)[0]), 0);
-		if (priv->rss_conf == NULL) {
-			err = ENOMEM;
-			goto port_error;
+		if (priv->cqe_comp && !cqe_comp) {
+			WARN("Rx CQE compression isn't supported");
+			priv->cqe_comp = 0;
 		}
-		err = rss_hash_rss_conf_new_key(priv,
-						rss_hash_default_key,
-						rss_hash_default_key_len,
-						ETH_RSS_PROTO_MASK);
-		if (err)
-			goto port_error;
 		/* Configure the first MAC address by default. */
 		if (priv_get_mac(priv, &mac.addr_bytes)) {
 			ERROR("cannot get MAC address, is mlx5_en loaded?"
 			      " (errno: %s)", strerror(errno));
+			err = ENODEV;
 			goto port_error;
 		}
 		INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
@@ -742,14 +887,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		     mac.addr_bytes[0], mac.addr_bytes[1],
 		     mac.addr_bytes[2], mac.addr_bytes[3],
 		     mac.addr_bytes[4], mac.addr_bytes[5]);
-		/* Register MAC address. */
-		claim_zero(priv_mac_addr_add(priv, 0,
-					     (const uint8_t (*)[ETHER_ADDR_LEN])
-					     mac.addr_bytes));
-		/* Initialize FD filters list. */
-		err = fdir_init_filters_list(priv);
-		if (err)
-			goto port_error;
 #ifndef NDEBUG
 		{
 			char ifname[IF_NAMESIZE];
@@ -778,44 +915,26 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			err = ENOMEM;
 			goto port_error;
 		}
-
-		/* Secondary processes have to use local storage for their
-		 * private data as well as a copy of eth_dev->data, but this
-		 * pointer must not be modified before burst functions are
-		 * actually called. */
-		if (mlx5_is_secondary()) {
-			struct mlx5_secondary_data *sd =
-				&mlx5_secondary_data[eth_dev->data->port_id];
-			sd->primary_priv = eth_dev->data->dev_private;
-			if (sd->primary_priv == NULL) {
-				ERROR("no private data for port %u",
-						eth_dev->data->port_id);
-				err = EINVAL;
-				goto port_error;
-			}
-			sd->shared_dev_data = eth_dev->data;
-			rte_spinlock_init(&sd->lock);
-			memcpy(sd->data.name, sd->shared_dev_data->name,
-				   sizeof(sd->data.name));
-			sd->data.dev_private = priv;
-			sd->data.rx_mbuf_alloc_failed = 0;
-			sd->data.mtu = ETHER_MTU;
-			sd->data.port_id = sd->shared_dev_data->port_id;
-			sd->data.mac_addrs = priv->mac;
-			eth_dev->tx_pkt_burst = mlx5_tx_burst_secondary_setup;
-			eth_dev->rx_pkt_burst = mlx5_rx_burst_secondary_setup;
-		} else {
-			eth_dev->data->dev_private = priv;
-			eth_dev->data->mac_addrs = priv->mac;
-		}
-
+		eth_dev->data->dev_private = priv;
+		eth_dev->data->mac_addrs = priv->mac;
 		eth_dev->device = &pci_dev->device;
 		rte_eth_copy_pci_info(eth_dev, pci_dev);
-		eth_dev->data->dev_flags |= RTE_ETH_DEV_DETACHABLE;
 		eth_dev->device->driver = &mlx5_driver.driver;
 		priv->dev = eth_dev;
 		eth_dev->dev_ops = &mlx5_dev_ops;
+		/* Register MAC address. */
+		claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
 		TAILQ_INIT(&priv->flows);
+		TAILQ_INIT(&priv->ctrl_flows);
+
+		/* Hint libmlx5 to use PMD allocator for data plane resources */
+		struct mlx5dv_ctx_allocators alctr = {
+			.alloc = &mlx5_alloc_verbs_buf,
+			.free = &mlx5_free_verbs_buf,
+			.data = priv,
+		};
+		mlx5dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+					(void *)((uintptr_t)&alctr));
 
 		/* Bring Ethernet device up. */
 		DEBUG("forcing Ethernet interface up");
@@ -824,10 +943,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		continue;
 
 port_error:
-		if (priv) {
-			rte_free(priv->rss_conf);
+		if (priv)
 			rte_free(priv);
-		}
 		if (pd)
 			claim_zero(ibv_dealloc_pd(pd));
 		if (ctx)
@@ -901,7 +1018,7 @@ static struct rte_pci_driver mlx5_driver = {
 	},
 	.id_table = mlx5_pci_id_map,
 	.probe = mlx5_pci_probe,
-	.drv_flags = RTE_PCI_DRV_INTR_LSC,
+	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
 };
 
 /**
@@ -920,6 +1037,9 @@ rte_mlx5_pmd_init(void)
 	 * using this PMD, which is not supported in forked processes.
 	 */
 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
+	/* Match the size of Rx completion entry to the size of a cacheline. */
+	if (RTE_CACHE_LINE_SIZE == 128)
+		setenv("MLX5_CQE_SIZE", "128", 0);
 	ibv_fork_init();
 	rte_pci_register(&mlx5_driver);
 }
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 43c53841..e6a69b82 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -39,6 +39,7 @@
 #include <limits.h>
 #include <net/if.h>
 #include <netinet/in.h>
+#include <sys/queue.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -50,10 +51,6 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_pci.h>
 #include <rte_ether.h>
 #include <rte_ethdev.h>
@@ -61,20 +58,12 @@
 #include <rte_interrupts.h>
 #include <rte_errno.h>
 #include <rte_flow.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5_utils.h"
 #include "mlx5_rxtx.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
 
-#if !defined(HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE) || \
-	!defined(HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE)
-#error Mellanox OFED >= 3.3 is required, please refer to the documentation.
-#endif
-
 enum {
 	PCI_VENDOR_ID_MELLANOX = 0x15b3,
 };
@@ -98,26 +87,21 @@ struct mlx5_xstats_ctrl {
 	uint64_t base[MLX5_MAX_XSTATS];
 };
 
+/* Flow list . */
+TAILQ_HEAD(mlx5_flows, rte_flow);
+
 struct priv {
-	struct rte_eth_dev *dev; /* Ethernet device. */
+	struct rte_eth_dev *dev; /* Ethernet device of master process. */
 	struct ibv_context *ctx; /* Verbs context. */
-	struct ibv_device_attr device_attr; /* Device properties. */
+	struct ibv_device_attr_ex device_attr; /* Device properties. */
 	struct ibv_pd *pd; /* Protection Domain. */
-	/*
-	 * MAC addresses array and configuration bit-field.
-	 * An extra entry that cannot be modified by the DPDK is reserved
-	 * for broadcast frames (destination MAC address ff:ff:ff:ff:ff:ff).
-	 */
-	struct ether_addr mac[MLX5_MAX_MAC_ADDRESSES];
-	BITFIELD_DECLARE(mac_configured, uint32_t, MLX5_MAX_MAC_ADDRESSES);
+	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
+	struct ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */
 	uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */
 	unsigned int vlan_filter_n; /* Number of configured VLAN filters. */
 	/* Device properties. */
 	uint16_t mtu; /* Configured MTU. */
 	uint8_t port; /* Physical port number. */
-	unsigned int started:1; /* Device started, flows enabled. */
-	unsigned int promisc_req:1; /* Promiscuous mode requested. */
-	unsigned int allmulti_req:1; /* All multicast mode requested. */
 	unsigned int hw_csum:1; /* Checksum offload is supported. */
 	unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */
 	unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */
@@ -133,6 +117,7 @@ struct priv {
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
 	unsigned int tx_vec_en:1; /* Whether Tx vector is enabled. */
 	unsigned int rx_vec_en:1; /* Whether Rx vector is enabled. */
+	unsigned int counter_set_supported:1; /* Counter set is supported. */
 	/* Whether Tx offloads for tunneled packets are supported. */
 	unsigned int max_tso_payload_sz; /* Maximum TCP payload for TSO. */
 	unsigned int txq_inline; /* Maximum packet size for inlining. */
@@ -141,38 +126,31 @@ struct priv {
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
-	struct rxq *(*rxqs)[]; /* RX queues. */
-	struct txq *(*txqs)[]; /* TX queues. */
-	/* Indirection tables referencing all RX WQs. */
-	struct ibv_exp_rwq_ind_table *(*ind_tables)[];
-	unsigned int ind_tables_n; /* Number of indirection tables. */
+	struct mlx5_rxq_data *(*rxqs)[]; /* RX queues. */
+	struct mlx5_txq_data *(*txqs)[]; /* TX queues. */
 	unsigned int ind_table_max_size; /* Maximum indirection table size. */
-	/* Hash RX QPs feeding the indirection table. */
-	struct hash_rxq (*hash_rxqs)[];
-	unsigned int hash_rxqs_n; /* Hash RX QPs array size. */
-	/* RSS configuration array indexed by hash RX queue type. */
-	struct rte_eth_rss_conf *(*rss_conf)[];
-	uint64_t rss_hf; /* RSS DPDK bit field of active RSS. */
+	struct rte_eth_rss_conf rss_conf; /* RSS configuration. */
 	struct rte_intr_handle intr_handle; /* Interrupt handler. */
 	unsigned int (*reta_idx)[]; /* RETA index table. */
 	unsigned int reta_idx_n; /* RETA index size. */
-	struct fdir_filter_list *fdir_filter_list; /* Flow director rules. */
-	struct fdir_queue *fdir_drop_queue; /* Flow director drop queue. */
-	struct rte_flow_drop *flow_drop_queue; /* Flow drop queue. */
-	TAILQ_HEAD(mlx5_flows, rte_flow) flows; /* RTE Flow rules. */
+	struct mlx5_hrxq_drop *flow_drop_queue; /* Flow drop queue. */
+	struct mlx5_flows flows; /* RTE Flow rules. */
+	struct mlx5_flows ctrl_flows; /* Control flow rules. */
+	LIST_HEAD(mr, mlx5_mr) mr; /* Memory region. */
+	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
+	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
+	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
+	LIST_HEAD(txq, mlx5_txq_ctrl) txqsctrl; /* DPDK Tx queues. */
+	LIST_HEAD(txqibv, mlx5_txq_ibv) txqsibv; /* Verbs Tx queues. */
+	/* Verbs Indirection tables. */
+	LIST_HEAD(ind_tables, mlx5_ind_table_ibv) ind_tbls;
 	uint32_t link_speed_capa; /* Link speed capabilities. */
 	struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
 	rte_spinlock_t lock; /* Lock for control functions. */
+	int primary_socket; /* Unix socket for primary process. */
+	struct rte_intr_handle intr_handle_socket; /* Interrupt handler. */
 };
 
-/* Local storage for secondary process data. */
-struct mlx5_secondary_data {
-	struct rte_eth_dev_data data; /* Local device data. */
-	struct priv *primary_priv; /* Private structure from primary. */
-	struct rte_eth_dev_data *shared_dev_data; /* Shared device data. */
-	rte_spinlock_t lock; /* Port configuration lock. */
-} mlx5_secondary_data[RTE_MAX_ETHPORTS];
-
 /**
  * Lock private structure to protect it from concurrent access in the
  * control path.
@@ -228,28 +206,19 @@ void priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *);
 void priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
 int mlx5_set_link_down(struct rte_eth_dev *dev);
 int mlx5_set_link_up(struct rte_eth_dev *dev);
-struct priv *mlx5_secondary_data_setup(struct priv *priv);
-void priv_select_tx_function(struct priv *);
-void priv_select_rx_function(struct priv *);
+void priv_dev_select_tx_function(struct priv *priv, struct rte_eth_dev *dev);
+void priv_dev_select_rx_function(struct priv *priv, struct rte_eth_dev *dev);
 
 /* mlx5_mac.c */
 
 int priv_get_mac(struct priv *, uint8_t (*)[ETHER_ADDR_LEN]);
-void hash_rxq_mac_addrs_del(struct hash_rxq *);
-void priv_mac_addrs_disable(struct priv *);
 void mlx5_mac_addr_remove(struct rte_eth_dev *, uint32_t);
-int hash_rxq_mac_addrs_add(struct hash_rxq *);
-int priv_mac_addr_add(struct priv *, unsigned int,
-		      const uint8_t (*)[ETHER_ADDR_LEN]);
-int priv_mac_addrs_enable(struct priv *);
 int mlx5_mac_addr_add(struct rte_eth_dev *, struct ether_addr *, uint32_t,
 		      uint32_t);
 void mlx5_mac_addr_set(struct rte_eth_dev *, struct ether_addr *);
 
 /* mlx5_rss.c */
 
-int rss_hash_rss_conf_new_key(struct priv *, const uint8_t *, unsigned int,
-			      uint64_t);
 int mlx5_rss_hash_update(struct rte_eth_dev *, struct rte_eth_rss_conf *);
 int mlx5_rss_hash_conf_get(struct rte_eth_dev *, struct rte_eth_rss_conf *);
 int priv_rss_reta_index_resize(struct priv *, unsigned int);
@@ -260,10 +229,6 @@ int mlx5_dev_rss_reta_update(struct rte_eth_dev *,
 
 /* mlx5_rxmode.c */
 
-int priv_special_flow_enable(struct priv *, enum hash_rxq_flow_type);
-void priv_special_flow_disable(struct priv *, enum hash_rxq_flow_type);
-int priv_special_flow_enable_all(struct priv *);
-void priv_special_flow_disable_all(struct priv *);
 void mlx5_promiscuous_enable(struct rte_eth_dev *);
 void mlx5_promiscuous_disable(struct rte_eth_dev *);
 void mlx5_allmulticast_enable(struct rte_eth_dev *);
@@ -272,7 +237,7 @@ void mlx5_allmulticast_disable(struct rte_eth_dev *);
 /* mlx5_stats.c */
 
 void priv_xstats_init(struct priv *);
-void mlx5_stats_get(struct rte_eth_dev *, struct rte_eth_stats *);
+int mlx5_stats_get(struct rte_eth_dev *, struct rte_eth_stats *);
 void mlx5_stats_reset(struct rte_eth_dev *);
 int mlx5_xstats_get(struct rte_eth_dev *,
 		    struct rte_eth_xstat *, unsigned int);
@@ -283,26 +248,22 @@ int mlx5_xstats_get_names(struct rte_eth_dev *,
 /* mlx5_vlan.c */
 
 int mlx5_vlan_filter_set(struct rte_eth_dev *, uint16_t, int);
-void mlx5_vlan_offload_set(struct rte_eth_dev *, int);
+int mlx5_vlan_offload_set(struct rte_eth_dev *, int);
 void mlx5_vlan_strip_queue_set(struct rte_eth_dev *, uint16_t, int);
 
 /* mlx5_trigger.c */
 
 int mlx5_dev_start(struct rte_eth_dev *);
 void mlx5_dev_stop(struct rte_eth_dev *);
+int priv_dev_traffic_enable(struct priv *, struct rte_eth_dev *);
+int priv_dev_traffic_disable(struct priv *, struct rte_eth_dev *);
+int priv_dev_traffic_restart(struct priv *, struct rte_eth_dev *);
+int mlx5_traffic_restart(struct rte_eth_dev *);
 
-/* mlx5_fdir.c */
+/* mlx5_flow.c */
 
-void priv_fdir_queue_destroy(struct priv *, struct fdir_queue *);
-int fdir_init_filters_list(struct priv *);
-void priv_fdir_delete_filters_list(struct priv *);
-void priv_fdir_disable(struct priv *);
-void priv_fdir_enable(struct priv *);
 int mlx5_dev_filter_ctrl(struct rte_eth_dev *, enum rte_filter_type,
 			 enum rte_filter_op, void *);
-
-/* mlx5_flow.c */
-
 int mlx5_flow_validate(struct rte_eth_dev *, const struct rte_flow_attr *,
 		       const struct rte_flow_item [],
 		       const struct rte_flow_action [],
@@ -314,10 +275,35 @@ struct rte_flow *mlx5_flow_create(struct rte_eth_dev *,
 				  struct rte_flow_error *);
 int mlx5_flow_destroy(struct rte_eth_dev *, struct rte_flow *,
 		      struct rte_flow_error *);
+void priv_flow_flush(struct priv *, struct mlx5_flows *);
 int mlx5_flow_flush(struct rte_eth_dev *, struct rte_flow_error *);
+int mlx5_flow_query(struct rte_eth_dev *, struct rte_flow *,
+		    enum rte_flow_action_type, void *,
+		    struct rte_flow_error *);
 int mlx5_flow_isolate(struct rte_eth_dev *, int, struct rte_flow_error *);
-int priv_flow_start(struct priv *);
-void priv_flow_stop(struct priv *);
-int priv_flow_rxq_in_use(struct priv *, struct rxq *);
+int priv_flow_start(struct priv *, struct mlx5_flows *);
+void priv_flow_stop(struct priv *, struct mlx5_flows *);
+int priv_flow_verify(struct priv *);
+int mlx5_ctrl_flow_vlan(struct rte_eth_dev *, struct rte_flow_item_eth *,
+			struct rte_flow_item_eth *, struct rte_flow_item_vlan *,
+			struct rte_flow_item_vlan *);
+int mlx5_ctrl_flow(struct rte_eth_dev *, struct rte_flow_item_eth *,
+		   struct rte_flow_item_eth *);
+int priv_flow_create_drop_queue(struct priv *);
+void priv_flow_delete_drop_queue(struct priv *);
+
+/* mlx5_socket.c */
+
+int priv_socket_init(struct priv *priv);
+int priv_socket_uninit(struct priv *priv);
+void priv_socket_handle(struct priv *priv);
+int priv_socket_connect(struct priv *priv);
+
+/* mlx5_mr.c */
+
+struct mlx5_mr *priv_mr_new(struct priv *, struct rte_mempool *);
+struct mlx5_mr *priv_mr_get(struct priv *, struct rte_mempool *);
+int priv_mr_release(struct priv *, struct mlx5_mr *);
+int priv_mr_verify(struct priv *);
 
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index a76bc6f6..3a7706cf 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -45,9 +45,6 @@
 /* Maximum number of simultaneous VLAN filters. */
 #define MLX5_MAX_VLAN_IDS 128
 
-/* Maximum number of special flows. */
-#define MLX5_MAX_SPECIAL_FLOWS 4
-
 /*
  * Request TX completion every time descriptors reach this threshold since
  * the previous request. Must be a power of two for performance reasons.
@@ -100,7 +97,8 @@
 
 /*
  * Maximum size of burst for vectorized Tx. This is related to the maximum size
- * of Enhaned MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
+ * of Enhanced MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
+ * Careful when changing, large value can cause WQE DS to overlap.
  */
 #define MLX5_VPMD_TX_MAX_BURST        32U
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index b0eb3cdf..c31ea4b6 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -31,6 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#define _GNU_SOURCE
+
 #include <stddef.h>
 #include <assert.h>
 #include <unistd.h>
@@ -49,21 +51,17 @@
 #include <linux/sockios.h>
 #include <linux/version.h>
 #include <fcntl.h>
+#include <stdalign.h>
+#include <sys/un.h>
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_atomic.h>
 #include <rte_ethdev.h>
+#include <rte_bus_pci.h>
 #include <rte_mbuf.h>
 #include <rte_common.h>
 #include <rte_interrupts.h>
 #include <rte_alarm.h>
 #include <rte_malloc.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
@@ -119,7 +117,6 @@ struct ethtool_link_settings {
 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
 #endif
-#define ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 (SCHAR_MAX)
 
 /**
  * Return private structure associated with an Ethernet device.
@@ -133,12 +130,7 @@ struct ethtool_link_settings {
 struct priv *
 mlx5_get_priv(struct rte_eth_dev *dev)
 {
-	struct mlx5_secondary_data *sd;
-
-	if (!mlx5_is_secondary())
-		return dev->data->dev_private;
-	sd = &mlx5_secondary_data[dev->data->port_id];
-	return sd->data.dev_private;
+	return dev->data->dev_private;
 }
 
 /**
@@ -150,7 +142,7 @@ mlx5_get_priv(struct rte_eth_dev *dev)
 inline int
 mlx5_is_secondary(void)
 {
-	return rte_eal_process_type() != RTE_PROC_PRIMARY;
+	return rte_eal_process_type() == RTE_PROC_SECONDARY;
 }
 
 /**
@@ -174,7 +166,7 @@ priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
 	char match[IF_NAMESIZE] = "";
 
 	{
-		MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);
+		MKSTR(path, "%s/device/net", priv->ibdev_path);
 
 		dir = opendir(path);
 		if (dir == NULL)
@@ -192,7 +184,7 @@ priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
 			continue;
 
 		MKSTR(path, "%s/device/net/%s/%s",
-		      priv->ctx->device->ibdev_path, name,
+		      priv->ibdev_path, name,
 		      (dev_type ? "dev_id" : "dev_port"));
 
 		file = fopen(path, "rb");
@@ -280,11 +272,11 @@ priv_sysfs_read(const struct priv *priv, const char *entry,
 
 	if (priv_is_ib_cntr(entry)) {
 		MKSTR(path, "%s/ports/1/hw_counters/%s",
-		      priv->ctx->device->ibdev_path, entry);
+		      priv->ibdev_path, entry);
 		file = fopen(path, "rb");
 	} else {
 		MKSTR(path, "%s/device/net/%s/%s",
-		      priv->ctx->device->ibdev_path, ifname, entry);
+		      priv->ibdev_path, ifname, entry);
 		file = fopen(path, "rb");
 	}
 	if (file == NULL)
@@ -327,8 +319,7 @@ priv_sysfs_write(const struct priv *priv, const char *entry,
 	if (priv_get_ifname(priv, &ifname))
 		return -1;
 
-	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
-	      ifname, entry);
+	MKSTR(path, "%s/device/net/%s/%s", priv->ibdev_path, ifname, entry);
 
 	file = fopen(path, "wb");
 	if (file == NULL)
@@ -585,8 +576,29 @@ dev_configure(struct rte_eth_dev *dev)
 	unsigned int i;
 	unsigned int j;
 	unsigned int reta_idx_n;
+	const uint8_t use_app_rss_key =
+		!!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
 
-	priv->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
+	if (use_app_rss_key &&
+	    (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
+	     rss_hash_default_key_len)) {
+		/* MLX5 RSS only support 40bytes key. */
+		return EINVAL;
+	}
+	priv->rss_conf.rss_key =
+		rte_realloc(priv->rss_conf.rss_key,
+			    rss_hash_default_key_len, 0);
+	if (!priv->rss_conf.rss_key) {
+		ERROR("cannot allocate RSS hash key memory (%u)", rxqs_n);
+		return ENOMEM;
+	}
+	memcpy(priv->rss_conf.rss_key,
+	       use_app_rss_key ?
+	       dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
+	       rss_hash_default_key,
+	       rss_hash_default_key_len);
+	priv->rss_conf.rss_key_len = rss_hash_default_key_len;
+	priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
 	priv->rxqs = (void *)dev->data->rx_queues;
 	priv->txqs = (void *)dev->data->tx_queues;
 	if (txqs_n != priv->txqs_n) {
@@ -672,8 +684,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	 * Since we need one CQ per QP, the limit is the minimum number
 	 * between the two values.
 	 */
-	max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ?
-	       priv->device_attr.max_qp : priv->device_attr.max_cq);
+	max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
+		      priv->device_attr.orig_attr.max_qp);
 	/* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
 	if (max >= 65535)
 		max = 65535;
@@ -686,7 +698,9 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 		  DEV_RX_OFFLOAD_UDP_CKSUM |
 		  DEV_RX_OFFLOAD_TCP_CKSUM) :
 		 0) |
-		(priv->hw_vlan_strip ? DEV_RX_OFFLOAD_VLAN_STRIP : 0);
+		(priv->hw_vlan_strip ? DEV_RX_OFFLOAD_VLAN_STRIP : 0) |
+		DEV_RX_OFFLOAD_TIMESTAMP;
+
 	if (!priv->mps)
 		info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
 	if (priv->hw_csum)
@@ -704,9 +718,7 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 		info->if_index = if_nametoindex(ifname);
 	info->reta_size = priv->reta_idx_n ?
 		priv->reta_idx_n : priv->ind_table_max_size;
-	info->hash_key_size = ((*priv->rss_conf) ?
-			       (*priv->rss_conf)[0]->rss_key_len :
-			       0);
+	info->hash_key_size = priv->rss_conf.rss_key_len;
 	info->speed_capa = priv->link_speed_capa;
 	priv_unlock(priv);
 }
@@ -816,12 +828,7 @@ static int
 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete)
 {
 	struct priv *priv = mlx5_get_priv(dev);
-	__extension__ struct {
-		struct ethtool_link_settings edata;
-		uint32_t link_mode_data[3 *
-					ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32];
-	} ecmd;
-
+	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
 	struct ifreq ifr;
 	struct rte_eth_link dev_link;
 	uint64_t sc;
@@ -834,23 +841,29 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete)
 	memset(&dev_link, 0, sizeof(dev_link));
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
-	memset(&ecmd, 0, sizeof(ecmd));
-	ecmd.edata.cmd = ETHTOOL_GLINKSETTINGS;
-	ifr.ifr_data = (void *)&ecmd;
+	ifr.ifr_data = (void *)&gcmd;
 	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
 		DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s",
 		      strerror(errno));
 		return -1;
 	}
-	ecmd.edata.link_mode_masks_nwords = -ecmd.edata.link_mode_masks_nwords;
+	gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
+
+	alignas(struct ethtool_link_settings)
+	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
+		     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
+	struct ethtool_link_settings *ecmd = (void *)data;
+
+	*ecmd = gcmd;
+	ifr.ifr_data = (void *)ecmd;
 	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
 		DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s",
 		      strerror(errno));
 		return -1;
 	}
-	dev_link.link_speed = ecmd.edata.speed;
-	sc = ecmd.edata.link_mode_masks[0] |
-		((uint64_t)ecmd.edata.link_mode_masks[1] << 32);
+	dev_link.link_speed = ecmd->speed;
+	sc = ecmd->link_mode_masks[0] |
+		((uint64_t)ecmd->link_mode_masks[1] << 32);
 	priv->link_speed_capa = 0;
 	if (sc & ETHTOOL_LINK_MODE_Autoneg_BIT)
 		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
@@ -886,7 +899,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete)
 		  ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT |
 		  ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))
 		priv->link_speed_capa |= ETH_LINK_SPEED_100G;
-	dev_link.link_duplex = ((ecmd.edata.duplex == DUPLEX_HALF) ?
+	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
 				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
 	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
 				  ETH_LINK_SPEED_FIXED);
@@ -1124,47 +1137,77 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
 }
 
 /**
- * Link status handler.
+ * Update the link status.
  *
  * @param priv
  *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
  *
  * @return
- *   Nonzero if the callback process can be called immediately.
+ *   Zero if the callback process can be called immediately.
  */
 static int
-priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
+priv_link_status_update(struct priv *priv)
+{
+	struct rte_eth_link *link = &priv->dev->data->dev_link;
+
+	mlx5_link_update(priv->dev, 0);
+	if (((link->link_speed == 0) && link->link_status) ||
+		((link->link_speed != 0) && !link->link_status)) {
+		/*
+		 * Inconsistent status. Event likely occurred before the
+		 * kernel netdevice exposes the new status.
+		 */
+		if (!priv->pending_alarm) {
+			priv->pending_alarm = 1;
+			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
+					  mlx5_dev_link_status_handler,
+					  priv->dev);
+		}
+		return 1;
+	} else if (unlikely(priv->pending_alarm)) {
+		/* Link interrupt occurred while alarm is already scheduled. */
+		priv->pending_alarm = 0;
+		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
+	}
+	return 0;
+}
+
+/**
+ * Device status handler.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param events
+ *   Pointer to event flags holder.
+ *
+ * @return
+ *   Events bitmap of callback process which can be called immediately.
+ */
+static uint32_t
+priv_dev_status_handler(struct priv *priv)
 {
 	struct ibv_async_event event;
-	struct rte_eth_link *link = &dev->data->dev_link;
-	int ret = 0;
+	uint32_t ret = 0;
 
 	/* Read all message and acknowledge them. */
 	for (;;) {
 		if (ibv_get_async_event(priv->ctx, &event))
 			break;
-
-		if (event.event_type != IBV_EVENT_PORT_ACTIVE &&
-		    event.event_type != IBV_EVENT_PORT_ERR)
+		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
+			event.event_type == IBV_EVENT_PORT_ERR) &&
+			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
+			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
+		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
+			priv->dev->data->dev_conf.intr_conf.rmv == 1)
+			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
+		else
 			DEBUG("event type %d on port %d not handled",
 			      event.event_type, event.element.port_num);
 		ibv_ack_async_event(&event);
 	}
-	mlx5_link_update(dev, 0);
-	if (((link->link_speed == 0) && link->link_status) ||
-	    ((link->link_speed != 0) && !link->link_status)) {
-		if (!priv->pending_alarm) {
-			/* Inconsistent status, check again later. */
-			priv->pending_alarm = 1;
-			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
-					  mlx5_dev_link_status_handler,
-					  dev);
-		}
-	} else {
-		ret = 1;
-	}
+	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
+		if (priv_link_status_update(priv))
+			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
 	return ret;
 }
 
@@ -1184,9 +1227,9 @@ mlx5_dev_link_status_handler(void *arg)
 	priv_lock(priv);
 	assert(priv->pending_alarm == 1);
 	priv->pending_alarm = 0;
-	ret = priv_dev_link_status_handler(priv, dev);
+	ret = priv_link_status_update(priv);
 	priv_unlock(priv);
-	if (ret)
+	if (!ret)
 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
 					      NULL);
 }
@@ -1204,14 +1247,34 @@ mlx5_dev_interrupt_handler(void *cb_arg)
 {
 	struct rte_eth_dev *dev = cb_arg;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
+	uint32_t events;
 
 	priv_lock(priv);
-	ret = priv_dev_link_status_handler(priv, dev);
+	events = priv_dev_status_handler(priv);
 	priv_unlock(priv);
-	if (ret)
+	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
 					      NULL);
+	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
+		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
+					      NULL);
+}
+
+/**
+ * Handle interrupts from the socket.
+ *
+ * @param cb_arg
+ *   Callback argument.
+ */
+static void
+mlx5_dev_handler_socket(void *cb_arg)
+{
+	struct rte_eth_dev *dev = cb_arg;
+	struct priv *priv = dev->data->dev_private;
+
+	priv_lock(priv);
+	priv_socket_handle(priv);
+	priv_unlock(priv);
 }
 
 /**
@@ -1225,16 +1288,20 @@ mlx5_dev_interrupt_handler(void *cb_arg)
 void
 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
 {
-	if (!dev->data->dev_conf.intr_conf.lsc)
-		return;
-	rte_intr_callback_unregister(&priv->intr_handle,
-				     mlx5_dev_interrupt_handler,
-				     dev);
+	if (dev->data->dev_conf.intr_conf.lsc ||
+	    dev->data->dev_conf.intr_conf.rmv)
+		rte_intr_callback_unregister(&priv->intr_handle,
+					     mlx5_dev_interrupt_handler, dev);
+	if (priv->primary_socket)
+		rte_intr_callback_unregister(&priv->intr_handle_socket,
+					     mlx5_dev_handler_socket, dev);
 	if (priv->pending_alarm)
 		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
 	priv->pending_alarm = 0;
 	priv->intr_handle.fd = 0;
 	priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+	priv->intr_handle_socket.fd = 0;
+	priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
 }
 
 /**
@@ -1250,20 +1317,29 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
 {
 	int rc, flags;
 
-	if (!dev->data->dev_conf.intr_conf.lsc)
-		return;
+	assert(!mlx5_is_secondary());
 	assert(priv->ctx->async_fd > 0);
 	flags = fcntl(priv->ctx->async_fd, F_GETFL);
 	rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
 	if (rc < 0) {
 		INFO("failed to change file descriptor async event queue");
 		dev->data->dev_conf.intr_conf.lsc = 0;
-	} else {
+		dev->data->dev_conf.intr_conf.rmv = 0;
+	}
+	if (dev->data->dev_conf.intr_conf.lsc ||
+	    dev->data->dev_conf.intr_conf.rmv) {
 		priv->intr_handle.fd = priv->ctx->async_fd;
 		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
 		rte_intr_callback_register(&priv->intr_handle,
-					   mlx5_dev_interrupt_handler,
-					   dev);
+					   mlx5_dev_interrupt_handler, dev);
+	}
+
+	rc = priv_socket_init(priv);
+	if (!rc && priv->primary_socket) {
+		priv->intr_handle_socket.fd = priv->primary_socket;
+		priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
+		rte_intr_callback_register(&priv->intr_handle_socket,
+					   mlx5_dev_handler_socket, dev);
 	}
 }
 
@@ -1271,7 +1347,9 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
  * Change the link state (UP / DOWN).
  *
  * @param priv
- *   Pointer to Ethernet device structure.
+ *   Pointer to private data structure.
+ * @param dev
+ *   Pointer to rte_eth_dev structure.
  * @param up
  *   Nonzero for link up, otherwise link down.
  *
@@ -1279,17 +1357,16 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
  *   0 on success, errno value on failure.
  */
 static int
-priv_set_link(struct priv *priv, int up)
+priv_dev_set_link(struct priv *priv, struct rte_eth_dev *dev, int up)
 {
-	struct rte_eth_dev *dev = priv->dev;
 	int err;
 
 	if (up) {
 		err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
 		if (err)
 			return err;
-		priv_select_tx_function(priv);
-		priv_select_rx_function(priv);
+		priv_dev_select_tx_function(priv, dev);
+		priv_dev_select_rx_function(priv, dev);
 	} else {
 		err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
 		if (err)
@@ -1316,7 +1393,7 @@ mlx5_set_link_down(struct rte_eth_dev *dev)
 	int err;
 
 	priv_lock(priv);
-	err = priv_set_link(priv, 0);
+	err = priv_dev_set_link(priv, dev, 0);
 	priv_unlock(priv);
 	return err;
 }
@@ -1337,195 +1414,42 @@ mlx5_set_link_up(struct rte_eth_dev *dev)
 	int err;
 
 	priv_lock(priv);
-	err = priv_set_link(priv, 1);
+	err = priv_dev_set_link(priv, dev, 1);
 	priv_unlock(priv);
 	return err;
 }
 
 /**
- * Configure secondary process queues from a private data pointer (primary
- * or secondary) and update burst callbacks. Can take place only once.
- *
- * All queues must have been previously created by the primary process to
- * avoid undefined behavior.
- *
- * @param priv
- *   Private data pointer from either primary or secondary process.
- *
- * @return
- *   Private data pointer from secondary process, NULL in case of error.
- */
-struct priv *
-mlx5_secondary_data_setup(struct priv *priv)
-{
-	unsigned int port_id = 0;
-	struct mlx5_secondary_data *sd;
-	void **tx_queues;
-	void **rx_queues;
-	unsigned int nb_tx_queues;
-	unsigned int nb_rx_queues;
-	unsigned int i;
-
-	/* priv must be valid at this point. */
-	assert(priv != NULL);
-	/* priv->dev must also be valid but may point to local memory from
-	 * another process, possibly with the same address and must not
-	 * be dereferenced yet. */
-	assert(priv->dev != NULL);
-	/* Determine port ID by finding out where priv comes from. */
-	while (1) {
-		sd = &mlx5_secondary_data[port_id];
-		rte_spinlock_lock(&sd->lock);
-		/* Primary process? */
-		if (sd->primary_priv == priv)
-			break;
-		/* Secondary process? */
-		if (sd->data.dev_private == priv)
-			break;
-		rte_spinlock_unlock(&sd->lock);
-		if (++port_id == RTE_DIM(mlx5_secondary_data))
-			port_id = 0;
-	}
-	/* Switch to secondary private structure. If private data has already
-	 * been updated by another thread, there is nothing else to do. */
-	priv = sd->data.dev_private;
-	if (priv->dev->data == &sd->data)
-		goto end;
-	/* Sanity checks. Secondary private structure is supposed to point
-	 * to local eth_dev, itself still pointing to the shared device data
-	 * structure allocated by the primary process. */
-	assert(sd->shared_dev_data != &sd->data);
-	assert(sd->data.nb_tx_queues == 0);
-	assert(sd->data.tx_queues == NULL);
-	assert(sd->data.nb_rx_queues == 0);
-	assert(sd->data.rx_queues == NULL);
-	assert(priv != sd->primary_priv);
-	assert(priv->dev->data == sd->shared_dev_data);
-	assert(priv->txqs_n == 0);
-	assert(priv->txqs == NULL);
-	assert(priv->rxqs_n == 0);
-	assert(priv->rxqs == NULL);
-	nb_tx_queues = sd->shared_dev_data->nb_tx_queues;
-	nb_rx_queues = sd->shared_dev_data->nb_rx_queues;
-	/* Allocate local storage for queues. */
-	tx_queues = rte_zmalloc("secondary ethdev->tx_queues",
-				sizeof(sd->data.tx_queues[0]) * nb_tx_queues,
-				RTE_CACHE_LINE_SIZE);
-	rx_queues = rte_zmalloc("secondary ethdev->rx_queues",
-				sizeof(sd->data.rx_queues[0]) * nb_rx_queues,
-				RTE_CACHE_LINE_SIZE);
-	if (tx_queues == NULL || rx_queues == NULL)
-		goto error;
-	/* Lock to prevent control operations during setup. */
-	priv_lock(priv);
-	/* TX queues. */
-	for (i = 0; i != nb_tx_queues; ++i) {
-		struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
-		struct txq_ctrl *primary_txq_ctrl;
-		struct txq_ctrl *txq_ctrl;
-
-		if (primary_txq == NULL)
-			continue;
-		primary_txq_ctrl = container_of(primary_txq,
-						struct txq_ctrl, txq);
-		txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl) +
-					     (1 << primary_txq->elts_n) *
-					     sizeof(struct rte_mbuf *), 0,
-					     primary_txq_ctrl->socket);
-		if (txq_ctrl != NULL) {
-			if (txq_ctrl_setup(priv->dev,
-					   txq_ctrl,
-					   1 << primary_txq->elts_n,
-					   primary_txq_ctrl->socket,
-					   NULL) == 0) {
-				txq_ctrl->txq.stats.idx =
-					primary_txq->stats.idx;
-				tx_queues[i] = &txq_ctrl->txq;
-				continue;
-			}
-			rte_free(txq_ctrl);
-		}
-		while (i) {
-			txq_ctrl = tx_queues[--i];
-			txq_cleanup(txq_ctrl);
-			rte_free(txq_ctrl);
-		}
-		goto error;
-	}
-	/* RX queues. */
-	for (i = 0; i != nb_rx_queues; ++i) {
-		struct rxq_ctrl *primary_rxq =
-			container_of((*sd->primary_priv->rxqs)[i],
-				     struct rxq_ctrl, rxq);
-
-		if (primary_rxq == NULL)
-			continue;
-		/* Not supported yet. */
-		rx_queues[i] = NULL;
-	}
-	/* Update everything. */
-	priv->txqs = (void *)tx_queues;
-	priv->txqs_n = nb_tx_queues;
-	priv->rxqs = (void *)rx_queues;
-	priv->rxqs_n = nb_rx_queues;
-	sd->data.rx_queues = rx_queues;
-	sd->data.tx_queues = tx_queues;
-	sd->data.nb_rx_queues = nb_rx_queues;
-	sd->data.nb_tx_queues = nb_tx_queues;
-	sd->data.dev_link = sd->shared_dev_data->dev_link;
-	sd->data.mtu = sd->shared_dev_data->mtu;
-	memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state,
-	       sizeof(sd->data.rx_queue_state));
-	memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state,
-	       sizeof(sd->data.tx_queue_state));
-	sd->data.dev_flags = sd->shared_dev_data->dev_flags;
-	/* Use local data from now on. */
-	rte_mb();
-	priv->dev->data = &sd->data;
-	rte_mb();
-	priv_select_tx_function(priv);
-	priv_select_rx_function(priv);
-	priv_unlock(priv);
-end:
-	/* More sanity checks. */
-	assert(priv->dev->data == &sd->data);
-	rte_spinlock_unlock(&sd->lock);
-	return priv;
-error:
-	priv_unlock(priv);
-	rte_free(tx_queues);
-	rte_free(rx_queues);
-	rte_spinlock_unlock(&sd->lock);
-	return NULL;
-}
-
-/**
  * Configure the TX function to use.
  *
  * @param priv
- *   Pointer to private structure.
+ *   Pointer to private data structure.
+ * @param dev
+ *   Pointer to rte_eth_dev structure.
  */
 void
-priv_select_tx_function(struct priv *priv)
+priv_dev_select_tx_function(struct priv *priv, struct rte_eth_dev *dev)
 {
-	priv->dev->tx_pkt_burst = mlx5_tx_burst;
+	assert(priv != NULL);
+	assert(dev != NULL);
+	dev->tx_pkt_burst = mlx5_tx_burst;
 	/* Select appropriate TX function. */
 	if (priv->mps == MLX5_MPW_ENHANCED) {
 		if (priv_check_vec_tx_support(priv) > 0) {
 			if (priv_check_raw_vec_tx_support(priv) > 0)
-				priv->dev->tx_pkt_burst = mlx5_tx_burst_raw_vec;
+				dev->tx_pkt_burst = mlx5_tx_burst_raw_vec;
 			else
-				priv->dev->tx_pkt_burst = mlx5_tx_burst_vec;
+				dev->tx_pkt_burst = mlx5_tx_burst_vec;
 			DEBUG("selected Enhanced MPW TX vectorized function");
 		} else {
-			priv->dev->tx_pkt_burst = mlx5_tx_burst_empw;
+			dev->tx_pkt_burst = mlx5_tx_burst_empw;
 			DEBUG("selected Enhanced MPW TX function");
 		}
 	} else if (priv->mps && priv->txq_inline) {
-		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
+		dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
 		DEBUG("selected MPW inline TX function");
 	} else if (priv->mps) {
-		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw;
+		dev->tx_pkt_burst = mlx5_tx_burst_mpw;
 		DEBUG("selected MPW TX function");
 	}
 }
@@ -1534,16 +1458,19 @@ priv_select_tx_function(struct priv *priv)
  * Configure the RX function to use.
  *
  * @param priv
- *   Pointer to private structure.
+ *   Pointer to private data structure.
+ * @param dev
+ *   Pointer to rte_eth_dev structure.
  */
 void
-priv_select_rx_function(struct priv *priv)
+priv_dev_select_rx_function(struct priv *priv, struct rte_eth_dev *dev)
 {
+	assert(priv != NULL);
+	assert(dev != NULL);
 	if (priv_check_vec_rx_support(priv) > 0) {
-		priv_prep_vec_rx_function(priv);
-		priv->dev->rx_pkt_burst = mlx5_rx_burst_vec;
+		dev->rx_pkt_burst = mlx5_rx_burst_vec;
 		DEBUG("selected RX vectorized function");
 	} else {
-		priv->dev->rx_pkt_burst = mlx5_rx_burst;
+		dev->rx_pkt_burst = mlx5_rx_burst;
 	}
 }
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
deleted file mode 100644
index 34a7e69f..00000000
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ /dev/null
@@ -1,1101 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright 2015 6WIND S.A.
- *   Copyright 2015 Mellanox.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of 6WIND S.A. nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stddef.h>
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-#include <errno.h>
-
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
-#include <infiniband/verbs.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
-
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
-#include <rte_ether.h>
-#include <rte_malloc.h>
-#include <rte_ethdev.h>
-#include <rte_common.h>
-#include <rte_flow.h>
-#include <rte_flow_driver.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
-
-#include "mlx5.h"
-#include "mlx5_rxtx.h"
-
-struct fdir_flow_desc {
-	uint16_t dst_port;
-	uint16_t src_port;
-	uint32_t src_ip[4];
-	uint32_t dst_ip[4];
-	uint8_t	mac[6];
-	uint16_t vlan_tag;
-	enum hash_rxq_type type;
-};
-
-struct mlx5_fdir_filter {
-	LIST_ENTRY(mlx5_fdir_filter) next;
-	uint16_t queue; /* Queue assigned to if FDIR match. */
-	enum rte_eth_fdir_behavior behavior;
-	struct fdir_flow_desc desc;
-	struct ibv_exp_flow *flow;
-};
-
-LIST_HEAD(fdir_filter_list, mlx5_fdir_filter);
-
-/**
- * Convert struct rte_eth_fdir_filter to mlx5 filter descriptor.
- *
- * @param[in] fdir_filter
- *   DPDK filter structure to convert.
- * @param[out] desc
- *   Resulting mlx5 filter descriptor.
- * @param mode
- *   Flow director mode.
- */
-static void
-fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
-			 struct fdir_flow_desc *desc, enum rte_fdir_mode mode)
-{
-	/* Initialize descriptor. */
-	memset(desc, 0, sizeof(*desc));
-
-	/* Set VLAN ID. */
-	desc->vlan_tag = fdir_filter->input.flow_ext.vlan_tci;
-
-	/* Set MAC address. */
-	if (mode == RTE_FDIR_MODE_PERFECT_MAC_VLAN) {
-		rte_memcpy(desc->mac,
-			   fdir_filter->input.flow.mac_vlan_flow.mac_addr.
-				addr_bytes,
-			   sizeof(desc->mac));
-		desc->type = HASH_RXQ_ETH;
-		return;
-	}
-
-	/* Set mode */
-	switch (fdir_filter->input.flow_type) {
-	case RTE_ETH_FLOW_NONFRAG_IPV4_UDP:
-		desc->type = HASH_RXQ_UDPV4;
-		break;
-	case RTE_ETH_FLOW_NONFRAG_IPV4_TCP:
-		desc->type = HASH_RXQ_TCPV4;
-		break;
-	case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
-		desc->type = HASH_RXQ_IPV4;
-		break;
-	case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
-		desc->type = HASH_RXQ_UDPV6;
-		break;
-	case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
-		desc->type = HASH_RXQ_TCPV6;
-		break;
-	case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
-		desc->type = HASH_RXQ_IPV6;
-		break;
-	default:
-		break;
-	}
-
-	/* Set flow values */
-	switch (fdir_filter->input.flow_type) {
-	case RTE_ETH_FLOW_NONFRAG_IPV4_UDP:
-	case RTE_ETH_FLOW_NONFRAG_IPV4_TCP:
-		desc->src_port = fdir_filter->input.flow.udp4_flow.src_port;
-		desc->dst_port = fdir_filter->input.flow.udp4_flow.dst_port;
-		/* fallthrough */
-	case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
-		desc->src_ip[0] = fdir_filter->input.flow.ip4_flow.src_ip;
-		desc->dst_ip[0] = fdir_filter->input.flow.ip4_flow.dst_ip;
-		break;
-	case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
-	case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
-		desc->src_port = fdir_filter->input.flow.udp6_flow.src_port;
-		desc->dst_port = fdir_filter->input.flow.udp6_flow.dst_port;
-		/* Fall through. */
-	case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
-		rte_memcpy(desc->src_ip,
-			   fdir_filter->input.flow.ipv6_flow.src_ip,
-			   sizeof(desc->src_ip));
-		rte_memcpy(desc->dst_ip,
-			   fdir_filter->input.flow.ipv6_flow.dst_ip,
-			   sizeof(desc->dst_ip));
-		break;
-	default:
-		break;
-	}
-}
-
-/**
- * Check if two flow descriptors overlap according to configured mask.
- *
- * @param priv
- *   Private structure that provides flow director mask.
- * @param desc1
- *   First flow descriptor to compare.
- * @param desc2
- *   Second flow descriptor to compare.
- *
- * @return
- *   Nonzero if descriptors overlap.
- */
-static int
-priv_fdir_overlap(const struct priv *priv,
-		  const struct fdir_flow_desc *desc1,
-		  const struct fdir_flow_desc *desc2)
-{
-	const struct rte_eth_fdir_masks *mask =
-		&priv->dev->data->dev_conf.fdir_conf.mask;
-	unsigned int i;
-
-	if (desc1->type != desc2->type)
-		return 0;
-	/* Ignore non masked bits. */
-	for (i = 0; i != RTE_DIM(desc1->mac); ++i)
-		if ((desc1->mac[i] & mask->mac_addr_byte_mask) !=
-		    (desc2->mac[i] & mask->mac_addr_byte_mask))
-			return 0;
-	if (((desc1->src_port & mask->src_port_mask) !=
-	     (desc2->src_port & mask->src_port_mask)) ||
-	    ((desc1->dst_port & mask->dst_port_mask) !=
-	     (desc2->dst_port & mask->dst_port_mask)))
-		return 0;
-	switch (desc1->type) {
-	case HASH_RXQ_IPV4:
-	case HASH_RXQ_UDPV4:
-	case HASH_RXQ_TCPV4:
-		if (((desc1->src_ip[0] & mask->ipv4_mask.src_ip) !=
-		     (desc2->src_ip[0] & mask->ipv4_mask.src_ip)) ||
-		    ((desc1->dst_ip[0] & mask->ipv4_mask.dst_ip) !=
-		     (desc2->dst_ip[0] & mask->ipv4_mask.dst_ip)))
-			return 0;
-		break;
-	case HASH_RXQ_IPV6:
-	case HASH_RXQ_UDPV6:
-	case HASH_RXQ_TCPV6:
-		for (i = 0; i != RTE_DIM(desc1->src_ip); ++i)
-			if (((desc1->src_ip[i] & mask->ipv6_mask.src_ip[i]) !=
-			     (desc2->src_ip[i] & mask->ipv6_mask.src_ip[i])) ||
-			    ((desc1->dst_ip[i] & mask->ipv6_mask.dst_ip[i]) !=
-			     (desc2->dst_ip[i] & mask->ipv6_mask.dst_ip[i])))
-				return 0;
-		break;
-	default:
-		break;
-	}
-	return 1;
-}
-
-/**
- * Create flow director steering rule for a specific filter.
- *
- * @param priv
- *   Private structure.
- * @param mlx5_fdir_filter
- *   Filter to create a steering rule for.
- * @param fdir_queue
- *   Flow director queue for matching packets.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-priv_fdir_flow_add(struct priv *priv,
-		   struct mlx5_fdir_filter *mlx5_fdir_filter,
-		   struct fdir_queue *fdir_queue)
-{
-	struct ibv_exp_flow *flow;
-	struct fdir_flow_desc *desc = &mlx5_fdir_filter->desc;
-	enum rte_fdir_mode fdir_mode =
-		priv->dev->data->dev_conf.fdir_conf.mode;
-	struct rte_eth_fdir_masks *mask =
-		&priv->dev->data->dev_conf.fdir_conf.mask;
-	FLOW_ATTR_SPEC_ETH(data, priv_flow_attr(priv, NULL, 0, desc->type));
-	struct ibv_exp_flow_attr *attr = &data->attr;
-	uintptr_t spec_offset = (uintptr_t)&data->spec;
-	struct ibv_exp_flow_spec_eth *spec_eth;
-	struct ibv_exp_flow_spec_ipv4 *spec_ipv4;
-	struct ibv_exp_flow_spec_ipv6 *spec_ipv6;
-	struct ibv_exp_flow_spec_tcp_udp *spec_tcp_udp;
-	struct mlx5_fdir_filter *iter_fdir_filter;
-	unsigned int i;
-
-	/* Abort if an existing flow overlaps this one to avoid packet
-	 * duplication, even if it targets another queue. */
-	LIST_FOREACH(iter_fdir_filter, priv->fdir_filter_list, next)
-		if ((iter_fdir_filter != mlx5_fdir_filter) &&
-		    (iter_fdir_filter->flow != NULL) &&
-		    (priv_fdir_overlap(priv,
-				       &mlx5_fdir_filter->desc,
-				       &iter_fdir_filter->desc)))
-			return EEXIST;
-
-	/*
-	 * No padding must be inserted by the compiler between attr and spec.
-	 * This layout is expected by libibverbs.
-	 */
-	assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec_offset);
-	priv_flow_attr(priv, attr, sizeof(data), desc->type);
-
-	/* Set Ethernet spec */
-	spec_eth = (struct ibv_exp_flow_spec_eth *)spec_offset;
-
-	/* The first specification must be Ethernet. */
-	assert(spec_eth->type == IBV_EXP_FLOW_SPEC_ETH);
-	assert(spec_eth->size == sizeof(*spec_eth));
-
-	/* VLAN ID */
-	spec_eth->val.vlan_tag = desc->vlan_tag & mask->vlan_tci_mask;
-	spec_eth->mask.vlan_tag = mask->vlan_tci_mask;
-
-	/* Update priority */
-	attr->priority = 2;
-
-	if (fdir_mode == RTE_FDIR_MODE_PERFECT_MAC_VLAN) {
-		/* MAC Address */
-		for (i = 0; i != RTE_DIM(spec_eth->mask.dst_mac); ++i) {
-			spec_eth->val.dst_mac[i] =
-				desc->mac[i] & mask->mac_addr_byte_mask;
-			spec_eth->mask.dst_mac[i] = mask->mac_addr_byte_mask;
-		}
-		goto create_flow;
-	}
-
-	switch (desc->type) {
-	case HASH_RXQ_IPV4:
-	case HASH_RXQ_UDPV4:
-	case HASH_RXQ_TCPV4:
-		spec_offset += spec_eth->size;
-
-		/* Set IP spec */
-		spec_ipv4 = (struct ibv_exp_flow_spec_ipv4 *)spec_offset;
-
-		/* The second specification must be IP. */
-		assert(spec_ipv4->type == IBV_EXP_FLOW_SPEC_IPV4);
-		assert(spec_ipv4->size == sizeof(*spec_ipv4));
-
-		spec_ipv4->val.src_ip =
-			desc->src_ip[0] & mask->ipv4_mask.src_ip;
-		spec_ipv4->val.dst_ip =
-			desc->dst_ip[0] & mask->ipv4_mask.dst_ip;
-		spec_ipv4->mask.src_ip = mask->ipv4_mask.src_ip;
-		spec_ipv4->mask.dst_ip = mask->ipv4_mask.dst_ip;
-
-		/* Update priority */
-		attr->priority = 1;
-
-		if (desc->type == HASH_RXQ_IPV4)
-			goto create_flow;
-
-		spec_offset += spec_ipv4->size;
-		break;
-	case HASH_RXQ_IPV6:
-	case HASH_RXQ_UDPV6:
-	case HASH_RXQ_TCPV6:
-		spec_offset += spec_eth->size;
-
-		/* Set IP spec */
-		spec_ipv6 = (struct ibv_exp_flow_spec_ipv6 *)spec_offset;
-
-		/* The second specification must be IP. */
-		assert(spec_ipv6->type == IBV_EXP_FLOW_SPEC_IPV6);
-		assert(spec_ipv6->size == sizeof(*spec_ipv6));
-
-		for (i = 0; i != RTE_DIM(desc->src_ip); ++i) {
-			((uint32_t *)spec_ipv6->val.src_ip)[i] =
-				desc->src_ip[i] & mask->ipv6_mask.src_ip[i];
-			((uint32_t *)spec_ipv6->val.dst_ip)[i] =
-				desc->dst_ip[i] & mask->ipv6_mask.dst_ip[i];
-		}
-		rte_memcpy(spec_ipv6->mask.src_ip,
-			   mask->ipv6_mask.src_ip,
-			   sizeof(spec_ipv6->mask.src_ip));
-		rte_memcpy(spec_ipv6->mask.dst_ip,
-			   mask->ipv6_mask.dst_ip,
-			   sizeof(spec_ipv6->mask.dst_ip));
-
-		/* Update priority */
-		attr->priority = 1;
-
-		if (desc->type == HASH_RXQ_IPV6)
-			goto create_flow;
-
-		spec_offset += spec_ipv6->size;
-		break;
-	default:
-		ERROR("invalid flow attribute type");
-		return EINVAL;
-	}
-
-	/* Set TCP/UDP flow specification. */
-	spec_tcp_udp = (struct ibv_exp_flow_spec_tcp_udp *)spec_offset;
-
-	/* The third specification must be TCP/UDP. */
-	assert(spec_tcp_udp->type == IBV_EXP_FLOW_SPEC_TCP ||
-	       spec_tcp_udp->type == IBV_EXP_FLOW_SPEC_UDP);
-	assert(spec_tcp_udp->size == sizeof(*spec_tcp_udp));
-
-	spec_tcp_udp->val.src_port = desc->src_port & mask->src_port_mask;
-	spec_tcp_udp->val.dst_port = desc->dst_port & mask->dst_port_mask;
-	spec_tcp_udp->mask.src_port = mask->src_port_mask;
-	spec_tcp_udp->mask.dst_port = mask->dst_port_mask;
-
-	/* Update priority */
-	attr->priority = 0;
-
-create_flow:
-
-	errno = 0;
-	flow = ibv_exp_create_flow(fdir_queue->qp, attr);
-	if (flow == NULL) {
-		/* It's not clear whether errno is always set in this case. */
-		ERROR("%p: flow director configuration failed, errno=%d: %s",
-		      (void *)priv, errno,
-		      (errno ? strerror(errno) : "Unknown error"));
-		if (errno)
-			return errno;
-		return EINVAL;
-	}
-
-	DEBUG("%p: added flow director rule (%p)", (void *)priv, (void *)flow);
-	mlx5_fdir_filter->flow = flow;
-	return 0;
-}
-
-/**
- * Destroy a flow director queue.
- *
- * @param fdir_queue
- *   Flow director queue to be destroyed.
- */
-void
-priv_fdir_queue_destroy(struct priv *priv, struct fdir_queue *fdir_queue)
-{
-	struct mlx5_fdir_filter *fdir_filter;
-
-	/* Disable filter flows still applying to this queue. */
-	LIST_FOREACH(fdir_filter, priv->fdir_filter_list, next) {
-		unsigned int idx = fdir_filter->queue;
-		struct rxq_ctrl *rxq_ctrl =
-			container_of((*priv->rxqs)[idx], struct rxq_ctrl, rxq);
-
-		assert(idx < priv->rxqs_n);
-		if (fdir_queue == rxq_ctrl->fdir_queue &&
-		    fdir_filter->flow != NULL) {
-			claim_zero(ibv_exp_destroy_flow(fdir_filter->flow));
-			fdir_filter->flow = NULL;
-		}
-	}
-	assert(fdir_queue->qp);
-	claim_zero(ibv_destroy_qp(fdir_queue->qp));
-	assert(fdir_queue->ind_table);
-	claim_zero(ibv_exp_destroy_rwq_ind_table(fdir_queue->ind_table));
-	if (fdir_queue->wq)
-		claim_zero(ibv_exp_destroy_wq(fdir_queue->wq));
-	if (fdir_queue->cq)
-		claim_zero(ibv_destroy_cq(fdir_queue->cq));
-#ifndef NDEBUG
-	memset(fdir_queue, 0x2a, sizeof(*fdir_queue));
-#endif
-	rte_free(fdir_queue);
-}
-
-/**
- * Create a flow director queue.
- *
- * @param priv
- *   Private structure.
- * @param wq
- *   Work queue to route matched packets to, NULL if one needs to
- *   be created.
- *
- * @return
- *   Related flow director queue on success, NULL otherwise.
- */
-static struct fdir_queue *
-priv_fdir_queue_create(struct priv *priv, struct ibv_exp_wq *wq,
-		       unsigned int socket)
-{
-	struct fdir_queue *fdir_queue;
-
-	fdir_queue = rte_calloc_socket(__func__, 1, sizeof(*fdir_queue),
-				       0, socket);
-	if (!fdir_queue) {
-		ERROR("cannot allocate flow director queue");
-		return NULL;
-	}
-	assert(priv->pd);
-	assert(priv->ctx);
-	if (!wq) {
-		fdir_queue->cq = ibv_exp_create_cq(
-			priv->ctx, 1, NULL, NULL, 0,
-			&(struct ibv_exp_cq_init_attr){
-				.comp_mask = 0,
-			});
-		if (!fdir_queue->cq) {
-			ERROR("cannot create flow director CQ");
-			goto error;
-		}
-		fdir_queue->wq = ibv_exp_create_wq(
-			priv->ctx,
-			&(struct ibv_exp_wq_init_attr){
-				.wq_type = IBV_EXP_WQT_RQ,
-				.max_recv_wr = 1,
-				.max_recv_sge = 1,
-				.pd = priv->pd,
-				.cq = fdir_queue->cq,
-			});
-		if (!fdir_queue->wq) {
-			ERROR("cannot create flow director WQ");
-			goto error;
-		}
-		wq = fdir_queue->wq;
-	}
-	fdir_queue->ind_table = ibv_exp_create_rwq_ind_table(
-		priv->ctx,
-		&(struct ibv_exp_rwq_ind_table_init_attr){
-			.pd = priv->pd,
-			.log_ind_tbl_size = 0,
-			.ind_tbl = &wq,
-			.comp_mask = 0,
-		});
-	if (!fdir_queue->ind_table) {
-		ERROR("cannot create flow director indirection table");
-		goto error;
-	}
-	fdir_queue->qp = ibv_exp_create_qp(
-		priv->ctx,
-		&(struct ibv_exp_qp_init_attr){
-			.qp_type = IBV_QPT_RAW_PACKET,
-			.comp_mask =
-				IBV_EXP_QP_INIT_ATTR_PD |
-				IBV_EXP_QP_INIT_ATTR_PORT |
-				IBV_EXP_QP_INIT_ATTR_RX_HASH,
-			.pd = priv->pd,
-			.rx_hash_conf = &(struct ibv_exp_rx_hash_conf){
-				.rx_hash_function =
-					IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
-				.rx_hash_key_len = rss_hash_default_key_len,
-				.rx_hash_key = rss_hash_default_key,
-				.rx_hash_fields_mask = 0,
-				.rwq_ind_tbl = fdir_queue->ind_table,
-			},
-			.port_num = priv->port,
-		});
-	if (!fdir_queue->qp) {
-		ERROR("cannot create flow director hash RX QP");
-		goto error;
-	}
-	return fdir_queue;
-error:
-	assert(fdir_queue);
-	assert(!fdir_queue->qp);
-	if (fdir_queue->ind_table)
-		claim_zero(ibv_exp_destroy_rwq_ind_table
-			   (fdir_queue->ind_table));
-	if (fdir_queue->wq)
-		claim_zero(ibv_exp_destroy_wq(fdir_queue->wq));
-	if (fdir_queue->cq)
-		claim_zero(ibv_destroy_cq(fdir_queue->cq));
-	rte_free(fdir_queue);
-	return NULL;
-}
-
-/**
- * Get flow director queue for a specific RX queue, create it in case
- * it does not exist.
- *
- * @param priv
- *   Private structure.
- * @param idx
- *   RX queue index.
- *
- * @return
- *   Related flow director queue on success, NULL otherwise.
- */
-static struct fdir_queue *
-priv_get_fdir_queue(struct priv *priv, uint16_t idx)
-{
-	struct rxq_ctrl *rxq_ctrl =
-		container_of((*priv->rxqs)[idx], struct rxq_ctrl, rxq);
-	struct fdir_queue *fdir_queue = rxq_ctrl->fdir_queue;
-
-	assert(rxq_ctrl->wq);
-	if (fdir_queue == NULL) {
-		fdir_queue = priv_fdir_queue_create(priv, rxq_ctrl->wq,
-						    rxq_ctrl->socket);
-		rxq_ctrl->fdir_queue = fdir_queue;
-	}
-	return fdir_queue;
-}
-
-/**
- * Get or flow director drop queue. Create it if it does not exist.
- *
- * @param priv
- *   Private structure.
- *
- * @return
- *   Flow director drop queue on success, NULL otherwise.
- */
-static struct fdir_queue *
-priv_get_fdir_drop_queue(struct priv *priv)
-{
-	struct fdir_queue *fdir_queue = priv->fdir_drop_queue;
-
-	if (fdir_queue == NULL) {
-		unsigned int socket = SOCKET_ID_ANY;
-
-		/* Select a known NUMA socket if possible. */
-		if (priv->rxqs_n && (*priv->rxqs)[0])
-			socket = container_of((*priv->rxqs)[0],
-					      struct rxq_ctrl, rxq)->socket;
-		fdir_queue = priv_fdir_queue_create(priv, NULL, socket);
-		priv->fdir_drop_queue = fdir_queue;
-	}
-	return fdir_queue;
-}
-
-/**
- * Enable flow director filter and create steering rules.
- *
- * @param priv
- *   Private structure.
- * @param mlx5_fdir_filter
- *   Filter to create steering rule for.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-priv_fdir_filter_enable(struct priv *priv,
-			struct mlx5_fdir_filter *mlx5_fdir_filter)
-{
-	struct fdir_queue *fdir_queue;
-
-	/* Check if flow already exists. */
-	if (mlx5_fdir_filter->flow != NULL)
-		return 0;
-
-	/* Get fdir_queue for specific queue. */
-	if (mlx5_fdir_filter->behavior == RTE_ETH_FDIR_REJECT)
-		fdir_queue = priv_get_fdir_drop_queue(priv);
-	else
-		fdir_queue = priv_get_fdir_queue(priv,
-						 mlx5_fdir_filter->queue);
-
-	if (fdir_queue == NULL) {
-		ERROR("failed to create flow director rxq for queue %d",
-		      mlx5_fdir_filter->queue);
-		return EINVAL;
-	}
-
-	/* Create flow */
-	return priv_fdir_flow_add(priv, mlx5_fdir_filter, fdir_queue);
-}
-
-/**
- * Initialize flow director filters list.
- *
- * @param priv
- *   Private structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-fdir_init_filters_list(struct priv *priv)
-{
-	/* Filter list initialization should be done only once. */
-	if (priv->fdir_filter_list)
-		return 0;
-
-	/* Create filters list. */
-	priv->fdir_filter_list =
-		rte_calloc(__func__, 1, sizeof(*priv->fdir_filter_list), 0);
-
-	if (priv->fdir_filter_list == NULL) {
-		int err = ENOMEM;
-
-		ERROR("cannot allocate flow director filter list: %s",
-		      strerror(err));
-		return err;
-	}
-
-	LIST_INIT(priv->fdir_filter_list);
-
-	return 0;
-}
-
-/**
- * Flush all filters.
- *
- * @param priv
- *   Private structure.
- */
-static void
-priv_fdir_filter_flush(struct priv *priv)
-{
-	struct mlx5_fdir_filter *mlx5_fdir_filter;
-
-	while ((mlx5_fdir_filter = LIST_FIRST(priv->fdir_filter_list))) {
-		struct ibv_exp_flow *flow = mlx5_fdir_filter->flow;
-
-		DEBUG("%p: flushing flow director filter %p",
-		      (void *)priv, (void *)mlx5_fdir_filter);
-		LIST_REMOVE(mlx5_fdir_filter, next);
-		if (flow != NULL)
-			claim_zero(ibv_exp_destroy_flow(flow));
-		rte_free(mlx5_fdir_filter);
-	}
-}
-
-/**
- * Remove all flow director filters and delete list.
- *
- * @param priv
- *   Private structure.
- */
-void
-priv_fdir_delete_filters_list(struct priv *priv)
-{
-	priv_fdir_filter_flush(priv);
-	rte_free(priv->fdir_filter_list);
-	priv->fdir_filter_list = NULL;
-}
-
-/**
- * Disable flow director, remove all steering rules.
- *
- * @param priv
- *   Private structure.
- */
-void
-priv_fdir_disable(struct priv *priv)
-{
-	unsigned int i;
-	struct mlx5_fdir_filter *mlx5_fdir_filter;
-
-	/* Run on every flow director filter and destroy flow handle. */
-	LIST_FOREACH(mlx5_fdir_filter, priv->fdir_filter_list, next) {
-		struct ibv_exp_flow *flow;
-
-		/* Only valid elements should be in the list */
-		assert(mlx5_fdir_filter != NULL);
-		flow = mlx5_fdir_filter->flow;
-
-		/* Destroy flow handle */
-		if (flow != NULL) {
-			claim_zero(ibv_exp_destroy_flow(flow));
-			mlx5_fdir_filter->flow = NULL;
-		}
-	}
-
-	/* Destroy flow director context in each RX queue. */
-	for (i = 0; (i != priv->rxqs_n); i++) {
-		struct rxq_ctrl *rxq_ctrl;
-
-		if (!(*priv->rxqs)[i])
-			continue;
-		rxq_ctrl = container_of((*priv->rxqs)[i], struct rxq_ctrl, rxq);
-		if (!rxq_ctrl->fdir_queue)
-			continue;
-		priv_fdir_queue_destroy(priv, rxq_ctrl->fdir_queue);
-		rxq_ctrl->fdir_queue = NULL;
-	}
-	if (priv->fdir_drop_queue) {
-		priv_fdir_queue_destroy(priv, priv->fdir_drop_queue);
-		priv->fdir_drop_queue = NULL;
-	}
-}
-
-/**
- * Enable flow director, create steering rules.
- *
- * @param priv
- *   Private structure.
- */
-void
-priv_fdir_enable(struct priv *priv)
-{
-	struct mlx5_fdir_filter *mlx5_fdir_filter;
-
-	/* Run on every fdir filter and create flow handle */
-	LIST_FOREACH(mlx5_fdir_filter, priv->fdir_filter_list, next) {
-		/* Only valid elements should be in the list */
-		assert(mlx5_fdir_filter != NULL);
-
-		priv_fdir_filter_enable(priv, mlx5_fdir_filter);
-	}
-}
-
-/**
- * Find specific filter in list.
- *
- * @param priv
- *   Private structure.
- * @param fdir_filter
- *   Flow director filter to find.
- *
- * @return
- *   Filter element if found, otherwise NULL.
- */
-static struct mlx5_fdir_filter *
-priv_find_filter_in_list(struct priv *priv,
-			 const struct rte_eth_fdir_filter *fdir_filter)
-{
-	struct fdir_flow_desc desc;
-	struct mlx5_fdir_filter *mlx5_fdir_filter;
-	enum rte_fdir_mode fdir_mode = priv->dev->data->dev_conf.fdir_conf.mode;
-
-	/* Get flow director filter to look for. */
-	fdir_filter_to_flow_desc(fdir_filter, &desc, fdir_mode);
-
-	/* Look for the requested element. */
-	LIST_FOREACH(mlx5_fdir_filter, priv->fdir_filter_list, next) {
-		/* Only valid elements should be in the list. */
-		assert(mlx5_fdir_filter != NULL);
-
-		/* Return matching filter. */
-		if (!memcmp(&desc, &mlx5_fdir_filter->desc, sizeof(desc)))
-			return mlx5_fdir_filter;
-	}
-
-	/* Filter not found */
-	return NULL;
-}
-
-/**
- * Add new flow director filter and store it in list.
- *
- * @param priv
- *   Private structure.
- * @param fdir_filter
- *   Flow director filter to add.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-priv_fdir_filter_add(struct priv *priv,
-		     const struct rte_eth_fdir_filter *fdir_filter)
-{
-	struct mlx5_fdir_filter *mlx5_fdir_filter;
-	enum rte_fdir_mode fdir_mode = priv->dev->data->dev_conf.fdir_conf.mode;
-	int err = 0;
-
-	/* Validate queue number. */
-	if (fdir_filter->action.rx_queue >= priv->rxqs_n) {
-		ERROR("invalid queue number %d", fdir_filter->action.rx_queue);
-		return EINVAL;
-	}
-
-	/* Duplicate filters are currently unsupported. */
-	mlx5_fdir_filter = priv_find_filter_in_list(priv, fdir_filter);
-	if (mlx5_fdir_filter != NULL) {
-		ERROR("filter already exists");
-		return EINVAL;
-	}
-
-	/* Create new flow director filter. */
-	mlx5_fdir_filter =
-		rte_calloc(__func__, 1, sizeof(*mlx5_fdir_filter), 0);
-	if (mlx5_fdir_filter == NULL) {
-		err = ENOMEM;
-		ERROR("cannot allocate flow director filter: %s",
-		      strerror(err));
-		return err;
-	}
-
-	/* Set action parameters. */
-	mlx5_fdir_filter->queue = fdir_filter->action.rx_queue;
-	mlx5_fdir_filter->behavior = fdir_filter->action.behavior;
-
-	/* Convert to mlx5 filter descriptor. */
-	fdir_filter_to_flow_desc(fdir_filter,
-				 &mlx5_fdir_filter->desc, fdir_mode);
-
-	/* Insert new filter into list. */
-	LIST_INSERT_HEAD(priv->fdir_filter_list, mlx5_fdir_filter, next);
-
-	DEBUG("%p: flow director filter %p added",
-	      (void *)priv, (void *)mlx5_fdir_filter);
-
-	/* Enable filter immediately if device is started. */
-	if (priv->started)
-		err = priv_fdir_filter_enable(priv, mlx5_fdir_filter);
-
-	return err;
-}
-
-/**
- * Update queue for specific filter.
- *
- * @param priv
- *   Private structure.
- * @param fdir_filter
- *   Filter to be updated.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-priv_fdir_filter_update(struct priv *priv,
-			const struct rte_eth_fdir_filter *fdir_filter)
-{
-	struct mlx5_fdir_filter *mlx5_fdir_filter;
-
-	/* Validate queue number. */
-	if (fdir_filter->action.rx_queue >= priv->rxqs_n) {
-		ERROR("invalid queue number %d", fdir_filter->action.rx_queue);
-		return EINVAL;
-	}
-
-	mlx5_fdir_filter = priv_find_filter_in_list(priv, fdir_filter);
-	if (mlx5_fdir_filter != NULL) {
-		struct ibv_exp_flow *flow = mlx5_fdir_filter->flow;
-		int err = 0;
-
-		/* Update queue number. */
-		mlx5_fdir_filter->queue = fdir_filter->action.rx_queue;
-
-		/* Destroy flow handle. */
-		if (flow != NULL) {
-			claim_zero(ibv_exp_destroy_flow(flow));
-			mlx5_fdir_filter->flow = NULL;
-		}
-		DEBUG("%p: flow director filter %p updated",
-		      (void *)priv, (void *)mlx5_fdir_filter);
-
-		/* Enable filter if device is started. */
-		if (priv->started)
-			err = priv_fdir_filter_enable(priv, mlx5_fdir_filter);
-
-		return err;
-	}
-
-	/* Filter not found, create it. */
-	DEBUG("%p: filter not found for update, creating new filter",
-	      (void *)priv);
-	return priv_fdir_filter_add(priv, fdir_filter);
-}
-
-/**
- * Delete specific filter.
- *
- * @param priv
- *   Private structure.
- * @param fdir_filter
- *   Filter to be deleted.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-priv_fdir_filter_delete(struct priv *priv,
-			const struct rte_eth_fdir_filter *fdir_filter)
-{
-	struct mlx5_fdir_filter *mlx5_fdir_filter;
-
-	mlx5_fdir_filter = priv_find_filter_in_list(priv, fdir_filter);
-	if (mlx5_fdir_filter != NULL) {
-		struct ibv_exp_flow *flow = mlx5_fdir_filter->flow;
-
-		/* Remove element from list. */
-		LIST_REMOVE(mlx5_fdir_filter, next);
-
-		/* Destroy flow handle. */
-		if (flow != NULL) {
-			claim_zero(ibv_exp_destroy_flow(flow));
-			mlx5_fdir_filter->flow = NULL;
-		}
-
-		DEBUG("%p: flow director filter %p deleted",
-		      (void *)priv, (void *)mlx5_fdir_filter);
-
-		/* Delete filter. */
-		rte_free(mlx5_fdir_filter);
-
-		return 0;
-	}
-
-	ERROR("%p: flow director delete failed, cannot find filter",
-	      (void *)priv);
-	return EINVAL;
-}
-
-/**
- * Get flow director information.
- *
- * @param priv
- *   Private structure.
- * @param[out] fdir_info
- *   Resulting flow director information.
- */
-static void
-priv_fdir_info_get(struct priv *priv, struct rte_eth_fdir_info *fdir_info)
-{
-	struct rte_eth_fdir_masks *mask =
-		&priv->dev->data->dev_conf.fdir_conf.mask;
-
-	fdir_info->mode = priv->dev->data->dev_conf.fdir_conf.mode;
-	fdir_info->guarant_spc = 0;
-
-	rte_memcpy(&fdir_info->mask, mask, sizeof(fdir_info->mask));
-
-	fdir_info->max_flexpayload = 0;
-	fdir_info->flow_types_mask[0] = 0;
-
-	fdir_info->flex_payload_unit = 0;
-	fdir_info->max_flex_payload_segment_num = 0;
-	fdir_info->flex_payload_limit = 0;
-	memset(&fdir_info->flex_conf, 0, sizeof(fdir_info->flex_conf));
-}
-
-/**
- * Deal with flow director operations.
- *
- * @param priv
- *   Pointer to private structure.
- * @param filter_op
- *   Operation to perform.
- * @param arg
- *   Pointer to operation-specific structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-priv_fdir_ctrl_func(struct priv *priv, enum rte_filter_op filter_op, void *arg)
-{
-	enum rte_fdir_mode fdir_mode =
-		priv->dev->data->dev_conf.fdir_conf.mode;
-	int ret = 0;
-
-	if (filter_op == RTE_ETH_FILTER_NOP)
-		return 0;
-
-	if (fdir_mode != RTE_FDIR_MODE_PERFECT &&
-	    fdir_mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) {
-		ERROR("%p: flow director mode %d not supported",
-		      (void *)priv, fdir_mode);
-		return EINVAL;
-	}
-
-	switch (filter_op) {
-	case RTE_ETH_FILTER_ADD:
-		ret = priv_fdir_filter_add(priv, arg);
-		break;
-	case RTE_ETH_FILTER_UPDATE:
-		ret = priv_fdir_filter_update(priv, arg);
-		break;
-	case RTE_ETH_FILTER_DELETE:
-		ret = priv_fdir_filter_delete(priv, arg);
-		break;
-	case RTE_ETH_FILTER_FLUSH:
-		priv_fdir_filter_flush(priv);
-		break;
-	case RTE_ETH_FILTER_INFO:
-		priv_fdir_info_get(priv, arg);
-		break;
-	default:
-		DEBUG("%p: unknown operation %u", (void *)priv, filter_op);
-		ret = EINVAL;
-		break;
-	}
-	return ret;
-}
-
-static const struct rte_flow_ops mlx5_flow_ops = {
-	.validate = mlx5_flow_validate,
-	.create = mlx5_flow_create,
-	.destroy = mlx5_flow_destroy,
-	.flush = mlx5_flow_flush,
-	.query = NULL,
-	.isolate = mlx5_flow_isolate,
-};
-
-/**
- * Manage filter operations.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param filter_type
- *   Filter type.
- * @param filter_op
- *   Operation to perform.
- * @param arg
- *   Pointer to operation-specific structure.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-int
-mlx5_dev_filter_ctrl(struct rte_eth_dev *dev,
-		     enum rte_filter_type filter_type,
-		     enum rte_filter_op filter_op,
-		     void *arg)
-{
-	int ret = EINVAL;
-	struct priv *priv = dev->data->dev_private;
-
-	switch (filter_type) {
-	case RTE_ETH_FILTER_GENERIC:
-		if (filter_op != RTE_ETH_FILTER_GET)
-			return -EINVAL;
-		*(const void **)arg = &mlx5_flow_ops;
-		return 0;
-	case RTE_ETH_FILTER_FDIR:
-		priv_lock(priv);
-		ret = priv_fdir_ctrl_func(priv, filter_op, arg);
-		priv_unlock(priv);
-		break;
-	default:
-		ERROR("%p: filter type (%d) not supported",
-		      (void *)dev, filter_type);
-		break;
-	}
-
-	return -ret;
-}
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 86be9291..cd99cb07 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -52,13 +52,36 @@
 #include "mlx5.h"
 #include "mlx5_prm.h"
 
-/* Number of Work Queue necessary for the DROP queue. */
-#ifndef HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP
-#define MLX5_DROP_WQ_N 4
-#else
-#define MLX5_DROP_WQ_N 1
+/* Define minimal priority for control plane flows. */
+#define MLX5_CTRL_FLOW_PRIORITY 4
+
+/* Internet Protocol versions. */
+#define MLX5_IPV4 4
+#define MLX5_IPV6 6
+
+#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+struct ibv_counter_set_init_attr {
+	int dummy;
+};
+struct ibv_flow_spec_counter_action {
+	int dummy;
+};
+struct ibv_counter_set {
+	int dummy;
+};
+
+static inline int
+ibv_destroy_counter_set(struct ibv_counter_set *cs)
+{
+	(void)cs;
+	return -ENOTSUP;
+}
 #endif
 
+/* Dev ops structure defined in mlx5.c */
+extern const struct eth_dev_ops mlx5_dev_ops;
+extern const struct eth_dev_ops mlx5_dev_ops_isolate;
+
 static int
 mlx5_flow_create_eth(const struct rte_flow_item *item,
 		     const void *default_mask,
@@ -94,19 +117,144 @@ mlx5_flow_create_vxlan(const struct rte_flow_item *item,
 		       const void *default_mask,
 		       void *data);
 
-struct rte_flow {
-	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
-	struct ibv_exp_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */
-	struct ibv_exp_rwq_ind_table *ind_table; /**< Indirection table. */
+struct mlx5_flow_parse;
+
+static void
+mlx5_flow_create_copy(struct mlx5_flow_parse *parser, void *src,
+		      unsigned int size);
+
+static int
+mlx5_flow_create_flag_mark(struct mlx5_flow_parse *parser, uint32_t mark_id);
+
+static int
+mlx5_flow_create_count(struct priv *priv, struct mlx5_flow_parse *parser);
+
+/* Hash RX queue types. */
+enum hash_rxq_type {
+	HASH_RXQ_TCPV4,
+	HASH_RXQ_UDPV4,
+	HASH_RXQ_IPV4,
+	HASH_RXQ_TCPV6,
+	HASH_RXQ_UDPV6,
+	HASH_RXQ_IPV6,
+	HASH_RXQ_ETH,
+};
+
+/* Initialization data for hash RX queue. */
+struct hash_rxq_init {
+	uint64_t hash_fields; /* Fields that participate in the hash. */
+	uint64_t dpdk_rss_hf; /* Matching DPDK RSS hash fields. */
+	unsigned int flow_priority; /* Flow priority to use. */
+	unsigned int ip_version; /* Internet protocol. */
+};
+
+/* Initialization data for hash RX queues. */
+const struct hash_rxq_init hash_rxq_init[] = {
+	[HASH_RXQ_TCPV4] = {
+		.hash_fields = (IBV_RX_HASH_SRC_IPV4 |
+				IBV_RX_HASH_DST_IPV4 |
+				IBV_RX_HASH_SRC_PORT_TCP |
+				IBV_RX_HASH_DST_PORT_TCP),
+		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
+		.flow_priority = 0,
+		.ip_version = MLX5_IPV4,
+	},
+	[HASH_RXQ_UDPV4] = {
+		.hash_fields = (IBV_RX_HASH_SRC_IPV4 |
+				IBV_RX_HASH_DST_IPV4 |
+				IBV_RX_HASH_SRC_PORT_UDP |
+				IBV_RX_HASH_DST_PORT_UDP),
+		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
+		.flow_priority = 0,
+		.ip_version = MLX5_IPV4,
+	},
+	[HASH_RXQ_IPV4] = {
+		.hash_fields = (IBV_RX_HASH_SRC_IPV4 |
+				IBV_RX_HASH_DST_IPV4),
+		.dpdk_rss_hf = (ETH_RSS_IPV4 |
+				ETH_RSS_FRAG_IPV4),
+		.flow_priority = 1,
+		.ip_version = MLX5_IPV4,
+	},
+	[HASH_RXQ_TCPV6] = {
+		.hash_fields = (IBV_RX_HASH_SRC_IPV6 |
+				IBV_RX_HASH_DST_IPV6 |
+				IBV_RX_HASH_SRC_PORT_TCP |
+				IBV_RX_HASH_DST_PORT_TCP),
+		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
+		.flow_priority = 0,
+		.ip_version = MLX5_IPV6,
+	},
+	[HASH_RXQ_UDPV6] = {
+		.hash_fields = (IBV_RX_HASH_SRC_IPV6 |
+				IBV_RX_HASH_DST_IPV6 |
+				IBV_RX_HASH_SRC_PORT_UDP |
+				IBV_RX_HASH_DST_PORT_UDP),
+		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
+		.flow_priority = 0,
+		.ip_version = MLX5_IPV6,
+	},
+	[HASH_RXQ_IPV6] = {
+		.hash_fields = (IBV_RX_HASH_SRC_IPV6 |
+				IBV_RX_HASH_DST_IPV6),
+		.dpdk_rss_hf = (ETH_RSS_IPV6 |
+				ETH_RSS_FRAG_IPV6),
+		.flow_priority = 1,
+		.ip_version = MLX5_IPV6,
+	},
+	[HASH_RXQ_ETH] = {
+		.hash_fields = 0,
+		.dpdk_rss_hf = 0,
+		.flow_priority = 2,
+	},
+};
+
+/* Number of entries in hash_rxq_init[]. */
+const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
+
+/** Structure for holding counter stats. */
+struct mlx5_flow_counter_stats {
+	uint64_t hits; /**< Number of packets matched by the rule. */
+	uint64_t bytes; /**< Number of bytes matched by the rule. */
+};
+
+/** Structure for Drop queue. */
+struct mlx5_hrxq_drop {
+	struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */
 	struct ibv_qp *qp; /**< Verbs queue pair. */
-	struct ibv_exp_flow *ibv_flow; /**< Verbs flow. */
-	struct ibv_exp_wq *wq; /**< Verbs work queue. */
+	struct ibv_wq *wq; /**< Verbs work queue. */
 	struct ibv_cq *cq; /**< Verbs completion queue. */
-	uint16_t rxqs_n; /**< Number of queues in this flow, 0 if drop queue. */
+};
+
+/* Flows structures. */
+struct mlx5_flow {
+	uint64_t hash_fields; /**< Fields that participate in the hash. */
+	struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */
+	struct ibv_flow *ibv_flow; /**< Verbs flow. */
+	struct mlx5_hrxq *hrxq; /**< Hash Rx queues. */
+};
+
+/* Drop flows structures. */
+struct mlx5_flow_drop {
+	struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */
+	struct ibv_flow *ibv_flow; /**< Verbs flow. */
+};
+
+struct rte_flow {
+	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
 	uint32_t mark:1; /**< Set if the flow is marked. */
 	uint32_t drop:1; /**< Drop queue. */
-	uint64_t hash_fields; /**< Fields that participate in the hash. */
-	struct rxq *rxqs[]; /**< Pointer to the queues array. */
+	uint16_t queues_n; /**< Number of entries in queue[]. */
+	uint16_t (*queues)[]; /**< Queues indexes to use. */
+	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
+	uint8_t rss_key[40]; /**< copy of the RSS key. */
+	struct ibv_counter_set *cs; /**< Holds the counters for the rule. */
+	struct mlx5_flow_counter_stats counter_stats;/**<The counter stats. */
+	union {
+		struct mlx5_flow frxq[RTE_DIM(hash_rxq_init)];
+		/**< Flow with Rx queue. */
+		struct mlx5_flow_drop drxq; /**< Flow with drop Rx queue. */
+	};
 };
 
 /** Static initializer for items. */
@@ -157,6 +305,9 @@ static const enum rte_flow_action_type valid_actions[] = {
 	RTE_FLOW_ACTION_TYPE_QUEUE,
 	RTE_FLOW_ACTION_TYPE_MARK,
 	RTE_FLOW_ACTION_TYPE_FLAG,
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+	RTE_FLOW_ACTION_TYPE_COUNT,
+#endif
 	RTE_FLOW_ACTION_TYPE_END,
 };
 
@@ -179,7 +330,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.default_mask = &rte_flow_item_eth_mask,
 		.mask_sz = sizeof(struct rte_flow_item_eth),
 		.convert = mlx5_flow_create_eth,
-		.dst_sz = sizeof(struct ibv_exp_flow_spec_eth),
+		.dst_sz = sizeof(struct ibv_flow_spec_eth),
 	},
 	[RTE_FLOW_ITEM_TYPE_VLAN] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
@@ -208,7 +359,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.default_mask = &rte_flow_item_ipv4_mask,
 		.mask_sz = sizeof(struct rte_flow_item_ipv4),
 		.convert = mlx5_flow_create_ipv4,
-		.dst_sz = sizeof(struct ibv_exp_flow_spec_ipv4_ext),
+		.dst_sz = sizeof(struct ibv_flow_spec_ipv4_ext),
 	},
 	[RTE_FLOW_ITEM_TYPE_IPV6] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
@@ -236,7 +387,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.default_mask = &rte_flow_item_ipv6_mask,
 		.mask_sz = sizeof(struct rte_flow_item_ipv6),
 		.convert = mlx5_flow_create_ipv6,
-		.dst_sz = sizeof(struct ibv_exp_flow_spec_ipv6_ext),
+		.dst_sz = sizeof(struct ibv_flow_spec_ipv6),
 	},
 	[RTE_FLOW_ITEM_TYPE_UDP] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_VXLAN),
@@ -250,7 +401,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.default_mask = &rte_flow_item_udp_mask,
 		.mask_sz = sizeof(struct rte_flow_item_udp),
 		.convert = mlx5_flow_create_udp,
-		.dst_sz = sizeof(struct ibv_exp_flow_spec_tcp_udp),
+		.dst_sz = sizeof(struct ibv_flow_spec_tcp_udp),
 	},
 	[RTE_FLOW_ITEM_TYPE_TCP] = {
 		.actions = valid_actions,
@@ -263,7 +414,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.default_mask = &rte_flow_item_tcp_mask,
 		.mask_sz = sizeof(struct rte_flow_item_tcp),
 		.convert = mlx5_flow_create_tcp,
-		.dst_sz = sizeof(struct ibv_exp_flow_spec_tcp_udp),
+		.dst_sz = sizeof(struct ibv_flow_spec_tcp_udp),
 	},
 	[RTE_FLOW_ITEM_TYPE_VXLAN] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
@@ -274,33 +425,76 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.default_mask = &rte_flow_item_vxlan_mask,
 		.mask_sz = sizeof(struct rte_flow_item_vxlan),
 		.convert = mlx5_flow_create_vxlan,
-		.dst_sz = sizeof(struct ibv_exp_flow_spec_tunnel),
+		.dst_sz = sizeof(struct ibv_flow_spec_tunnel),
 	},
 };
 
 /** Structure to pass to the conversion function. */
-struct mlx5_flow {
-	struct ibv_exp_flow_attr *ibv_attr; /**< Verbs attribute. */
-	unsigned int offset; /**< Offset in bytes in the ibv_attr buffer. */
+struct mlx5_flow_parse {
 	uint32_t inner; /**< Set once VXLAN is encountered. */
-	uint64_t hash_fields; /**< Fields that participate in the hash. */
-};
-
-/** Structure for Drop queue. */
-struct rte_flow_drop {
-	struct ibv_exp_rwq_ind_table *ind_table; /**< Indirection table. */
-	struct ibv_qp *qp; /**< Verbs queue pair. */
-	struct ibv_exp_wq *wqs[MLX5_DROP_WQ_N]; /**< Verbs work queue. */
-	struct ibv_cq *cq; /**< Verbs completion queue. */
-};
-
-struct mlx5_flow_action {
-	uint32_t queue:1; /**< Target is a receive queue. */
+	uint32_t create:1;
+	/**< Whether resources should remain after a validate. */
 	uint32_t drop:1; /**< Target is a drop queue. */
 	uint32_t mark:1; /**< Mark is present in the flow. */
+	uint32_t count:1; /**< Count is present in the flow. */
 	uint32_t mark_id; /**< Mark identifier. */
 	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */
 	uint16_t queues_n; /**< Number of entries in queue[]. */
+	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
+	uint8_t rss_key[40]; /**< copy of the RSS key. */
+	enum hash_rxq_type layer; /**< Last pattern layer detected. */
+	struct ibv_counter_set *cs; /**< Holds the counter set for the rule */
+	union {
+		struct {
+			struct ibv_flow_attr *ibv_attr;
+			/**< Pointer to Verbs attributes. */
+			unsigned int offset;
+			/**< Current position or total size of the attribute. */
+		} queue[RTE_DIM(hash_rxq_init)];
+		struct {
+			struct ibv_flow_attr *ibv_attr;
+			/**< Pointer to Verbs attributes. */
+			unsigned int offset;
+			/**< Current position or total size of the attribute. */
+		} drop_q;
+	};
+};
+
+static const struct rte_flow_ops mlx5_flow_ops = {
+	.validate = mlx5_flow_validate,
+	.create = mlx5_flow_create,
+	.destroy = mlx5_flow_destroy,
+	.flush = mlx5_flow_flush,
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+	.query = mlx5_flow_query,
+#else
+	.query = NULL,
+#endif
+	.isolate = mlx5_flow_isolate,
+};
+
+/* Convert FDIR request to Generic flow. */
+struct mlx5_fdir {
+	struct rte_flow_attr attr;
+	struct rte_flow_action actions[2];
+	struct rte_flow_item items[4];
+	struct rte_flow_item_eth l2;
+	struct rte_flow_item_eth l2_mask;
+	union {
+		struct rte_flow_item_ipv4 ipv4;
+		struct rte_flow_item_ipv6 ipv6;
+	} l3;
+	union {
+		struct rte_flow_item_udp udp;
+		struct rte_flow_item_tcp tcp;
+	} l4;
+	struct rte_flow_action_queue queue;
+};
+
+/* Verbs specification header. */
+struct ibv_spec_header {
+	enum ibv_flow_spec_type type;
+	uint16_t size;
 };
 
 /**
@@ -367,38 +561,58 @@ mlx5_flow_item_validate(const struct rte_flow_item *item,
 }
 
 /**
- * Validate a flow supported by the NIC.
+ * Copy the RSS configuration from the user ones.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param parser
+ *   Internal parser structure.
+ * @param rss_conf
+ *   User RSS configuration to save.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_flow_convert_rss_conf(struct priv *priv,
+			   struct mlx5_flow_parse *parser,
+			   const struct rte_eth_rss_conf *rss_conf)
+{
+	const struct rte_eth_rss_conf *rss =
+		rss_conf ? rss_conf : &priv->rss_conf;
+
+	if (rss->rss_key_len > 40)
+		return EINVAL;
+	parser->rss_conf.rss_key_len = rss->rss_key_len;
+	parser->rss_conf.rss_hf = rss->rss_hf;
+	memcpy(parser->rss_key, rss->rss_key, rss->rss_key_len);
+	parser->rss_conf.rss_key = parser->rss_key;
+	return 0;
+}
+
+/**
+ * Extract attribute to the parser.
  *
  * @param priv
  *   Pointer to private structure.
  * @param[in] attr
  *   Flow rule attributes.
- * @param[in] pattern
- *   Pattern specification (list terminated by the END pattern item).
- * @param[in] actions
- *   Associated actions (list terminated by the END action).
  * @param[out] error
  *   Perform verbose error reporting if not NULL.
- * @param[in, out] flow
- *   Flow structure to update.
- * @param[in, out] action
- *   Action structure to update.
+ * @param[in, out] parser
+ *   Internal parser structure.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-priv_flow_validate(struct priv *priv,
-		   const struct rte_flow_attr *attr,
-		   const struct rte_flow_item items[],
-		   const struct rte_flow_action actions[],
-		   struct rte_flow_error *error,
-		   struct mlx5_flow *flow,
-		   struct mlx5_flow_action *action)
+priv_flow_convert_attributes(struct priv *priv,
+			     const struct rte_flow_attr *attr,
+			     struct rte_flow_error *error,
+			     struct mlx5_flow_parse *parser)
 {
-	const struct mlx5_flow_items *cur_item = mlx5_flow_items;
-
 	(void)priv;
+	(void)parser;
 	if (attr->group) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
@@ -406,7 +620,7 @@ priv_flow_validate(struct priv *priv,
 				   "groups are not supported");
 		return -rte_errno;
 	}
-	if (attr->priority) {
+	if (attr->priority && attr->priority != MLX5_CTRL_FLOW_PRIORITY) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
 				   NULL,
@@ -427,56 +641,42 @@ priv_flow_validate(struct priv *priv,
 				   "only ingress is supported");
 		return -rte_errno;
 	}
-	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
-		const struct mlx5_flow_items *token = NULL;
-		unsigned int i;
-		int err;
+	return 0;
+}
 
-		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
-			continue;
-		for (i = 0;
-		     cur_item->items &&
-		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
-		     ++i) {
-			if (cur_item->items[i] == items->type) {
-				token = &mlx5_flow_items[items->type];
-				break;
-			}
-		}
-		if (!token)
-			goto exit_item_not_supported;
-		cur_item = token;
-		err = mlx5_flow_item_validate(items,
-					      (const uint8_t *)cur_item->mask,
-					      cur_item->mask_sz);
-		if (err)
-			goto exit_item_not_supported;
-		if (flow->ibv_attr && cur_item->convert) {
-			err = cur_item->convert(items,
-						(cur_item->default_mask ?
-						 cur_item->default_mask :
-						 cur_item->mask),
-						flow);
-			if (err)
-				goto exit_item_not_supported;
-		} else if (items->type == RTE_FLOW_ITEM_TYPE_VXLAN) {
-			if (flow->inner) {
-				rte_flow_error_set(error, ENOTSUP,
-						   RTE_FLOW_ERROR_TYPE_ITEM,
-						   items,
-						   "cannot recognize multiple"
-						   " VXLAN encapsulations");
-				return -rte_errno;
-			}
-			flow->inner = 1;
-		}
-		flow->offset += cur_item->dst_sz;
-	}
+/**
+ * Extract actions request to the parser.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] parser
+ *   Internal parser structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_convert_actions(struct priv *priv,
+			  const struct rte_flow_action actions[],
+			  struct rte_flow_error *error,
+			  struct mlx5_flow_parse *parser)
+{
+	/*
+	 * Add default RSS configuration necessary for Verbs to create QP even
+	 * if no RSS is necessary.
+	 */
+	priv_flow_convert_rss_conf(priv, parser,
+				   (const struct rte_eth_rss_conf *)
+				   &priv->rss_conf);
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
-			action->drop = 1;
+			parser->drop = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
@@ -486,13 +686,13 @@ priv_flow_validate(struct priv *priv,
 
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			for (n = 0; n < action->queues_n; ++n) {
-				if (action->queues[n] == queue->index) {
+			for (n = 0; n < parser->queues_n; ++n) {
+				if (parser->queues[n] == queue->index) {
 					found = 1;
 					break;
 				}
 			}
-			if (action->queues_n > 1 && !found) {
+			if (parser->queues_n > 1 && !found) {
 				rte_flow_error_set(error, ENOTSUP,
 					   RTE_FLOW_ERROR_TYPE_ACTION,
 					   actions,
@@ -500,9 +700,8 @@ priv_flow_validate(struct priv *priv,
 				return -rte_errno;
 			}
 			if (!found) {
-				action->queue = 1;
-				action->queues_n = 1;
-				action->queues[0] = queue->index;
+				parser->queues_n = 1;
+				parser->queues[0] = queue->index;
 			}
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
@@ -517,12 +716,12 @@ priv_flow_validate(struct priv *priv,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (action->queues_n == 1) {
+			if (parser->queues_n == 1) {
 				uint16_t found = 0;
 
-				assert(action->queues_n);
+				assert(parser->queues_n);
 				for (n = 0; n < rss->num; ++n) {
-					if (action->queues[0] ==
+					if (parser->queues[0] ==
 					    rss->queue[n]) {
 						found = 1;
 						break;
@@ -547,10 +746,17 @@ priv_flow_validate(struct priv *priv,
 					return -rte_errno;
 				}
 			}
-			action->queue = 1;
 			for (n = 0; n < rss->num; ++n)
-				action->queues[n] = rss->queue[n];
-			action->queues_n = rss->num;
+				parser->queues[n] = rss->queue[n];
+			parser->queues_n = rss->num;
+			if (priv_flow_convert_rss_conf(priv, parser,
+						       rss->rss_conf)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "wrong RSS configuration");
+				return -rte_errno;
+			}
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
 			const struct rte_flow_action_mark *mark =
 				(const struct rte_flow_action_mark *)
@@ -570,30 +776,25 @@ priv_flow_validate(struct priv *priv,
 						   " and 16777199");
 				return -rte_errno;
 			}
-			action->mark = 1;
-			action->mark_id = mark->id;
+			parser->mark = 1;
+			parser->mark_id = mark->id;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) {
-			action->mark = 1;
+			parser->mark = 1;
+		} else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT &&
+			   priv->counter_set_supported) {
+			parser->count = 1;
 		} else {
 			goto exit_action_not_supported;
 		}
 	}
-	if (action->mark && !flow->ibv_attr && !action->drop)
-		flow->offset += sizeof(struct ibv_exp_flow_spec_action_tag);
-#ifdef HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP
-	if (!flow->ibv_attr && action->drop)
-		flow->offset += sizeof(struct ibv_exp_flow_spec_action_drop);
-#endif
-	if (!action->queue && !action->drop) {
+	if (parser->drop && parser->mark)
+		parser->mark = 0;
+	if (!parser->queues_n && !parser->drop) {
 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "no valid action");
 		return -rte_errno;
 	}
 	return 0;
-exit_item_not_supported:
-	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
-			   items, "item not supported");
-	return -rte_errno;
 exit_action_not_supported:
 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
 			   actions, "action not supported");
@@ -601,34 +802,467 @@ exit_action_not_supported:
 }
 
 /**
- * Validate a flow supported by the NIC.
+ * Validate items.
  *
- * @see rte_flow_validate()
- * @see rte_flow_ops
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] parser
+ *   Internal parser structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-int
-mlx5_flow_validate(struct rte_eth_dev *dev,
-		   const struct rte_flow_attr *attr,
-		   const struct rte_flow_item items[],
-		   const struct rte_flow_action actions[],
-		   struct rte_flow_error *error)
+static int
+priv_flow_convert_items_validate(struct priv *priv,
+				 const struct rte_flow_item items[],
+				 struct rte_flow_error *error,
+				 struct mlx5_flow_parse *parser)
 {
-	struct priv *priv = dev->data->dev_private;
+	const struct mlx5_flow_items *cur_item = mlx5_flow_items;
+	unsigned int i;
+
+	(void)priv;
+	/* Initialise the offsets to start after verbs attribute. */
+	if (parser->drop) {
+		parser->drop_q.offset = sizeof(struct ibv_flow_attr);
+	} else {
+		for (i = 0; i != hash_rxq_init_n; ++i)
+			parser->queue[i].offset = sizeof(struct ibv_flow_attr);
+	}
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		const struct mlx5_flow_items *token = NULL;
+		unsigned int n;
+		int err;
+
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		for (i = 0;
+		     cur_item->items &&
+		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
+		     ++i) {
+			if (cur_item->items[i] == items->type) {
+				token = &mlx5_flow_items[items->type];
+				break;
+			}
+		}
+		if (!token)
+			goto exit_item_not_supported;
+		cur_item = token;
+		err = mlx5_flow_item_validate(items,
+					      (const uint8_t *)cur_item->mask,
+					      cur_item->mask_sz);
+		if (err)
+			goto exit_item_not_supported;
+		if (items->type == RTE_FLOW_ITEM_TYPE_VXLAN) {
+			if (parser->inner) {
+				rte_flow_error_set(error, ENOTSUP,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   items,
+						   "cannot recognize multiple"
+						   " VXLAN encapsulations");
+				return -rte_errno;
+			}
+			parser->inner = IBV_FLOW_SPEC_INNER;
+		}
+		if (parser->drop) {
+			parser->drop_q.offset += cur_item->dst_sz;
+		} else if (parser->queues_n == 1) {
+			parser->queue[HASH_RXQ_ETH].offset += cur_item->dst_sz;
+		} else {
+			for (n = 0; n != hash_rxq_init_n; ++n)
+				parser->queue[n].offset += cur_item->dst_sz;
+		}
+	}
+	if (parser->mark) {
+		for (i = 0; i != hash_rxq_init_n; ++i)
+			parser->queue[i].offset +=
+				sizeof(struct ibv_flow_spec_action_tag);
+	}
+	if (parser->count) {
+		unsigned int size = sizeof(struct ibv_flow_spec_counter_action);
+
+		if (parser->drop) {
+			parser->drop_q.offset += size;
+		} else {
+			for (i = 0; i != hash_rxq_init_n; ++i)
+				parser->queue[i].offset += size;
+		}
+	}
+	return 0;
+exit_item_not_supported:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+			   items, "item not supported");
+	return -rte_errno;
+}
+
+/**
+ * Allocate memory space to store verbs flow attributes.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in] priority
+ *   Flow priority.
+ * @param[in] size
+ *   Amount of byte to allocate.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   A verbs flow attribute on success, NULL otherwise.
+ */
+static struct ibv_flow_attr*
+priv_flow_convert_allocate(struct priv *priv,
+			   unsigned int priority,
+			   unsigned int size,
+			   struct rte_flow_error *error)
+{
+	struct ibv_flow_attr *ibv_attr;
+
+	(void)priv;
+	ibv_attr = rte_calloc(__func__, 1, size, 0);
+	if (!ibv_attr) {
+		rte_flow_error_set(error, ENOMEM,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL,
+				   "cannot allocate verbs spec attributes.");
+		return NULL;
+	}
+	ibv_attr->priority = priority;
+	return ibv_attr;
+}
+
+/**
+ * Finalise verbs flow attributes.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in, out] parser
+ *   Internal parser structure.
+ */
+static void
+priv_flow_convert_finalise(struct priv *priv, struct mlx5_flow_parse *parser)
+{
+	const unsigned int ipv4 =
+		hash_rxq_init[parser->layer].ip_version == MLX5_IPV4;
+	const enum hash_rxq_type hmin = ipv4 ? HASH_RXQ_TCPV4 : HASH_RXQ_TCPV6;
+	const enum hash_rxq_type hmax = ipv4 ? HASH_RXQ_IPV4 : HASH_RXQ_IPV6;
+	const enum hash_rxq_type ohmin = ipv4 ? HASH_RXQ_TCPV6 : HASH_RXQ_TCPV4;
+	const enum hash_rxq_type ohmax = ipv4 ? HASH_RXQ_IPV6 : HASH_RXQ_IPV4;
+	const enum hash_rxq_type ip = ipv4 ? HASH_RXQ_IPV4 : HASH_RXQ_IPV6;
+	unsigned int i;
+
+	(void)priv;
+	if (parser->layer == HASH_RXQ_ETH) {
+		goto fill;
+	} else {
+		/*
+		 * This layer becomes useless as the pattern define under
+		 * layers.
+		 */
+		rte_free(parser->queue[HASH_RXQ_ETH].ibv_attr);
+		parser->queue[HASH_RXQ_ETH].ibv_attr = NULL;
+	}
+	/* Remove opposite kind of layer e.g. IPv6 if the pattern is IPv4. */
+	for (i = ohmin; i != (ohmax + 1); ++i) {
+		if (!parser->queue[i].ibv_attr)
+			continue;
+		rte_free(parser->queue[i].ibv_attr);
+		parser->queue[i].ibv_attr = NULL;
+	}
+	/* Remove impossible flow according to the RSS configuration. */
+	if (hash_rxq_init[parser->layer].dpdk_rss_hf &
+	    parser->rss_conf.rss_hf) {
+		/* Remove any other flow. */
+		for (i = hmin; i != (hmax + 1); ++i) {
+			if ((i == parser->layer) ||
+			     (!parser->queue[i].ibv_attr))
+				continue;
+			rte_free(parser->queue[i].ibv_attr);
+			parser->queue[i].ibv_attr = NULL;
+		}
+	} else  if (!parser->queue[ip].ibv_attr) {
+		/* no RSS possible with the current configuration. */
+		parser->queues_n = 1;
+		return;
+	}
+fill:
+	/*
+	 * Fill missing layers in verbs specifications, or compute the correct
+	 * offset to allocate the memory space for the attributes and
+	 * specifications.
+	 */
+	for (i = 0; i != hash_rxq_init_n - 1; ++i) {
+		union {
+			struct ibv_flow_spec_ipv4_ext ipv4;
+			struct ibv_flow_spec_ipv6 ipv6;
+			struct ibv_flow_spec_tcp_udp udp_tcp;
+		} specs;
+		void *dst;
+		uint16_t size;
+
+		if (i == parser->layer)
+			continue;
+		if (parser->layer == HASH_RXQ_ETH) {
+			if (hash_rxq_init[i].ip_version == MLX5_IPV4) {
+				size = sizeof(struct ibv_flow_spec_ipv4_ext);
+				specs.ipv4 = (struct ibv_flow_spec_ipv4_ext){
+					.type = IBV_FLOW_SPEC_IPV4_EXT,
+					.size = size,
+				};
+			} else {
+				size = sizeof(struct ibv_flow_spec_ipv6);
+				specs.ipv6 = (struct ibv_flow_spec_ipv6){
+					.type = IBV_FLOW_SPEC_IPV6,
+					.size = size,
+				};
+			}
+			if (parser->queue[i].ibv_attr) {
+				dst = (void *)((uintptr_t)
+					       parser->queue[i].ibv_attr +
+					       parser->queue[i].offset);
+				memcpy(dst, &specs, size);
+				++parser->queue[i].ibv_attr->num_of_specs;
+			}
+			parser->queue[i].offset += size;
+		}
+		if ((i == HASH_RXQ_UDPV4) || (i == HASH_RXQ_TCPV4) ||
+		    (i == HASH_RXQ_UDPV6) || (i == HASH_RXQ_TCPV6)) {
+			size = sizeof(struct ibv_flow_spec_tcp_udp);
+			specs.udp_tcp = (struct ibv_flow_spec_tcp_udp) {
+				.type = ((i == HASH_RXQ_UDPV4 ||
+					  i == HASH_RXQ_UDPV6) ?
+					 IBV_FLOW_SPEC_UDP :
+					 IBV_FLOW_SPEC_TCP),
+				.size = size,
+			};
+			if (parser->queue[i].ibv_attr) {
+				dst = (void *)((uintptr_t)
+					       parser->queue[i].ibv_attr +
+					       parser->queue[i].offset);
+				memcpy(dst, &specs, size);
+				++parser->queue[i].ibv_attr->num_of_specs;
+			}
+			parser->queue[i].offset += size;
+		}
+	}
+}
+
+/**
+ * Validate and convert a flow supported by the NIC.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] pattern
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ * @param[in, out] parser
+ *   Internal parser structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_convert(struct priv *priv,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error,
+		  struct mlx5_flow_parse *parser)
+{
+	const struct mlx5_flow_items *cur_item = mlx5_flow_items;
+	unsigned int i;
 	int ret;
-	struct mlx5_flow flow = { .offset = sizeof(struct ibv_exp_flow_attr) };
-	struct mlx5_flow_action action = {
-		.queue = 0,
-		.drop = 0,
-		.mark = 0,
+
+	/* First step. Validate the attributes, items and actions. */
+	*parser = (struct mlx5_flow_parse){
+		.create = parser->create,
+		.layer = HASH_RXQ_ETH,
 		.mark_id = MLX5_FLOW_MARK_DEFAULT,
-		.queues_n = 0,
 	};
-
-	priv_lock(priv);
-	ret = priv_flow_validate(priv, attr, items, actions, error, &flow,
-				 &action);
-	priv_unlock(priv);
+	ret = priv_flow_convert_attributes(priv, attr, error, parser);
+	if (ret)
+		return ret;
+	ret = priv_flow_convert_actions(priv, actions, error, parser);
+	if (ret)
+		return ret;
+	ret = priv_flow_convert_items_validate(priv, items, error, parser);
+	if (ret)
+		return ret;
+	priv_flow_convert_finalise(priv, parser);
+	/*
+	 * Second step.
+	 * Allocate the memory space to store verbs specifications.
+	 */
+	if (parser->drop) {
+		parser->drop_q.ibv_attr =
+			priv_flow_convert_allocate(priv, attr->priority,
+						   parser->drop_q.offset,
+						   error);
+		if (!parser->drop_q.ibv_attr)
+			return ENOMEM;
+		parser->drop_q.offset = sizeof(struct ibv_flow_attr);
+	} else if (parser->queues_n == 1) {
+		unsigned int priority =
+			attr->priority +
+			hash_rxq_init[HASH_RXQ_ETH].flow_priority;
+		unsigned int offset = parser->queue[HASH_RXQ_ETH].offset;
+
+		parser->queue[HASH_RXQ_ETH].ibv_attr =
+			priv_flow_convert_allocate(priv, priority,
+						   offset, error);
+		if (!parser->queue[HASH_RXQ_ETH].ibv_attr)
+			return ENOMEM;
+		parser->queue[HASH_RXQ_ETH].offset =
+			sizeof(struct ibv_flow_attr);
+	} else {
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			unsigned int priority =
+				attr->priority +
+				hash_rxq_init[i].flow_priority;
+			unsigned int offset;
+
+			if (!(parser->rss_conf.rss_hf &
+			      hash_rxq_init[i].dpdk_rss_hf) &&
+			    (i != HASH_RXQ_ETH))
+				continue;
+			offset = parser->queue[i].offset;
+			parser->queue[i].ibv_attr =
+				priv_flow_convert_allocate(priv, priority,
+							   offset, error);
+			if (!parser->queue[i].ibv_attr)
+				goto exit_enomem;
+			parser->queue[i].offset = sizeof(struct ibv_flow_attr);
+		}
+	}
+	/* Third step. Conversion parse, fill the specifications. */
+	parser->inner = 0;
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
+		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+			continue;
+		cur_item = &mlx5_flow_items[items->type];
+		ret = cur_item->convert(items,
+					(cur_item->default_mask ?
+					 cur_item->default_mask :
+					 cur_item->mask),
+					parser);
+		if (ret) {
+			rte_flow_error_set(error, ret,
+					   RTE_FLOW_ERROR_TYPE_ITEM,
+					   items, "item not supported");
+			goto exit_free;
+		}
+	}
+	if (parser->mark)
+		mlx5_flow_create_flag_mark(parser, parser->mark_id);
+	if (parser->count && parser->create) {
+		mlx5_flow_create_count(priv, parser);
+		if (!parser->cs)
+			goto exit_count_error;
+	}
+	/*
+	 * Last step. Complete missing specification to reach the RSS
+	 * configuration.
+	 */
+	if (parser->drop) {
+		/*
+		 * Drop queue priority needs to be adjusted to
+		 * their most specific layer priority.
+		 */
+		parser->drop_q.ibv_attr->priority =
+			attr->priority +
+			hash_rxq_init[parser->layer].flow_priority;
+	} else if (parser->queues_n > 1) {
+		priv_flow_convert_finalise(priv, parser);
+	} else {
+		/*
+		 * Action queue have their priority overridden with
+		 * Ethernet priority, this priority needs to be adjusted to
+		 * their most specific layer priority.
+		 */
+		parser->queue[HASH_RXQ_ETH].ibv_attr->priority =
+			attr->priority +
+			hash_rxq_init[parser->layer].flow_priority;
+	}
+exit_free:
+	/* Only verification is expected, all resources should be released. */
+	if (!parser->create) {
+		if (parser->drop) {
+			rte_free(parser->drop_q.ibv_attr);
+			parser->drop_q.ibv_attr = NULL;
+		}
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			if (parser->queue[i].ibv_attr) {
+				rte_free(parser->queue[i].ibv_attr);
+				parser->queue[i].ibv_attr = NULL;
+			}
+		}
+	}
 	return ret;
+exit_enomem:
+	for (i = 0; i != hash_rxq_init_n; ++i) {
+		if (parser->queue[i].ibv_attr) {
+			rte_free(parser->queue[i].ibv_attr);
+			parser->queue[i].ibv_attr = NULL;
+		}
+	}
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			   NULL, "cannot allocate verbs spec attributes.");
+	return ret;
+exit_count_error:
+	rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			   NULL, "cannot create counter.");
+	return rte_errno;
+}
+
+/**
+ * Copy the specification created into the flow.
+ *
+ * @param parser
+ *   Internal parser structure.
+ * @param src
+ *   Create specification.
+ * @param size
+ *   Size in bytes of the specification to copy.
+ */
+static void
+mlx5_flow_create_copy(struct mlx5_flow_parse *parser, void *src,
+		      unsigned int size)
+{
+	unsigned int i;
+	void *dst;
+
+	if (parser->drop) {
+		dst = (void *)((uintptr_t)parser->drop_q.ibv_attr +
+				parser->drop_q.offset);
+		memcpy(dst, src, size);
+		++parser->drop_q.ibv_attr->num_of_specs;
+		parser->drop_q.offset += size;
+		return;
+	}
+	for (i = 0; i != hash_rxq_init_n; ++i) {
+		if (!parser->queue[i].ibv_attr)
+			continue;
+		/* Specification must be the same l3 type or none. */
+		if (parser->layer == HASH_RXQ_ETH ||
+		    (hash_rxq_init[parser->layer].ip_version ==
+		     hash_rxq_init[i].ip_version) ||
+		    (hash_rxq_init[i].ip_version == 0)) {
+			dst = (void *)((uintptr_t)parser->queue[i].ibv_attr +
+					parser->queue[i].offset);
+			memcpy(dst, src, size);
+			++parser->queue[i].ibv_attr->num_of_specs;
+			parser->queue[i].offset += size;
+		}
+	}
 }
 
 /**
@@ -648,35 +1282,35 @@ mlx5_flow_create_eth(const struct rte_flow_item *item,
 {
 	const struct rte_flow_item_eth *spec = item->spec;
 	const struct rte_flow_item_eth *mask = item->mask;
-	struct mlx5_flow *flow = (struct mlx5_flow *)data;
-	struct ibv_exp_flow_spec_eth *eth;
-	const unsigned int eth_size = sizeof(struct ibv_exp_flow_spec_eth);
-	unsigned int i;
-
-	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 2;
-	flow->hash_fields = 0;
-	eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
-	*eth = (struct ibv_exp_flow_spec_eth) {
-		.type = flow->inner | IBV_EXP_FLOW_SPEC_ETH,
+	struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data;
+	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+	struct ibv_flow_spec_eth eth = {
+		.type = parser->inner | IBV_FLOW_SPEC_ETH,
 		.size = eth_size,
 	};
-	if (!spec)
-		return 0;
-	if (!mask)
-		mask = default_mask;
-	memcpy(eth->val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN);
-	memcpy(eth->val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN);
-	eth->val.ether_type = spec->type;
-	memcpy(eth->mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN);
-	memcpy(eth->mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN);
-	eth->mask.ether_type = mask->type;
-	/* Remove unwanted bits from values. */
-	for (i = 0; i < ETHER_ADDR_LEN; ++i) {
-		eth->val.dst_mac[i] &= eth->mask.dst_mac[i];
-		eth->val.src_mac[i] &= eth->mask.src_mac[i];
-	}
-	eth->val.ether_type &= eth->mask.ether_type;
+
+	/* Don't update layer for the inner pattern. */
+	if (!parser->inner)
+		parser->layer = HASH_RXQ_ETH;
+	if (spec) {
+		unsigned int i;
+
+		if (!mask)
+			mask = default_mask;
+		memcpy(&eth.val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN);
+		memcpy(&eth.val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN);
+		eth.val.ether_type = spec->type;
+		memcpy(&eth.mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN);
+		memcpy(&eth.mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN);
+		eth.mask.ether_type = mask->type;
+		/* Remove unwanted bits from values. */
+		for (i = 0; i < ETHER_ADDR_LEN; ++i) {
+			eth.val.dst_mac[i] &= eth.mask.dst_mac[i];
+			eth.val.src_mac[i] &= eth.mask.src_mac[i];
+		}
+		eth.val.ether_type &= eth.mask.ether_type;
+	}
+	mlx5_flow_create_copy(parser, &eth, eth_size);
 	return 0;
 }
 
@@ -697,18 +1331,34 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 {
 	const struct rte_flow_item_vlan *spec = item->spec;
 	const struct rte_flow_item_vlan *mask = item->mask;
-	struct mlx5_flow *flow = (struct mlx5_flow *)data;
-	struct ibv_exp_flow_spec_eth *eth;
-	const unsigned int eth_size = sizeof(struct ibv_exp_flow_spec_eth);
+	struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data;
+	struct ibv_flow_spec_eth *eth;
+	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
 
-	eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset - eth_size);
-	if (!spec)
-		return 0;
-	if (!mask)
-		mask = default_mask;
-	eth->val.vlan_tag = spec->tci;
-	eth->mask.vlan_tag = mask->tci;
-	eth->val.vlan_tag &= eth->mask.vlan_tag;
+	if (spec) {
+		unsigned int i;
+		if (!mask)
+			mask = default_mask;
+
+		if (parser->drop) {
+			eth = (void *)((uintptr_t)parser->drop_q.ibv_attr +
+				       parser->drop_q.offset - eth_size);
+			eth->val.vlan_tag = spec->tci;
+			eth->mask.vlan_tag = mask->tci;
+			eth->val.vlan_tag &= eth->mask.vlan_tag;
+			return 0;
+		}
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			if (!parser->queue[i].ibv_attr)
+				continue;
+
+			eth = (void *)((uintptr_t)parser->queue[i].ibv_attr +
+				       parser->queue[i].offset - eth_size);
+			eth->val.vlan_tag = spec->tci;
+			eth->mask.vlan_tag = mask->tci;
+			eth->val.vlan_tag &= eth->mask.vlan_tag;
+		}
+	}
 	return 0;
 }
 
@@ -729,40 +1379,38 @@ mlx5_flow_create_ipv4(const struct rte_flow_item *item,
 {
 	const struct rte_flow_item_ipv4 *spec = item->spec;
 	const struct rte_flow_item_ipv4 *mask = item->mask;
-	struct mlx5_flow *flow = (struct mlx5_flow *)data;
-	struct ibv_exp_flow_spec_ipv4_ext *ipv4;
-	unsigned int ipv4_size = sizeof(struct ibv_exp_flow_spec_ipv4_ext);
-
-	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 1;
-	flow->hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
-			     IBV_EXP_RX_HASH_DST_IPV4);
-	ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
-	*ipv4 = (struct ibv_exp_flow_spec_ipv4_ext) {
-		.type = flow->inner | IBV_EXP_FLOW_SPEC_IPV4_EXT,
+	struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data;
+	unsigned int ipv4_size = sizeof(struct ibv_flow_spec_ipv4_ext);
+	struct ibv_flow_spec_ipv4_ext ipv4 = {
+		.type = parser->inner | IBV_FLOW_SPEC_IPV4_EXT,
 		.size = ipv4_size,
 	};
-	if (!spec)
-		return 0;
-	if (!mask)
-		mask = default_mask;
-	ipv4->val = (struct ibv_exp_flow_ipv4_ext_filter){
-		.src_ip = spec->hdr.src_addr,
-		.dst_ip = spec->hdr.dst_addr,
-		.proto = spec->hdr.next_proto_id,
-		.tos = spec->hdr.type_of_service,
-	};
-	ipv4->mask = (struct ibv_exp_flow_ipv4_ext_filter){
-		.src_ip = mask->hdr.src_addr,
-		.dst_ip = mask->hdr.dst_addr,
-		.proto = mask->hdr.next_proto_id,
-		.tos = mask->hdr.type_of_service,
-	};
-	/* Remove unwanted bits from values. */
-	ipv4->val.src_ip &= ipv4->mask.src_ip;
-	ipv4->val.dst_ip &= ipv4->mask.dst_ip;
-	ipv4->val.proto &= ipv4->mask.proto;
-	ipv4->val.tos &= ipv4->mask.tos;
+
+	/* Don't update layer for the inner pattern. */
+	if (!parser->inner)
+		parser->layer = HASH_RXQ_IPV4;
+	if (spec) {
+		if (!mask)
+			mask = default_mask;
+		ipv4.val = (struct ibv_flow_ipv4_ext_filter){
+			.src_ip = spec->hdr.src_addr,
+			.dst_ip = spec->hdr.dst_addr,
+			.proto = spec->hdr.next_proto_id,
+			.tos = spec->hdr.type_of_service,
+		};
+		ipv4.mask = (struct ibv_flow_ipv4_ext_filter){
+			.src_ip = mask->hdr.src_addr,
+			.dst_ip = mask->hdr.dst_addr,
+			.proto = mask->hdr.next_proto_id,
+			.tos = mask->hdr.type_of_service,
+		};
+		/* Remove unwanted bits from values. */
+		ipv4.val.src_ip &= ipv4.mask.src_ip;
+		ipv4.val.dst_ip &= ipv4.mask.dst_ip;
+		ipv4.val.proto &= ipv4.mask.proto;
+		ipv4.val.tos &= ipv4.mask.tos;
+	}
+	mlx5_flow_create_copy(parser, &ipv4, ipv4_size);
 	return 0;
 }
 
@@ -783,43 +1431,42 @@ mlx5_flow_create_ipv6(const struct rte_flow_item *item,
 {
 	const struct rte_flow_item_ipv6 *spec = item->spec;
 	const struct rte_flow_item_ipv6 *mask = item->mask;
-	struct mlx5_flow *flow = (struct mlx5_flow *)data;
-	struct ibv_exp_flow_spec_ipv6_ext *ipv6;
-	unsigned int ipv6_size = sizeof(struct ibv_exp_flow_spec_ipv6_ext);
-	unsigned int i;
-
-	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 1;
-	flow->hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
-			     IBV_EXP_RX_HASH_DST_IPV6);
-	ipv6 = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
-	*ipv6 = (struct ibv_exp_flow_spec_ipv6_ext) {
-		.type = flow->inner | IBV_EXP_FLOW_SPEC_IPV6_EXT,
+	struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data;
+	unsigned int ipv6_size = sizeof(struct ibv_flow_spec_ipv6);
+	struct ibv_flow_spec_ipv6 ipv6 = {
+		.type = parser->inner | IBV_FLOW_SPEC_IPV6,
 		.size = ipv6_size,
 	};
-	if (!spec)
-		return 0;
-	if (!mask)
-		mask = default_mask;
-	memcpy(ipv6->val.src_ip, spec->hdr.src_addr,
-	       RTE_DIM(ipv6->val.src_ip));
-	memcpy(ipv6->val.dst_ip, spec->hdr.dst_addr,
-	       RTE_DIM(ipv6->val.dst_ip));
-	memcpy(ipv6->mask.src_ip, mask->hdr.src_addr,
-	       RTE_DIM(ipv6->mask.src_ip));
-	memcpy(ipv6->mask.dst_ip, mask->hdr.dst_addr,
-	       RTE_DIM(ipv6->mask.dst_ip));
-	ipv6->mask.flow_label = mask->hdr.vtc_flow;
-	ipv6->mask.next_hdr = mask->hdr.proto;
-	ipv6->mask.hop_limit = mask->hdr.hop_limits;
-	/* Remove unwanted bits from values. */
-	for (i = 0; i < RTE_DIM(ipv6->val.src_ip); ++i) {
-		ipv6->val.src_ip[i] &= ipv6->mask.src_ip[i];
-		ipv6->val.dst_ip[i] &= ipv6->mask.dst_ip[i];
-	}
-	ipv6->val.flow_label &= ipv6->mask.flow_label;
-	ipv6->val.next_hdr &= ipv6->mask.next_hdr;
-	ipv6->val.hop_limit &= ipv6->mask.hop_limit;
+
+	/* Don't update layer for the inner pattern. */
+	if (!parser->inner)
+		parser->layer = HASH_RXQ_IPV6;
+	if (spec) {
+		unsigned int i;
+
+		if (!mask)
+			mask = default_mask;
+		memcpy(&ipv6.val.src_ip, spec->hdr.src_addr,
+		       RTE_DIM(ipv6.val.src_ip));
+		memcpy(&ipv6.val.dst_ip, spec->hdr.dst_addr,
+		       RTE_DIM(ipv6.val.dst_ip));
+		memcpy(&ipv6.mask.src_ip, mask->hdr.src_addr,
+		       RTE_DIM(ipv6.mask.src_ip));
+		memcpy(&ipv6.mask.dst_ip, mask->hdr.dst_addr,
+		       RTE_DIM(ipv6.mask.dst_ip));
+		ipv6.mask.flow_label = mask->hdr.vtc_flow;
+		ipv6.mask.next_hdr = mask->hdr.proto;
+		ipv6.mask.hop_limit = mask->hdr.hop_limits;
+		/* Remove unwanted bits from values. */
+		for (i = 0; i < RTE_DIM(ipv6.val.src_ip); ++i) {
+			ipv6.val.src_ip[i] &= ipv6.mask.src_ip[i];
+			ipv6.val.dst_ip[i] &= ipv6.mask.dst_ip[i];
+		}
+		ipv6.val.flow_label &= ipv6.mask.flow_label;
+		ipv6.val.next_hdr &= ipv6.mask.next_hdr;
+		ipv6.val.hop_limit &= ipv6.mask.hop_limit;
+	}
+	mlx5_flow_create_copy(parser, &ipv6, ipv6_size);
 	return 0;
 }
 
@@ -840,30 +1487,32 @@ mlx5_flow_create_udp(const struct rte_flow_item *item,
 {
 	const struct rte_flow_item_udp *spec = item->spec;
 	const struct rte_flow_item_udp *mask = item->mask;
-	struct mlx5_flow *flow = (struct mlx5_flow *)data;
-	struct ibv_exp_flow_spec_tcp_udp *udp;
-	unsigned int udp_size = sizeof(struct ibv_exp_flow_spec_tcp_udp);
-
-	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 0;
-	flow->hash_fields |= (IBV_EXP_RX_HASH_SRC_PORT_UDP |
-			      IBV_EXP_RX_HASH_DST_PORT_UDP);
-	udp = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
-	*udp = (struct ibv_exp_flow_spec_tcp_udp) {
-		.type = flow->inner | IBV_EXP_FLOW_SPEC_UDP,
+	struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data;
+	unsigned int udp_size = sizeof(struct ibv_flow_spec_tcp_udp);
+	struct ibv_flow_spec_tcp_udp udp = {
+		.type = parser->inner | IBV_FLOW_SPEC_UDP,
 		.size = udp_size,
 	};
-	if (!spec)
-		return 0;
-	if (!mask)
-		mask = default_mask;
-	udp->val.dst_port = spec->hdr.dst_port;
-	udp->val.src_port = spec->hdr.src_port;
-	udp->mask.dst_port = mask->hdr.dst_port;
-	udp->mask.src_port = mask->hdr.src_port;
-	/* Remove unwanted bits from values. */
-	udp->val.src_port &= udp->mask.src_port;
-	udp->val.dst_port &= udp->mask.dst_port;
+
+	/* Don't update layer for the inner pattern. */
+	if (!parser->inner) {
+		if (parser->layer == HASH_RXQ_IPV4)
+			parser->layer = HASH_RXQ_UDPV4;
+		else
+			parser->layer = HASH_RXQ_UDPV6;
+	}
+	if (spec) {
+		if (!mask)
+			mask = default_mask;
+		udp.val.dst_port = spec->hdr.dst_port;
+		udp.val.src_port = spec->hdr.src_port;
+		udp.mask.dst_port = mask->hdr.dst_port;
+		udp.mask.src_port = mask->hdr.src_port;
+		/* Remove unwanted bits from values. */
+		udp.val.src_port &= udp.mask.src_port;
+		udp.val.dst_port &= udp.mask.dst_port;
+	}
+	mlx5_flow_create_copy(parser, &udp, udp_size);
 	return 0;
 }
 
@@ -884,30 +1533,32 @@ mlx5_flow_create_tcp(const struct rte_flow_item *item,
 {
 	const struct rte_flow_item_tcp *spec = item->spec;
 	const struct rte_flow_item_tcp *mask = item->mask;
-	struct mlx5_flow *flow = (struct mlx5_flow *)data;
-	struct ibv_exp_flow_spec_tcp_udp *tcp;
-	unsigned int tcp_size = sizeof(struct ibv_exp_flow_spec_tcp_udp);
-
-	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 0;
-	flow->hash_fields |= (IBV_EXP_RX_HASH_SRC_PORT_TCP |
-			      IBV_EXP_RX_HASH_DST_PORT_TCP);
-	tcp = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
-	*tcp = (struct ibv_exp_flow_spec_tcp_udp) {
-		.type = flow->inner | IBV_EXP_FLOW_SPEC_TCP,
+	struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data;
+	unsigned int tcp_size = sizeof(struct ibv_flow_spec_tcp_udp);
+	struct ibv_flow_spec_tcp_udp tcp = {
+		.type = parser->inner | IBV_FLOW_SPEC_TCP,
 		.size = tcp_size,
 	};
-	if (!spec)
-		return 0;
-	if (!mask)
-		mask = default_mask;
-	tcp->val.dst_port = spec->hdr.dst_port;
-	tcp->val.src_port = spec->hdr.src_port;
-	tcp->mask.dst_port = mask->hdr.dst_port;
-	tcp->mask.src_port = mask->hdr.src_port;
-	/* Remove unwanted bits from values. */
-	tcp->val.src_port &= tcp->mask.src_port;
-	tcp->val.dst_port &= tcp->mask.dst_port;
+
+	/* Don't update layer for the inner pattern. */
+	if (!parser->inner) {
+		if (parser->layer == HASH_RXQ_IPV4)
+			parser->layer = HASH_RXQ_TCPV4;
+		else
+			parser->layer = HASH_RXQ_TCPV6;
+	}
+	if (spec) {
+		if (!mask)
+			mask = default_mask;
+		tcp.val.dst_port = spec->hdr.dst_port;
+		tcp.val.src_port = spec->hdr.src_port;
+		tcp.mask.dst_port = mask->hdr.dst_port;
+		tcp.mask.src_port = mask->hdr.src_port;
+		/* Remove unwanted bits from values. */
+		tcp.val.src_port &= tcp.mask.src_port;
+		tcp.val.dst_port &= tcp.mask.dst_port;
+	}
+	mlx5_flow_create_copy(parser, &tcp, tcp_size);
 	return 0;
 }
 
@@ -928,57 +1579,97 @@ mlx5_flow_create_vxlan(const struct rte_flow_item *item,
 {
 	const struct rte_flow_item_vxlan *spec = item->spec;
 	const struct rte_flow_item_vxlan *mask = item->mask;
-	struct mlx5_flow *flow = (struct mlx5_flow *)data;
-	struct ibv_exp_flow_spec_tunnel *vxlan;
-	unsigned int size = sizeof(struct ibv_exp_flow_spec_tunnel);
+	struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data;
+	unsigned int size = sizeof(struct ibv_flow_spec_tunnel);
+	struct ibv_flow_spec_tunnel vxlan = {
+		.type = parser->inner | IBV_FLOW_SPEC_VXLAN_TUNNEL,
+		.size = size,
+	};
 	union vni {
 		uint32_t vlan_id;
 		uint8_t vni[4];
 	} id;
 
-	++flow->ibv_attr->num_of_specs;
-	flow->ibv_attr->priority = 0;
 	id.vni[0] = 0;
-	vxlan = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
-	*vxlan = (struct ibv_exp_flow_spec_tunnel) {
-		.type = flow->inner | IBV_EXP_FLOW_SPEC_VXLAN_TUNNEL,
-		.size = size,
-	};
-	flow->inner = IBV_EXP_FLOW_SPEC_INNER;
-	if (!spec)
-		return 0;
-	if (!mask)
-		mask = default_mask;
-	memcpy(&id.vni[1], spec->vni, 3);
-	vxlan->val.tunnel_id = id.vlan_id;
-	memcpy(&id.vni[1], mask->vni, 3);
-	vxlan->mask.tunnel_id = id.vlan_id;
-	/* Remove unwanted bits from values. */
-	vxlan->val.tunnel_id &= vxlan->mask.tunnel_id;
+	parser->inner = IBV_FLOW_SPEC_INNER;
+	if (spec) {
+		if (!mask)
+			mask = default_mask;
+		memcpy(&id.vni[1], spec->vni, 3);
+		vxlan.val.tunnel_id = id.vlan_id;
+		memcpy(&id.vni[1], mask->vni, 3);
+		vxlan.mask.tunnel_id = id.vlan_id;
+		/* Remove unwanted bits from values. */
+		vxlan.val.tunnel_id &= vxlan.mask.tunnel_id;
+	}
+	/*
+	 * Tunnel id 0 is equivalent as not adding a VXLAN layer, if only this
+	 * layer is defined in the Verbs specification it is interpreted as
+	 * wildcard and all packets will match this rule, if it follows a full
+	 * stack layer (ex: eth / ipv4 / udp), all packets matching the layers
+	 * before will also match this rule.
+	 * To avoid such situation, VNI 0 is currently refused.
+	 */
+	if (!vxlan.val.tunnel_id)
+		return EINVAL;
+	mlx5_flow_create_copy(parser, &vxlan, size);
 	return 0;
 }
 
 /**
  * Convert mark/flag action to Verbs specification.
  *
- * @param flow
- *   Pointer to MLX5 flow structure.
+ * @param parser
+ *   Internal parser structure.
  * @param mark_id
  *   Mark identifier.
  */
 static int
-mlx5_flow_create_flag_mark(struct mlx5_flow *flow, uint32_t mark_id)
+mlx5_flow_create_flag_mark(struct mlx5_flow_parse *parser, uint32_t mark_id)
 {
-	struct ibv_exp_flow_spec_action_tag *tag;
-	unsigned int size = sizeof(struct ibv_exp_flow_spec_action_tag);
-
-	tag = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
-	*tag = (struct ibv_exp_flow_spec_action_tag){
-		.type = IBV_EXP_FLOW_SPEC_ACTION_TAG,
+	unsigned int size = sizeof(struct ibv_flow_spec_action_tag);
+	struct ibv_flow_spec_action_tag tag = {
+		.type = IBV_FLOW_SPEC_ACTION_TAG,
 		.size = size,
 		.tag_id = mlx5_flow_mark_set(mark_id),
 	};
-	++flow->ibv_attr->num_of_specs;
+
+	assert(parser->mark);
+	mlx5_flow_create_copy(parser, &tag, size);
+	return 0;
+}
+
+/**
+ * Convert count action to Verbs specification.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param parser
+ *   Pointer to MLX5 flow parser structure.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+mlx5_flow_create_count(struct priv *priv __rte_unused,
+		       struct mlx5_flow_parse *parser __rte_unused)
+{
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+	unsigned int size = sizeof(struct ibv_flow_spec_counter_action);
+	struct ibv_counter_set_init_attr init_attr = {0};
+	struct ibv_flow_spec_counter_action counter = {
+		.type = IBV_FLOW_SPEC_ACTION_COUNT,
+		.size = size,
+		.counter_set_handle = 0,
+	};
+
+	init_attr.counter_set_id = 0;
+	parser->cs = ibv_create_counter_set(priv->ctx, &init_attr);
+	if (!parser->cs)
+		return EINVAL;
+	counter.counter_set_handle = parser->cs->handle;
+	mlx5_flow_create_copy(parser, &counter, size);
+#endif
 	return 0;
 }
 
@@ -987,59 +1678,127 @@ mlx5_flow_create_flag_mark(struct mlx5_flow *flow, uint32_t mark_id)
  *
  * @param priv
  *   Pointer to private structure.
+ * @param parser
+ *   Internal parser structure.
  * @param flow
- *   MLX5 flow attributes (filled by mlx5_flow_validate()).
+ *   Pointer to the rte_flow.
  * @param[out] error
  *   Perform verbose error reporting if not NULL.
  *
  * @return
- *   A flow if the rule could be created.
+ *   0 on success, errno value on failure.
  */
-static struct rte_flow *
+static int
 priv_flow_create_action_queue_drop(struct priv *priv,
-				   struct mlx5_flow *flow,
+				   struct mlx5_flow_parse *parser,
+				   struct rte_flow *flow,
 				   struct rte_flow_error *error)
 {
-	struct rte_flow *rte_flow;
-#ifdef HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP
-	struct ibv_exp_flow_spec_action_drop *drop;
-	unsigned int size = sizeof(struct ibv_exp_flow_spec_action_drop);
-#endif
+	struct ibv_flow_spec_action_drop *drop;
+	unsigned int size = sizeof(struct ibv_flow_spec_action_drop);
+	int err = 0;
 
 	assert(priv->pd);
 	assert(priv->ctx);
-	rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow), 0);
-	if (!rte_flow) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "cannot allocate flow memory");
-		return NULL;
-	}
-	rte_flow->drop = 1;
-#ifdef HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP
-	drop = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
-	*drop = (struct ibv_exp_flow_spec_action_drop){
-			.type = IBV_EXP_FLOW_SPEC_ACTION_DROP,
+	flow->drop = 1;
+	drop = (void *)((uintptr_t)parser->drop_q.ibv_attr +
+			parser->drop_q.offset);
+	*drop = (struct ibv_flow_spec_action_drop){
+			.type = IBV_FLOW_SPEC_ACTION_DROP,
 			.size = size,
 	};
-	++flow->ibv_attr->num_of_specs;
-	flow->offset += sizeof(struct ibv_exp_flow_spec_action_drop);
-#endif
-	rte_flow->ibv_attr = flow->ibv_attr;
-	if (!priv->started)
-		return rte_flow;
-	rte_flow->qp = priv->flow_drop_queue->qp;
-	rte_flow->ibv_flow = ibv_exp_create_flow(rte_flow->qp,
-						 rte_flow->ibv_attr);
-	if (!rte_flow->ibv_flow) {
+	++parser->drop_q.ibv_attr->num_of_specs;
+	parser->drop_q.offset += size;
+	flow->drxq.ibv_attr = parser->drop_q.ibv_attr;
+	if (parser->count)
+		flow->cs = parser->cs;
+	if (!priv->dev->data->dev_started)
+		return 0;
+	parser->drop_q.ibv_attr = NULL;
+	flow->drxq.ibv_flow = ibv_create_flow(priv->flow_drop_queue->qp,
+					      flow->drxq.ibv_attr);
+	if (!flow->drxq.ibv_flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "flow rule creation failure");
+		err = ENOMEM;
 		goto error;
 	}
-	return rte_flow;
+	return 0;
 error:
-	assert(rte_flow);
-	rte_free(rte_flow);
-	return NULL;
+	assert(flow);
+	if (flow->drxq.ibv_flow) {
+		claim_zero(ibv_destroy_flow(flow->drxq.ibv_flow));
+		flow->drxq.ibv_flow = NULL;
+	}
+	if (flow->drxq.ibv_attr) {
+		rte_free(flow->drxq.ibv_attr);
+		flow->drxq.ibv_attr = NULL;
+	}
+	if (flow->cs) {
+		claim_zero(ibv_destroy_counter_set(flow->cs));
+		flow->cs = NULL;
+		parser->cs = NULL;
+	}
+	return err;
+}
+
+/**
+ * Create hash Rx queues when RSS is enabled.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param parser
+ *   Internal parser structure.
+ * @param flow
+ *   Pointer to the rte_flow.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_create_action_queue_rss(struct priv *priv,
+				  struct mlx5_flow_parse *parser,
+				  struct rte_flow *flow,
+				  struct rte_flow_error *error)
+{
+	unsigned int i;
+
+	for (i = 0; i != hash_rxq_init_n; ++i) {
+		uint64_t hash_fields;
+
+		if (!parser->queue[i].ibv_attr)
+			continue;
+		flow->frxq[i].ibv_attr = parser->queue[i].ibv_attr;
+		parser->queue[i].ibv_attr = NULL;
+		hash_fields = hash_rxq_init[i].hash_fields;
+		if (!priv->dev->data->dev_started)
+			continue;
+		flow->frxq[i].hrxq =
+			mlx5_priv_hrxq_get(priv,
+					   parser->rss_conf.rss_key,
+					   parser->rss_conf.rss_key_len,
+					   hash_fields,
+					   parser->queues,
+					   parser->queues_n);
+		if (flow->frxq[i].hrxq)
+			continue;
+		flow->frxq[i].hrxq =
+			mlx5_priv_hrxq_new(priv,
+					   parser->rss_conf.rss_key,
+					   parser->rss_conf.rss_key_len,
+					   hash_fields,
+					   parser->queues,
+					   parser->queues_n);
+		if (!flow->frxq[i].hrxq) {
+			rte_flow_error_set(error, ENOMEM,
+					   RTE_FLOW_ERROR_TYPE_HANDLE,
+					   NULL, "cannot create hash rxq");
+			return ENOMEM;
+		}
+	}
+	return 0;
 }
 
 /**
@@ -1047,112 +1806,79 @@ error:
  *
  * @param priv
  *   Pointer to private structure.
+ * @param parser
+ *   Internal parser structure.
  * @param flow
- *   MLX5 flow attributes (filled by mlx5_flow_validate()).
- * @param action
- *   Target action structure.
+ *   Pointer to the rte_flow.
  * @param[out] error
  *   Perform verbose error reporting if not NULL.
  *
  * @return
- *   A flow if the rule could be created.
+ *   0 on success, a errno value otherwise and rte_errno is set.
  */
-static struct rte_flow *
+static int
 priv_flow_create_action_queue(struct priv *priv,
-			      struct mlx5_flow *flow,
-			      struct mlx5_flow_action *action,
+			      struct mlx5_flow_parse *parser,
+			      struct rte_flow *flow,
 			      struct rte_flow_error *error)
 {
-	struct rte_flow *rte_flow;
+	int err = 0;
 	unsigned int i;
-	unsigned int j;
-	const unsigned int wqs_n = 1 << log2above(action->queues_n);
-	struct ibv_exp_wq *wqs[wqs_n];
 
 	assert(priv->pd);
 	assert(priv->ctx);
-	assert(!action->drop);
-	rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow) +
-			      sizeof(*rte_flow->rxqs) * action->queues_n, 0);
-	if (!rte_flow) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "cannot allocate flow memory");
-		return NULL;
-	}
-	for (i = 0; i < action->queues_n; ++i) {
-		struct rxq_ctrl *rxq;
-
-		rxq = container_of((*priv->rxqs)[action->queues[i]],
-				   struct rxq_ctrl, rxq);
-		wqs[i] = rxq->wq;
-		rte_flow->rxqs[i] = &rxq->rxq;
-		++rte_flow->rxqs_n;
-		rxq->rxq.mark |= action->mark;
-	}
-	/* finalise indirection table. */
-	for (j = 0; i < wqs_n; ++i, ++j) {
-		wqs[i] = wqs[j];
-		if (j == action->queues_n)
-			j = 0;
-	}
-	rte_flow->mark = action->mark;
-	rte_flow->ibv_attr = flow->ibv_attr;
-	rte_flow->hash_fields = flow->hash_fields;
-	rte_flow->ind_table = ibv_exp_create_rwq_ind_table(
-		priv->ctx,
-		&(struct ibv_exp_rwq_ind_table_init_attr){
-			.pd = priv->pd,
-			.log_ind_tbl_size = log2above(action->queues_n),
-			.ind_tbl = wqs,
-			.comp_mask = 0,
-		});
-	if (!rte_flow->ind_table) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "cannot allocate indirection table");
-		goto error;
-	}
-	rte_flow->qp = ibv_exp_create_qp(
-		priv->ctx,
-		&(struct ibv_exp_qp_init_attr){
-			.qp_type = IBV_QPT_RAW_PACKET,
-			.comp_mask =
-				IBV_EXP_QP_INIT_ATTR_PD |
-				IBV_EXP_QP_INIT_ATTR_PORT |
-				IBV_EXP_QP_INIT_ATTR_RX_HASH,
-			.pd = priv->pd,
-			.rx_hash_conf = &(struct ibv_exp_rx_hash_conf){
-				.rx_hash_function =
-					IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
-				.rx_hash_key_len = rss_hash_default_key_len,
-				.rx_hash_key = rss_hash_default_key,
-				.rx_hash_fields_mask = rte_flow->hash_fields,
-				.rwq_ind_tbl = rte_flow->ind_table,
-			},
-			.port_num = priv->port,
-		});
-	if (!rte_flow->qp) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "cannot allocate QP");
+	assert(!parser->drop);
+	err = priv_flow_create_action_queue_rss(priv, parser, flow, error);
+	if (err)
 		goto error;
+	if (parser->count)
+		flow->cs = parser->cs;
+	if (!priv->dev->data->dev_started)
+		return 0;
+	for (i = 0; i != hash_rxq_init_n; ++i) {
+		if (!flow->frxq[i].hrxq)
+			continue;
+		flow->frxq[i].ibv_flow =
+			ibv_create_flow(flow->frxq[i].hrxq->qp,
+					flow->frxq[i].ibv_attr);
+		if (!flow->frxq[i].ibv_flow) {
+			rte_flow_error_set(error, ENOMEM,
+					   RTE_FLOW_ERROR_TYPE_HANDLE,
+					   NULL, "flow rule creation failure");
+			err = ENOMEM;
+			goto error;
+		}
+		DEBUG("%p type %d QP %p ibv_flow %p",
+		      (void *)flow, i,
+		      (void *)flow->frxq[i].hrxq,
+		      (void *)flow->frxq[i].ibv_flow);
 	}
-	if (!priv->started)
-		return rte_flow;
-	rte_flow->ibv_flow = ibv_exp_create_flow(rte_flow->qp,
-						 rte_flow->ibv_attr);
-	if (!rte_flow->ibv_flow) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "flow rule creation failure");
-		goto error;
+	for (i = 0; i != parser->queues_n; ++i) {
+		struct mlx5_rxq_data *q =
+			(*priv->rxqs)[parser->queues[i]];
+
+		q->mark |= parser->mark;
 	}
-	return rte_flow;
+	return 0;
 error:
-	assert(rte_flow);
-	if (rte_flow->qp)
-		ibv_destroy_qp(rte_flow->qp);
-	if (rte_flow->ind_table)
-		ibv_exp_destroy_rwq_ind_table(rte_flow->ind_table);
-	rte_free(rte_flow);
-	return NULL;
+	assert(flow);
+	for (i = 0; i != hash_rxq_init_n; ++i) {
+		if (flow->frxq[i].ibv_flow) {
+			struct ibv_flow *ibv_flow = flow->frxq[i].ibv_flow;
+
+			claim_zero(ibv_destroy_flow(ibv_flow));
+		}
+		if (flow->frxq[i].hrxq)
+			mlx5_priv_hrxq_release(priv, flow->frxq[i].hrxq);
+		if (flow->frxq[i].ibv_attr)
+			rte_free(flow->frxq[i].ibv_attr);
+	}
+	if (flow->cs) {
+		claim_zero(ibv_destroy_counter_set(flow->cs));
+		flow->cs = NULL;
+		parser->cs = NULL;
+	}
+	return err;
 }
 
 /**
@@ -1160,6 +1886,8 @@ error:
  *
  * @param priv
  *   Pointer to private structure.
+ * @param list
+ *   Pointer to a TAILQ flow list.
  * @param[in] attr
  *   Flow rule attributes.
  * @param[in] pattern
@@ -1174,65 +1902,86 @@ error:
  */
 static struct rte_flow *
 priv_flow_create(struct priv *priv,
+		 struct mlx5_flows *list,
 		 const struct rte_flow_attr *attr,
 		 const struct rte_flow_item items[],
 		 const struct rte_flow_action actions[],
 		 struct rte_flow_error *error)
 {
-	struct rte_flow *rte_flow;
-	struct mlx5_flow flow = { .offset = sizeof(struct ibv_exp_flow_attr), };
-	struct mlx5_flow_action action = {
-		.queue = 0,
-		.drop = 0,
-		.mark = 0,
-		.mark_id = MLX5_FLOW_MARK_DEFAULT,
-		.queues_n = 0,
-	};
+	struct mlx5_flow_parse parser = { .create = 1, };
+	struct rte_flow *flow = NULL;
+	unsigned int i;
 	int err;
 
-	err = priv_flow_validate(priv, attr, items, actions, error, &flow,
-				 &action);
+	err = priv_flow_convert(priv, attr, items, actions, error, &parser);
 	if (err)
 		goto exit;
-	flow.ibv_attr = rte_malloc(__func__, flow.offset, 0);
-	flow.offset = sizeof(struct ibv_exp_flow_attr);
-	if (!flow.ibv_attr) {
-		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
-				   NULL, "cannot allocate ibv_attr memory");
-		goto exit;
+	flow = rte_calloc(__func__, 1,
+			  sizeof(*flow) + parser.queues_n * sizeof(uint16_t),
+			  0);
+	if (!flow) {
+		rte_flow_error_set(error, ENOMEM,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL,
+				   "cannot allocate flow memory");
+		return NULL;
 	}
-	*flow.ibv_attr = (struct ibv_exp_flow_attr){
-		.type = IBV_EXP_FLOW_ATTR_NORMAL,
-		.size = sizeof(struct ibv_exp_flow_attr),
-		.priority = attr->priority,
-		.num_of_specs = 0,
-		.port = 0,
-		.flags = 0,
-		.reserved = 0,
-	};
-	flow.inner = 0;
-	flow.hash_fields = 0;
-	claim_zero(priv_flow_validate(priv, attr, items, actions,
-				      error, &flow, &action));
-	if (action.mark && !action.drop) {
-		mlx5_flow_create_flag_mark(&flow, action.mark_id);
-		flow.offset += sizeof(struct ibv_exp_flow_spec_action_tag);
-	}
-	if (action.drop)
-		rte_flow =
-			priv_flow_create_action_queue_drop(priv, &flow, error);
-	else
-		rte_flow = priv_flow_create_action_queue(priv, &flow, &action,
+	/* Copy queues configuration. */
+	flow->queues = (uint16_t (*)[])(flow + 1);
+	memcpy(flow->queues, parser.queues, parser.queues_n * sizeof(uint16_t));
+	flow->queues_n = parser.queues_n;
+	/* Copy RSS configuration. */
+	flow->rss_conf = parser.rss_conf;
+	flow->rss_conf.rss_key = flow->rss_key;
+	memcpy(flow->rss_key, parser.rss_key, parser.rss_conf.rss_key_len);
+	/* finalise the flow. */
+	if (parser.drop)
+		err = priv_flow_create_action_queue_drop(priv, &parser, flow,
 							 error);
-	if (!rte_flow)
+	else
+		err = priv_flow_create_action_queue(priv, &parser, flow, error);
+	if (err)
 		goto exit;
-	return rte_flow;
+	TAILQ_INSERT_TAIL(list, flow, next);
+	DEBUG("Flow created %p", (void *)flow);
+	return flow;
 exit:
-	rte_free(flow.ibv_attr);
+	if (parser.drop) {
+		rte_free(parser.drop_q.ibv_attr);
+	} else {
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			if (parser.queue[i].ibv_attr)
+				rte_free(parser.queue[i].ibv_attr);
+		}
+	}
+	rte_free(flow);
 	return NULL;
 }
 
 /**
+ * Validate a flow supported by the NIC.
+ *
+ * @see rte_flow_validate()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_validate(struct rte_eth_dev *dev,
+		   const struct rte_flow_attr *attr,
+		   const struct rte_flow_item items[],
+		   const struct rte_flow_action actions[],
+		   struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+	int ret;
+	struct mlx5_flow_parse parser = { .create = 0, };
+
+	priv_lock(priv);
+	ret = priv_flow_convert(priv, attr, items, actions, error, &parser);
+	priv_unlock(priv);
+	return ret;
+}
+
+/**
  * Create a flow.
  *
  * @see rte_flow_create()
@@ -1249,11 +1998,8 @@ mlx5_flow_create(struct rte_eth_dev *dev,
 	struct rte_flow *flow;
 
 	priv_lock(priv);
-	flow = priv_flow_create(priv, attr, items, actions, error);
-	if (flow) {
-		TAILQ_INSERT_TAIL(&priv->flows, flow, next);
-		DEBUG("Flow created %p", (void *)flow);
-	}
+	flow = priv_flow_create(priv, &priv->flows, attr, items, actions,
+				error);
 	priv_unlock(priv);
 	return flow;
 }
@@ -1263,121 +2009,95 @@ mlx5_flow_create(struct rte_eth_dev *dev,
  *
  * @param priv
  *   Pointer to private structure.
+ * @param list
+ *   Pointer to a TAILQ flow list.
  * @param[in] flow
  *   Flow to destroy.
  */
 static void
 priv_flow_destroy(struct priv *priv,
+		  struct mlx5_flows *list,
 		  struct rte_flow *flow)
 {
-	TAILQ_REMOVE(&priv->flows, flow, next);
-	if (flow->ibv_flow)
-		claim_zero(ibv_exp_destroy_flow(flow->ibv_flow));
-	if (flow->drop)
+	unsigned int i;
+
+	if (flow->drop || !flow->mark)
 		goto free;
-	if (flow->qp)
-		claim_zero(ibv_destroy_qp(flow->qp));
-	if (flow->ind_table)
-		claim_zero(ibv_exp_destroy_rwq_ind_table(flow->ind_table));
-	if (flow->drop && flow->wq)
-		claim_zero(ibv_exp_destroy_wq(flow->wq));
-	if (flow->drop && flow->cq)
-		claim_zero(ibv_destroy_cq(flow->cq));
-	if (flow->mark) {
+	for (i = 0; i != flow->queues_n; ++i) {
 		struct rte_flow *tmp;
-		struct rxq *rxq;
-		uint32_t mark_n = 0;
-		uint32_t queue_n;
+		int mark = 0;
 
 		/*
 		 * To remove the mark from the queue, the queue must not be
 		 * present in any other marked flow (RSS or not).
 		 */
-		for (queue_n = 0; queue_n < flow->rxqs_n; ++queue_n) {
-			rxq = flow->rxqs[queue_n];
-			for (tmp = TAILQ_FIRST(&priv->flows);
-			     tmp;
-			     tmp = TAILQ_NEXT(tmp, next)) {
-				uint32_t tqueue_n;
-
-				if (tmp->drop)
+		TAILQ_FOREACH(tmp, list, next) {
+			unsigned int j;
+			uint16_t *tqs = NULL;
+			uint16_t tq_n = 0;
+
+			if (!tmp->mark)
+				continue;
+			for (j = 0; j != hash_rxq_init_n; ++j) {
+				if (!tmp->frxq[j].hrxq)
 					continue;
-				for (tqueue_n = 0;
-				     tqueue_n < tmp->rxqs_n;
-				     ++tqueue_n) {
-					struct rxq *trxq;
-
-					trxq = tmp->rxqs[tqueue_n];
-					if (rxq == trxq)
-						++mark_n;
-				}
+				tqs = tmp->frxq[j].hrxq->ind_table->queues;
+				tq_n = tmp->frxq[j].hrxq->ind_table->queues_n;
 			}
-			rxq->mark = !!mark_n;
+			if (!tq_n)
+				continue;
+			for (j = 0; (j != tq_n) && !mark; j++)
+				if (tqs[j] == (*flow->queues)[i])
+					mark = 1;
 		}
+		(*priv->rxqs)[(*flow->queues)[i]]->mark = mark;
 	}
 free:
-	rte_free(flow->ibv_attr);
+	if (flow->drop) {
+		if (flow->drxq.ibv_flow)
+			claim_zero(ibv_destroy_flow(flow->drxq.ibv_flow));
+		rte_free(flow->drxq.ibv_attr);
+	} else {
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			struct mlx5_flow *frxq = &flow->frxq[i];
+
+			if (frxq->ibv_flow)
+				claim_zero(ibv_destroy_flow(frxq->ibv_flow));
+			if (frxq->hrxq)
+				mlx5_priv_hrxq_release(priv, frxq->hrxq);
+			if (frxq->ibv_attr)
+				rte_free(frxq->ibv_attr);
+		}
+	}
+	if (flow->cs) {
+		claim_zero(ibv_destroy_counter_set(flow->cs));
+		flow->cs = NULL;
+	}
+	TAILQ_REMOVE(list, flow, next);
 	DEBUG("Flow destroyed %p", (void *)flow);
 	rte_free(flow);
 }
 
 /**
- * Destroy a flow.
- *
- * @see rte_flow_destroy()
- * @see rte_flow_ops
- */
-int
-mlx5_flow_destroy(struct rte_eth_dev *dev,
-		  struct rte_flow *flow,
-		  struct rte_flow_error *error)
-{
-	struct priv *priv = dev->data->dev_private;
-
-	(void)error;
-	priv_lock(priv);
-	priv_flow_destroy(priv, flow);
-	priv_unlock(priv);
-	return 0;
-}
-
-/**
  * Destroy all flows.
  *
  * @param priv
  *   Pointer to private structure.
+ * @param list
+ *   Pointer to a TAILQ flow list.
  */
-static void
-priv_flow_flush(struct priv *priv)
+void
+priv_flow_flush(struct priv *priv, struct mlx5_flows *list)
 {
-	while (!TAILQ_EMPTY(&priv->flows)) {
+	while (!TAILQ_EMPTY(list)) {
 		struct rte_flow *flow;
 
-		flow = TAILQ_FIRST(&priv->flows);
-		priv_flow_destroy(priv, flow);
+		flow = TAILQ_FIRST(list);
+		priv_flow_destroy(priv, list, flow);
 	}
 }
 
 /**
- * Destroy all flows.
- *
- * @see rte_flow_flush()
- * @see rte_flow_ops
- */
-int
-mlx5_flow_flush(struct rte_eth_dev *dev,
-		struct rte_flow_error *error)
-{
-	struct priv *priv = dev->data->dev_private;
-
-	(void)error;
-	priv_lock(priv);
-	priv_flow_flush(priv);
-	priv_unlock(priv);
-	return 0;
-}
-
-/**
  * Create drop queue.
  *
  * @param priv
@@ -1386,11 +2106,10 @@ mlx5_flow_flush(struct rte_eth_dev *dev,
  * @return
  *   0 on success.
  */
-static int
+int
 priv_flow_create_drop_queue(struct priv *priv)
 {
-	struct rte_flow_drop *fdq = NULL;
-	unsigned int i;
+	struct mlx5_hrxq_drop *fdq = NULL;
 
 	assert(priv->pd);
 	assert(priv->ctx);
@@ -1399,57 +2118,50 @@ priv_flow_create_drop_queue(struct priv *priv)
 		WARN("cannot allocate memory for drop queue");
 		goto error;
 	}
-	fdq->cq = ibv_exp_create_cq(priv->ctx, 1, NULL, NULL, 0,
-			&(struct ibv_exp_cq_init_attr){
-			.comp_mask = 0,
-			});
+	fdq->cq = ibv_create_cq(priv->ctx, 1, NULL, NULL, 0);
 	if (!fdq->cq) {
 		WARN("cannot allocate CQ for drop queue");
 		goto error;
 	}
-	for (i = 0; i != MLX5_DROP_WQ_N; ++i) {
-		fdq->wqs[i] = ibv_exp_create_wq(priv->ctx,
-				&(struct ibv_exp_wq_init_attr){
-				.wq_type = IBV_EXP_WQT_RQ,
-				.max_recv_wr = 1,
-				.max_recv_sge = 1,
-				.pd = priv->pd,
-				.cq = fdq->cq,
-				});
-		if (!fdq->wqs[i]) {
-			WARN("cannot allocate WQ for drop queue");
-			goto error;
-		}
-	}
-	fdq->ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
-			&(struct ibv_exp_rwq_ind_table_init_attr){
+	fdq->wq = ibv_create_wq(priv->ctx,
+			&(struct ibv_wq_init_attr){
+			.wq_type = IBV_WQT_RQ,
+			.max_wr = 1,
+			.max_sge = 1,
 			.pd = priv->pd,
+			.cq = fdq->cq,
+			});
+	if (!fdq->wq) {
+		WARN("cannot allocate WQ for drop queue");
+		goto error;
+	}
+	fdq->ind_table = ibv_create_rwq_ind_table(priv->ctx,
+			&(struct ibv_rwq_ind_table_init_attr){
 			.log_ind_tbl_size = 0,
-			.ind_tbl = fdq->wqs,
+			.ind_tbl = &fdq->wq,
 			.comp_mask = 0,
 			});
 	if (!fdq->ind_table) {
 		WARN("cannot allocate indirection table for drop queue");
 		goto error;
 	}
-	fdq->qp = ibv_exp_create_qp(priv->ctx,
-		&(struct ibv_exp_qp_init_attr){
+	fdq->qp = ibv_create_qp_ex(priv->ctx,
+		&(struct ibv_qp_init_attr_ex){
 			.qp_type = IBV_QPT_RAW_PACKET,
 			.comp_mask =
-				IBV_EXP_QP_INIT_ATTR_PD |
-				IBV_EXP_QP_INIT_ATTR_PORT |
-				IBV_EXP_QP_INIT_ATTR_RX_HASH,
-			.pd = priv->pd,
-			.rx_hash_conf = &(struct ibv_exp_rx_hash_conf){
+				IBV_QP_INIT_ATTR_PD |
+				IBV_QP_INIT_ATTR_IND_TABLE |
+				IBV_QP_INIT_ATTR_RX_HASH,
+			.rx_hash_conf = (struct ibv_rx_hash_conf){
 				.rx_hash_function =
-					IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
+					IBV_RX_HASH_FUNC_TOEPLITZ,
 				.rx_hash_key_len = rss_hash_default_key_len,
 				.rx_hash_key = rss_hash_default_key,
 				.rx_hash_fields_mask = 0,
-				.rwq_ind_tbl = fdq->ind_table,
 				},
-			.port_num = priv->port,
-			});
+			.rwq_ind_tbl = fdq->ind_table,
+			.pd = priv->pd
+		});
 	if (!fdq->qp) {
 		WARN("cannot allocate QP for drop queue");
 		goto error;
@@ -1460,11 +2172,9 @@ error:
 	if (fdq->qp)
 		claim_zero(ibv_destroy_qp(fdq->qp));
 	if (fdq->ind_table)
-		claim_zero(ibv_exp_destroy_rwq_ind_table(fdq->ind_table));
-	for (i = 0; i != MLX5_DROP_WQ_N; ++i) {
-		if (fdq->wqs[i])
-			claim_zero(ibv_exp_destroy_wq(fdq->wqs[i]));
-	}
+		claim_zero(ibv_destroy_rwq_ind_table(fdq->ind_table));
+	if (fdq->wq)
+		claim_zero(ibv_destroy_wq(fdq->wq));
 	if (fdq->cq)
 		claim_zero(ibv_destroy_cq(fdq->cq));
 	if (fdq)
@@ -1479,22 +2189,19 @@ error:
  * @param priv
  *   Pointer to private structure.
  */
-static void
+void
 priv_flow_delete_drop_queue(struct priv *priv)
 {
-	struct rte_flow_drop *fdq = priv->flow_drop_queue;
-	unsigned int i;
+	struct mlx5_hrxq_drop *fdq = priv->flow_drop_queue;
 
 	if (!fdq)
 		return;
 	if (fdq->qp)
 		claim_zero(ibv_destroy_qp(fdq->qp));
 	if (fdq->ind_table)
-		claim_zero(ibv_exp_destroy_rwq_ind_table(fdq->ind_table));
-	for (i = 0; i != MLX5_DROP_WQ_N; ++i) {
-		if (fdq->wqs[i])
-			claim_zero(ibv_exp_destroy_wq(fdq->wqs[i]));
-	}
+		claim_zero(ibv_destroy_rwq_ind_table(fdq->ind_table));
+	if (fdq->wq)
+		claim_zero(ibv_destroy_wq(fdq->wq));
 	if (fdq->cq)
 		claim_zero(ibv_destroy_cq(fdq->cq));
 	rte_free(fdq);
@@ -1504,28 +2211,49 @@ priv_flow_delete_drop_queue(struct priv *priv)
 /**
  * Remove all flows.
  *
- * Called by dev_stop() to remove all flows.
- *
  * @param priv
  *   Pointer to private structure.
+ * @param list
+ *   Pointer to a TAILQ flow list.
  */
 void
-priv_flow_stop(struct priv *priv)
+priv_flow_stop(struct priv *priv, struct mlx5_flows *list)
 {
 	struct rte_flow *flow;
 
-	TAILQ_FOREACH_REVERSE(flow, &priv->flows, mlx5_flows, next) {
-		claim_zero(ibv_exp_destroy_flow(flow->ibv_flow));
-		flow->ibv_flow = NULL;
+	TAILQ_FOREACH_REVERSE(flow, list, mlx5_flows, next) {
+		unsigned int i;
+
+		if (flow->drop) {
+			if (!flow->drxq.ibv_flow)
+				continue;
+			claim_zero(ibv_destroy_flow(flow->drxq.ibv_flow));
+			flow->drxq.ibv_flow = NULL;
+			/* Next flow. */
+			continue;
+		}
 		if (flow->mark) {
-			unsigned int n;
+			struct mlx5_ind_table_ibv *ind_tbl = NULL;
 
-			for (n = 0; n < flow->rxqs_n; ++n)
-				flow->rxqs[n]->mark = 0;
+			for (i = 0; i != hash_rxq_init_n; ++i) {
+				if (!flow->frxq[i].hrxq)
+					continue;
+				ind_tbl = flow->frxq[i].hrxq->ind_table;
+			}
+			assert(ind_tbl);
+			for (i = 0; i != ind_tbl->queues_n; ++i)
+				(*priv->rxqs)[ind_tbl->queues[i]]->mark = 0;
+		}
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			if (!flow->frxq[i].ibv_flow)
+				continue;
+			claim_zero(ibv_destroy_flow(flow->frxq[i].ibv_flow));
+			flow->frxq[i].ibv_flow = NULL;
+			mlx5_priv_hrxq_release(priv, flow->frxq[i].hrxq);
+			flow->frxq[i].hrxq = NULL;
 		}
 		DEBUG("Flow %p removed", (void *)flow);
 	}
-	priv_flow_delete_drop_queue(priv);
 }
 
 /**
@@ -1533,75 +2261,321 @@ priv_flow_stop(struct priv *priv)
  *
  * @param priv
  *   Pointer to private structure.
+ * @param list
+ *   Pointer to a TAILQ flow list.
  *
  * @return
  *   0 on success, a errno value otherwise and rte_errno is set.
  */
 int
-priv_flow_start(struct priv *priv)
+priv_flow_start(struct priv *priv, struct mlx5_flows *list)
 {
-	int ret;
 	struct rte_flow *flow;
 
-	ret = priv_flow_create_drop_queue(priv);
-	if (ret)
-		return -1;
-	TAILQ_FOREACH(flow, &priv->flows, next) {
-		struct ibv_qp *qp;
+	TAILQ_FOREACH(flow, list, next) {
+		unsigned int i;
 
-		if (flow->drop)
-			qp = priv->flow_drop_queue->qp;
-		else
-			qp = flow->qp;
-		flow->ibv_flow = ibv_exp_create_flow(qp, flow->ibv_attr);
-		if (!flow->ibv_flow) {
-			DEBUG("Flow %p cannot be applied", (void *)flow);
-			rte_errno = EINVAL;
-			return rte_errno;
+		if (flow->drop) {
+			flow->drxq.ibv_flow =
+				ibv_create_flow(priv->flow_drop_queue->qp,
+						flow->drxq.ibv_attr);
+			if (!flow->drxq.ibv_flow) {
+				DEBUG("Flow %p cannot be applied",
+				      (void *)flow);
+				rte_errno = EINVAL;
+				return rte_errno;
+			}
+			DEBUG("Flow %p applied", (void *)flow);
+			/* Next flow. */
+			continue;
 		}
-		DEBUG("Flow %p applied", (void *)flow);
-		if (flow->mark) {
-			unsigned int n;
-
-			for (n = 0; n < flow->rxqs_n; ++n)
-				flow->rxqs[n]->mark = 1;
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			if (!flow->frxq[i].ibv_attr)
+				continue;
+			flow->frxq[i].hrxq =
+				mlx5_priv_hrxq_get(priv, flow->rss_conf.rss_key,
+						   flow->rss_conf.rss_key_len,
+						   hash_rxq_init[i].hash_fields,
+						   (*flow->queues),
+						   flow->queues_n);
+			if (flow->frxq[i].hrxq)
+				goto flow_create;
+			flow->frxq[i].hrxq =
+				mlx5_priv_hrxq_new(priv, flow->rss_conf.rss_key,
+						   flow->rss_conf.rss_key_len,
+						   hash_rxq_init[i].hash_fields,
+						   (*flow->queues),
+						   flow->queues_n);
+			if (!flow->frxq[i].hrxq) {
+				DEBUG("Flow %p cannot be applied",
+				      (void *)flow);
+				rte_errno = EINVAL;
+				return rte_errno;
+			}
+flow_create:
+			flow->frxq[i].ibv_flow =
+				ibv_create_flow(flow->frxq[i].hrxq->qp,
+						flow->frxq[i].ibv_attr);
+			if (!flow->frxq[i].ibv_flow) {
+				DEBUG("Flow %p cannot be applied",
+				      (void *)flow);
+				rte_errno = EINVAL;
+				return rte_errno;
+			}
+			DEBUG("Flow %p applied", (void *)flow);
 		}
+		if (!flow->mark)
+			continue;
+		for (i = 0; i != flow->queues_n; ++i)
+			(*priv->rxqs)[(*flow->queues)[i]]->mark = 1;
 	}
 	return 0;
 }
 
 /**
- * Verify if the Rx queue is used in a flow.
+ * Verify the flow list is empty
  *
  * @param priv
- *   Pointer to private structure.
- * @param rxq
- *   Pointer to the queue to search.
+ *  Pointer to private structure.
+ *
+ * @return the number of flows not released.
+ */
+int
+priv_flow_verify(struct priv *priv)
+{
+	struct rte_flow *flow;
+	int ret = 0;
+
+	TAILQ_FOREACH(flow, &priv->flows, next) {
+		DEBUG("%p: flow %p still referenced", (void *)priv,
+		      (void *)flow);
+		++ret;
+	}
+	return ret;
+}
+
+/**
+ * Enable a control flow configured from the control plane.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param eth_spec
+ *   An Ethernet flow spec to apply.
+ * @param eth_mask
+ *   An Ethernet flow mask to apply.
+ * @param vlan_spec
+ *   A VLAN flow spec to apply.
+ * @param vlan_mask
+ *   A VLAN flow mask to apply.
  *
  * @return
- *   Nonzero if the queue is used by a flow.
+ *   0 on success.
  */
 int
-priv_flow_rxq_in_use(struct priv *priv, struct rxq *rxq)
+mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
+		    struct rte_flow_item_eth *eth_spec,
+		    struct rte_flow_item_eth *eth_mask,
+		    struct rte_flow_item_vlan *vlan_spec,
+		    struct rte_flow_item_vlan *vlan_mask)
 {
+	struct priv *priv = dev->data->dev_private;
+	const struct rte_flow_attr attr = {
+		.ingress = 1,
+		.priority = MLX5_CTRL_FLOW_PRIORITY,
+	};
+	struct rte_flow_item items[] = {
+		{
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+			.spec = eth_spec,
+			.last = NULL,
+			.mask = eth_mask,
+		},
+		{
+			.type = (vlan_spec) ? RTE_FLOW_ITEM_TYPE_VLAN :
+				RTE_FLOW_ITEM_TYPE_END,
+			.spec = vlan_spec,
+			.last = NULL,
+			.mask = vlan_mask,
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+	};
+	struct rte_flow_action actions[] = {
+		{
+			.type = RTE_FLOW_ACTION_TYPE_RSS,
+		},
+		{
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
 	struct rte_flow *flow;
+	struct rte_flow_error error;
+	unsigned int i;
+	union {
+		struct rte_flow_action_rss rss;
+		struct {
+			const struct rte_eth_rss_conf *rss_conf;
+			uint16_t num;
+			uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
+		} local;
+	} action_rss;
+
+	if (!priv->reta_idx_n)
+		return EINVAL;
+	for (i = 0; i != priv->reta_idx_n; ++i)
+		action_rss.local.queue[i] = (*priv->reta_idx)[i];
+	action_rss.local.rss_conf = &priv->rss_conf;
+	action_rss.local.num = priv->reta_idx_n;
+	actions[0].conf = (const void *)&action_rss.rss;
+	flow = priv_flow_create(priv, &priv->ctrl_flows, &attr, items, actions,
+				&error);
+	if (!flow)
+		return rte_errno;
+	return 0;
+}
 
-	for (flow = TAILQ_FIRST(&priv->flows);
-	     flow;
-	     flow = TAILQ_NEXT(flow, next)) {
-		unsigned int n;
+/**
+ * Enable a flow control configured from the control plane.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param eth_spec
+ *   An Ethernet flow spec to apply.
+ * @param eth_mask
+ *   An Ethernet flow mask to apply.
+ *
+ * @return
+ *   0 on success.
+ */
+int
+mlx5_ctrl_flow(struct rte_eth_dev *dev,
+	       struct rte_flow_item_eth *eth_spec,
+	       struct rte_flow_item_eth *eth_mask)
+{
+	return mlx5_ctrl_flow_vlan(dev, eth_spec, eth_mask, NULL, NULL);
+}
 
-		if (flow->drop)
-			continue;
-		for (n = 0; n < flow->rxqs_n; ++n) {
-			if (flow->rxqs[n] == rxq)
-				return 1;
-		}
+/**
+ * Destroy a flow.
+ *
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_destroy(struct rte_eth_dev *dev,
+		  struct rte_flow *flow,
+		  struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+
+	(void)error;
+	priv_lock(priv);
+	priv_flow_destroy(priv, &priv->flows, flow);
+	priv_unlock(priv);
+	return 0;
+}
+
+/**
+ * Destroy all flows.
+ *
+ * @see rte_flow_flush()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_flush(struct rte_eth_dev *dev,
+		struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+
+	(void)error;
+	priv_lock(priv);
+	priv_flow_flush(priv, &priv->flows);
+	priv_unlock(priv);
+	return 0;
+}
+
+#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
+/**
+ * Query flow counter.
+ *
+ * @param cs
+ *   the counter set.
+ * @param counter_value
+ *   returned data from the counter.
+ *
+ * @return
+ *   0 on success, a errno value otherwise and rte_errno is set.
+ */
+static int
+priv_flow_query_count(struct ibv_counter_set *cs,
+		      struct mlx5_flow_counter_stats *counter_stats,
+		      struct rte_flow_query_count *query_count,
+		      struct rte_flow_error *error)
+{
+	uint64_t counters[2];
+	struct ibv_query_counter_set_attr query_cs_attr = {
+		.cs = cs,
+		.query_flags = IBV_COUNTER_SET_FORCE_UPDATE,
+	};
+	struct ibv_counter_set_data query_out = {
+		.out = counters,
+		.outlen = 2 * sizeof(uint64_t),
+	};
+	int res = ibv_query_counter_set(&query_cs_attr, &query_out);
+
+	if (res) {
+		rte_flow_error_set(error, -res,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL,
+				   "cannot read counter");
+		return -res;
+	}
+	query_count->hits_set = 1;
+	query_count->bytes_set = 1;
+	query_count->hits = counters[0] - counter_stats->hits;
+	query_count->bytes = counters[1] - counter_stats->bytes;
+	if (query_count->reset) {
+		counter_stats->hits = counters[0];
+		counter_stats->bytes = counters[1];
 	}
 	return 0;
 }
 
 /**
+ * Query a flows.
+ *
+ * @see rte_flow_query()
+ * @see rte_flow_ops
+ */
+int
+mlx5_flow_query(struct rte_eth_dev *dev,
+		struct rte_flow *flow,
+		enum rte_flow_action_type action __rte_unused,
+		void *data,
+		struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+	int res = EINVAL;
+
+	priv_lock(priv);
+	if (flow->cs) {
+		res = priv_flow_query_count(flow->cs,
+					&flow->counter_stats,
+					(struct rte_flow_query_count *)data,
+					error);
+	} else {
+		rte_flow_error_set(error, res,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL,
+				   "no counter found for flow");
+	}
+	priv_unlock(priv);
+	return -res;
+}
+#endif
+
+/**
  * Isolated mode.
  *
  * @see rte_flow_isolate()
@@ -1615,7 +2589,7 @@ mlx5_flow_isolate(struct rte_eth_dev *dev,
 	struct priv *priv = dev->data->dev_private;
 
 	priv_lock(priv);
-	if (priv->started) {
+	if (dev->data->dev_started) {
 		rte_flow_error_set(error, EBUSY,
 				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
 				   NULL,
@@ -1624,6 +2598,497 @@ mlx5_flow_isolate(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 	priv->isolated = !!enable;
+	if (enable)
+		priv->dev->dev_ops = &mlx5_dev_ops_isolate;
+	else
+		priv->dev->dev_ops = &mlx5_dev_ops;
 	priv_unlock(priv);
 	return 0;
 }
+
+/**
+ * Convert a flow director filter to a generic flow.
+ *
+ * @param priv
+ *   Private structure.
+ * @param fdir_filter
+ *   Flow director filter to add.
+ * @param attributes
+ *   Generic flow parameters structure.
+ *
+ * @return
+ *  0 on success, errno value on error.
+ */
+static int
+priv_fdir_filter_convert(struct priv *priv,
+			 const struct rte_eth_fdir_filter *fdir_filter,
+			 struct mlx5_fdir *attributes)
+{
+	const struct rte_eth_fdir_input *input = &fdir_filter->input;
+
+	/* Validate queue number. */
+	if (fdir_filter->action.rx_queue >= priv->rxqs_n) {
+		ERROR("invalid queue number %d", fdir_filter->action.rx_queue);
+		return EINVAL;
+	}
+	attributes->attr.ingress = 1;
+	attributes->items[0] = (struct rte_flow_item) {
+		.type = RTE_FLOW_ITEM_TYPE_ETH,
+		.spec = &attributes->l2,
+		.mask = &attributes->l2_mask,
+	};
+	switch (fdir_filter->action.behavior) {
+	case RTE_ETH_FDIR_ACCEPT:
+		attributes->actions[0] = (struct rte_flow_action){
+			.type = RTE_FLOW_ACTION_TYPE_QUEUE,
+			.conf = &attributes->queue,
+		};
+		break;
+	case RTE_ETH_FDIR_REJECT:
+		attributes->actions[0] = (struct rte_flow_action){
+			.type = RTE_FLOW_ACTION_TYPE_DROP,
+		};
+		break;
+	default:
+		ERROR("invalid behavior %d", fdir_filter->action.behavior);
+		return ENOTSUP;
+	}
+	attributes->queue.index = fdir_filter->action.rx_queue;
+	switch (fdir_filter->input.flow_type) {
+	case RTE_ETH_FLOW_NONFRAG_IPV4_UDP:
+		attributes->l3.ipv4.hdr = (struct ipv4_hdr){
+			.src_addr = input->flow.udp4_flow.ip.src_ip,
+			.dst_addr = input->flow.udp4_flow.ip.dst_ip,
+			.time_to_live = input->flow.udp4_flow.ip.ttl,
+			.type_of_service = input->flow.udp4_flow.ip.tos,
+			.next_proto_id = input->flow.udp4_flow.ip.proto,
+		};
+		attributes->l4.udp.hdr = (struct udp_hdr){
+			.src_port = input->flow.udp4_flow.src_port,
+			.dst_port = input->flow.udp4_flow.dst_port,
+		};
+		attributes->items[1] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_IPV4,
+			.spec = &attributes->l3,
+		};
+		attributes->items[2] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_UDP,
+			.spec = &attributes->l4,
+		};
+		break;
+	case RTE_ETH_FLOW_NONFRAG_IPV4_TCP:
+		attributes->l3.ipv4.hdr = (struct ipv4_hdr){
+			.src_addr = input->flow.tcp4_flow.ip.src_ip,
+			.dst_addr = input->flow.tcp4_flow.ip.dst_ip,
+			.time_to_live = input->flow.tcp4_flow.ip.ttl,
+			.type_of_service = input->flow.tcp4_flow.ip.tos,
+			.next_proto_id = input->flow.tcp4_flow.ip.proto,
+		};
+		attributes->l4.tcp.hdr = (struct tcp_hdr){
+			.src_port = input->flow.tcp4_flow.src_port,
+			.dst_port = input->flow.tcp4_flow.dst_port,
+		};
+		attributes->items[1] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_IPV4,
+			.spec = &attributes->l3,
+		};
+		attributes->items[2] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_TCP,
+			.spec = &attributes->l4,
+		};
+		break;
+	case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
+		attributes->l3.ipv4.hdr = (struct ipv4_hdr){
+			.src_addr = input->flow.ip4_flow.src_ip,
+			.dst_addr = input->flow.ip4_flow.dst_ip,
+			.time_to_live = input->flow.ip4_flow.ttl,
+			.type_of_service = input->flow.ip4_flow.tos,
+			.next_proto_id = input->flow.ip4_flow.proto,
+		};
+		attributes->items[1] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_IPV4,
+			.spec = &attributes->l3,
+		};
+		break;
+	case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
+		attributes->l3.ipv6.hdr = (struct ipv6_hdr){
+			.hop_limits = input->flow.udp6_flow.ip.hop_limits,
+			.proto = input->flow.udp6_flow.ip.proto,
+		};
+		memcpy(attributes->l3.ipv6.hdr.src_addr,
+		       input->flow.udp6_flow.ip.src_ip,
+		       RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
+		memcpy(attributes->l3.ipv6.hdr.dst_addr,
+		       input->flow.udp6_flow.ip.dst_ip,
+		       RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
+		attributes->l4.udp.hdr = (struct udp_hdr){
+			.src_port = input->flow.udp6_flow.src_port,
+			.dst_port = input->flow.udp6_flow.dst_port,
+		};
+		attributes->items[1] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_IPV6,
+			.spec = &attributes->l3,
+		};
+		attributes->items[2] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_UDP,
+			.spec = &attributes->l4,
+		};
+		break;
+	case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
+		attributes->l3.ipv6.hdr = (struct ipv6_hdr){
+			.hop_limits = input->flow.tcp6_flow.ip.hop_limits,
+			.proto = input->flow.tcp6_flow.ip.proto,
+		};
+		memcpy(attributes->l3.ipv6.hdr.src_addr,
+		       input->flow.tcp6_flow.ip.src_ip,
+		       RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
+		memcpy(attributes->l3.ipv6.hdr.dst_addr,
+		       input->flow.tcp6_flow.ip.dst_ip,
+		       RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
+		attributes->l4.tcp.hdr = (struct tcp_hdr){
+			.src_port = input->flow.tcp6_flow.src_port,
+			.dst_port = input->flow.tcp6_flow.dst_port,
+		};
+		attributes->items[1] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_IPV6,
+			.spec = &attributes->l3,
+		};
+		attributes->items[2] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_TCP,
+			.spec = &attributes->l4,
+		};
+		break;
+	case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
+		attributes->l3.ipv6.hdr = (struct ipv6_hdr){
+			.hop_limits = input->flow.ipv6_flow.hop_limits,
+			.proto = input->flow.ipv6_flow.proto,
+		};
+		memcpy(attributes->l3.ipv6.hdr.src_addr,
+		       input->flow.ipv6_flow.src_ip,
+		       RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
+		memcpy(attributes->l3.ipv6.hdr.dst_addr,
+		       input->flow.ipv6_flow.dst_ip,
+		       RTE_DIM(attributes->l3.ipv6.hdr.src_addr));
+		attributes->items[1] = (struct rte_flow_item){
+			.type = RTE_FLOW_ITEM_TYPE_IPV6,
+			.spec = &attributes->l3,
+		};
+		break;
+	default:
+		ERROR("invalid flow type%d",
+		      fdir_filter->input.flow_type);
+		return ENOTSUP;
+	}
+	return 0;
+}
+
+/**
+ * Add new flow director filter and store it in list.
+ *
+ * @param priv
+ *   Private structure.
+ * @param fdir_filter
+ *   Flow director filter to add.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_fdir_filter_add(struct priv *priv,
+		     const struct rte_eth_fdir_filter *fdir_filter)
+{
+	struct mlx5_fdir attributes = {
+		.attr.group = 0,
+		.l2_mask = {
+			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+			.type = 0,
+		},
+	};
+	struct mlx5_flow_parse parser = {
+		.layer = HASH_RXQ_ETH,
+	};
+	struct rte_flow_error error;
+	struct rte_flow *flow;
+	int ret;
+
+	ret = priv_fdir_filter_convert(priv, fdir_filter, &attributes);
+	if (ret)
+		return -ret;
+	ret = priv_flow_convert(priv, &attributes.attr, attributes.items,
+				attributes.actions, &error, &parser);
+	if (ret)
+		return -ret;
+	flow = priv_flow_create(priv,
+				&priv->flows,
+				&attributes.attr,
+				attributes.items,
+				attributes.actions,
+				&error);
+	if (flow) {
+		DEBUG("FDIR created %p", (void *)flow);
+		return 0;
+	}
+	return ENOTSUP;
+}
+
+/**
+ * Delete specific filter.
+ *
+ * @param priv
+ *   Private structure.
+ * @param fdir_filter
+ *   Filter to be deleted.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_fdir_filter_delete(struct priv *priv,
+			const struct rte_eth_fdir_filter *fdir_filter)
+{
+	struct mlx5_fdir attributes = {
+		.attr.group = 0,
+	};
+	struct mlx5_flow_parse parser = {
+		.create = 1,
+		.layer = HASH_RXQ_ETH,
+	};
+	struct rte_flow_error error;
+	struct rte_flow *flow;
+	unsigned int i;
+	int ret;
+
+	ret = priv_fdir_filter_convert(priv, fdir_filter, &attributes);
+	if (ret)
+		return -ret;
+	ret = priv_flow_convert(priv, &attributes.attr, attributes.items,
+				attributes.actions, &error, &parser);
+	if (ret)
+		goto exit;
+	/*
+	 * Special case for drop action which is only set in the
+	 * specifications when the flow is created.  In this situation the
+	 * drop specification is missing.
+	 */
+	if (parser.drop) {
+		struct ibv_flow_spec_action_drop *drop;
+
+		drop = (void *)((uintptr_t)parser.drop_q.ibv_attr +
+				parser.drop_q.offset);
+		*drop = (struct ibv_flow_spec_action_drop){
+			.type = IBV_FLOW_SPEC_ACTION_DROP,
+			.size = sizeof(struct ibv_flow_spec_action_drop),
+		};
+		parser.drop_q.ibv_attr->num_of_specs++;
+	}
+	TAILQ_FOREACH(flow, &priv->flows, next) {
+		struct ibv_flow_attr *attr;
+		struct ibv_spec_header *attr_h;
+		void *spec;
+		struct ibv_flow_attr *flow_attr;
+		struct ibv_spec_header *flow_h;
+		void *flow_spec;
+		unsigned int specs_n;
+
+		if (parser.drop)
+			attr = parser.drop_q.ibv_attr;
+		else
+			attr = parser.queue[HASH_RXQ_ETH].ibv_attr;
+		if (flow->drop)
+			flow_attr = flow->drxq.ibv_attr;
+		else
+			flow_attr = flow->frxq[HASH_RXQ_ETH].ibv_attr;
+		/* Compare first the attributes. */
+		if (memcmp(attr, flow_attr, sizeof(struct ibv_flow_attr)))
+			continue;
+		if (attr->num_of_specs == 0)
+			continue;
+		spec = (void *)((uintptr_t)attr +
+				sizeof(struct ibv_flow_attr));
+		flow_spec = (void *)((uintptr_t)flow_attr +
+				     sizeof(struct ibv_flow_attr));
+		specs_n = RTE_MIN(attr->num_of_specs, flow_attr->num_of_specs);
+		for (i = 0; i != specs_n; ++i) {
+			attr_h = spec;
+			flow_h = flow_spec;
+			if (memcmp(spec, flow_spec,
+				   RTE_MIN(attr_h->size, flow_h->size)))
+				continue;
+			spec = (void *)((uintptr_t)attr + attr_h->size);
+			flow_spec = (void *)((uintptr_t)flow_attr +
+					     flow_h->size);
+		}
+		/* At this point, the flow match. */
+		break;
+	}
+	if (flow)
+		priv_flow_destroy(priv, &priv->flows, flow);
+exit:
+	if (parser.drop) {
+		rte_free(parser.drop_q.ibv_attr);
+	} else {
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			if (parser.queue[i].ibv_attr)
+				rte_free(parser.queue[i].ibv_attr);
+		}
+	}
+	return -ret;
+}
+
+/**
+ * Update queue for specific filter.
+ *
+ * @param priv
+ *   Private structure.
+ * @param fdir_filter
+ *   Filter to be updated.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_fdir_filter_update(struct priv *priv,
+			const struct rte_eth_fdir_filter *fdir_filter)
+{
+	int ret;
+
+	ret = priv_fdir_filter_delete(priv, fdir_filter);
+	if (ret)
+		return ret;
+	ret = priv_fdir_filter_add(priv, fdir_filter);
+	return ret;
+}
+
+/**
+ * Flush all filters.
+ *
+ * @param priv
+ *   Private structure.
+ */
+static void
+priv_fdir_filter_flush(struct priv *priv)
+{
+	priv_flow_flush(priv, &priv->flows);
+}
+
+/**
+ * Get flow director information.
+ *
+ * @param priv
+ *   Private structure.
+ * @param[out] fdir_info
+ *   Resulting flow director information.
+ */
+static void
+priv_fdir_info_get(struct priv *priv, struct rte_eth_fdir_info *fdir_info)
+{
+	struct rte_eth_fdir_masks *mask =
+		&priv->dev->data->dev_conf.fdir_conf.mask;
+
+	fdir_info->mode = priv->dev->data->dev_conf.fdir_conf.mode;
+	fdir_info->guarant_spc = 0;
+	rte_memcpy(&fdir_info->mask, mask, sizeof(fdir_info->mask));
+	fdir_info->max_flexpayload = 0;
+	fdir_info->flow_types_mask[0] = 0;
+	fdir_info->flex_payload_unit = 0;
+	fdir_info->max_flex_payload_segment_num = 0;
+	fdir_info->flex_payload_limit = 0;
+	memset(&fdir_info->flex_conf, 0, sizeof(fdir_info->flex_conf));
+}
+
+/**
+ * Deal with flow director operations.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_fdir_ctrl_func(struct priv *priv, enum rte_filter_op filter_op, void *arg)
+{
+	enum rte_fdir_mode fdir_mode =
+		priv->dev->data->dev_conf.fdir_conf.mode;
+	int ret = 0;
+
+	if (filter_op == RTE_ETH_FILTER_NOP)
+		return 0;
+	if (fdir_mode != RTE_FDIR_MODE_PERFECT &&
+	    fdir_mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) {
+		ERROR("%p: flow director mode %d not supported",
+		      (void *)priv, fdir_mode);
+		return EINVAL;
+	}
+	switch (filter_op) {
+	case RTE_ETH_FILTER_ADD:
+		ret = priv_fdir_filter_add(priv, arg);
+		break;
+	case RTE_ETH_FILTER_UPDATE:
+		ret = priv_fdir_filter_update(priv, arg);
+		break;
+	case RTE_ETH_FILTER_DELETE:
+		ret = priv_fdir_filter_delete(priv, arg);
+		break;
+	case RTE_ETH_FILTER_FLUSH:
+		priv_fdir_filter_flush(priv);
+		break;
+	case RTE_ETH_FILTER_INFO:
+		priv_fdir_info_get(priv, arg);
+		break;
+	default:
+		DEBUG("%p: unknown operation %u", (void *)priv,
+		      filter_op);
+		ret = EINVAL;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * Manage filter operations.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param filter_type
+ *   Filter type.
+ * @param filter_op
+ *   Operation to perform.
+ * @param arg
+ *   Pointer to operation-specific structure.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+mlx5_dev_filter_ctrl(struct rte_eth_dev *dev,
+		     enum rte_filter_type filter_type,
+		     enum rte_filter_op filter_op,
+		     void *arg)
+{
+	int ret = EINVAL;
+	struct priv *priv = dev->data->dev_private;
+
+	switch (filter_type) {
+	case RTE_ETH_FILTER_GENERIC:
+		if (filter_op != RTE_ETH_FILTER_GET)
+			return -EINVAL;
+		*(const void **)arg = &mlx5_flow_ops;
+		return 0;
+	case RTE_ETH_FILTER_FDIR:
+		priv_lock(priv);
+		ret = priv_fdir_ctrl_func(priv, filter_op, arg);
+		priv_unlock(priv);
+		break;
+	default:
+		ERROR("%p: filter type (%d) not supported",
+		      (void *)dev, filter_type);
+		break;
+	}
+	return -ret;
+}
diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
index 8489ea67..d17b991e 100644
--- a/drivers/net/mlx5/mlx5_mac.c
+++ b/drivers/net/mlx5/mlx5_mac.c
@@ -51,16 +51,9 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_ether.h>
 #include <rte_ethdev.h>
 #include <rte_common.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -90,112 +83,6 @@ priv_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
 }
 
 /**
- * Delete MAC flow steering rule.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- * @param mac_index
- *   MAC address index.
- * @param vlan_index
- *   VLAN index to use.
- */
-static void
-hash_rxq_del_mac_flow(struct hash_rxq *hash_rxq, unsigned int mac_index,
-		      unsigned int vlan_index)
-{
-#ifndef NDEBUG
-	const uint8_t (*mac)[ETHER_ADDR_LEN] =
-		(const uint8_t (*)[ETHER_ADDR_LEN])
-		hash_rxq->priv->mac[mac_index].addr_bytes;
-#endif
-
-	assert(mac_index < RTE_DIM(hash_rxq->mac_flow));
-	assert(vlan_index < RTE_DIM(hash_rxq->mac_flow[mac_index]));
-	if (hash_rxq->mac_flow[mac_index][vlan_index] == NULL)
-		return;
-	DEBUG("%p: removing MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u"
-	      " VLAN index %u",
-	      (void *)hash_rxq,
-	      (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5],
-	      mac_index,
-	      vlan_index);
-	claim_zero(ibv_exp_destroy_flow(hash_rxq->mac_flow
-					[mac_index][vlan_index]));
-	hash_rxq->mac_flow[mac_index][vlan_index] = NULL;
-}
-
-/**
- * Unregister a MAC address from a hash RX queue.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- * @param mac_index
- *   MAC address index.
- */
-static void
-hash_rxq_mac_addr_del(struct hash_rxq *hash_rxq, unsigned int mac_index)
-{
-	unsigned int i;
-
-	assert(mac_index < RTE_DIM(hash_rxq->mac_flow));
-	for (i = 0; (i != RTE_DIM(hash_rxq->mac_flow[mac_index])); ++i)
-		hash_rxq_del_mac_flow(hash_rxq, mac_index, i);
-}
-
-/**
- * Unregister all MAC addresses from a hash RX queue.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- */
-void
-hash_rxq_mac_addrs_del(struct hash_rxq *hash_rxq)
-{
-	unsigned int i;
-
-	for (i = 0; (i != RTE_DIM(hash_rxq->mac_flow)); ++i)
-		hash_rxq_mac_addr_del(hash_rxq, i);
-}
-
-/**
- * Unregister a MAC address.
- *
- * This is done for each hash RX queue.
- *
- * @param priv
- *   Pointer to private structure.
- * @param mac_index
- *   MAC address index.
- */
-static void
-priv_mac_addr_del(struct priv *priv, unsigned int mac_index)
-{
-	unsigned int i;
-
-	assert(mac_index < RTE_DIM(priv->mac));
-	if (!BITFIELD_ISSET(priv->mac_configured, mac_index))
-		return;
-	for (i = 0; (i != priv->hash_rxqs_n); ++i)
-		hash_rxq_mac_addr_del(&(*priv->hash_rxqs)[i], mac_index);
-	BITFIELD_RESET(priv->mac_configured, mac_index);
-}
-
-/**
- * Unregister all MAC addresses from all hash RX queues.
- *
- * @param priv
- *   Pointer to private structure.
- */
-void
-priv_mac_addrs_disable(struct priv *priv)
-{
-	unsigned int i;
-
-	for (i = 0; (i != priv->hash_rxqs_n); ++i)
-		hash_rxq_mac_addrs_del(&(*priv->hash_rxqs)[i]);
-}
-
-/**
  * DPDK callback to remove a MAC address.
  *
  * @param dev
@@ -206,258 +93,12 @@ priv_mac_addrs_disable(struct priv *priv)
 void
 mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
 {
-	struct priv *priv = dev->data->dev_private;
-
 	if (mlx5_is_secondary())
 		return;
-
-	priv_lock(priv);
-	DEBUG("%p: removing MAC address from index %" PRIu32,
-	      (void *)dev, index);
-	if (index >= RTE_DIM(priv->mac))
-		goto end;
-	priv_mac_addr_del(priv, index);
-end:
-	priv_unlock(priv);
-}
-
-/**
- * Add MAC flow steering rule.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- * @param mac_index
- *   MAC address index to register.
- * @param vlan_index
- *   VLAN index to use.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-hash_rxq_add_mac_flow(struct hash_rxq *hash_rxq, unsigned int mac_index,
-		      unsigned int vlan_index)
-{
-	struct ibv_exp_flow *flow;
-	struct priv *priv = hash_rxq->priv;
-	const uint8_t (*mac)[ETHER_ADDR_LEN] =
-			(const uint8_t (*)[ETHER_ADDR_LEN])
-			priv->mac[mac_index].addr_bytes;
-	FLOW_ATTR_SPEC_ETH(data, priv_flow_attr(priv, NULL, 0, hash_rxq->type));
-	struct ibv_exp_flow_attr *attr = &data->attr;
-	struct ibv_exp_flow_spec_eth *spec = &data->spec;
-	unsigned int vlan_enabled = !!priv->vlan_filter_n;
-	unsigned int vlan_id = priv->vlan_filter[vlan_index];
-
-	assert(mac_index < RTE_DIM(hash_rxq->mac_flow));
-	assert(vlan_index < RTE_DIM(hash_rxq->mac_flow[mac_index]));
-	if (hash_rxq->mac_flow[mac_index][vlan_index] != NULL)
-		return 0;
-	/*
-	 * No padding must be inserted by the compiler between attr and spec.
-	 * This layout is expected by libibverbs.
-	 */
-	assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec);
-	priv_flow_attr(priv, attr, sizeof(data), hash_rxq->type);
-	/* The first specification must be Ethernet. */
-	assert(spec->type == IBV_EXP_FLOW_SPEC_ETH);
-	assert(spec->size == sizeof(*spec));
-	*spec = (struct ibv_exp_flow_spec_eth){
-		.type = IBV_EXP_FLOW_SPEC_ETH,
-		.size = sizeof(*spec),
-		.val = {
-			.dst_mac = {
-				(*mac)[0], (*mac)[1], (*mac)[2],
-				(*mac)[3], (*mac)[4], (*mac)[5]
-			},
-			.vlan_tag = (vlan_enabled ? htons(vlan_id) : 0),
-		},
-		.mask = {
-			.dst_mac = "\xff\xff\xff\xff\xff\xff",
-			.vlan_tag = (vlan_enabled ? htons(0xfff) : 0),
-		},
-	};
-	DEBUG("%p: adding MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u"
-	      " VLAN index %u filtering %s, ID %u",
-	      (void *)hash_rxq,
-	      (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5],
-	      mac_index,
-	      vlan_index,
-	      (vlan_enabled ? "enabled" : "disabled"),
-	      vlan_id);
-	/* Create related flow. */
-	errno = 0;
-	flow = ibv_exp_create_flow(hash_rxq->qp, attr);
-	if (flow == NULL) {
-		/* It's not clear whether errno is always set in this case. */
-		ERROR("%p: flow configuration failed, errno=%d: %s",
-		      (void *)hash_rxq, errno,
-		      (errno ? strerror(errno) : "Unknown error"));
-		if (errno)
-			return errno;
-		return EINVAL;
-	}
-	hash_rxq->mac_flow[mac_index][vlan_index] = flow;
-	return 0;
-}
-
-/**
- * Register a MAC address in a hash RX queue.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- * @param mac_index
- *   MAC address index to register.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-hash_rxq_mac_addr_add(struct hash_rxq *hash_rxq, unsigned int mac_index)
-{
-	struct priv *priv = hash_rxq->priv;
-	unsigned int i = 0;
-	int ret;
-
-	assert(mac_index < RTE_DIM(hash_rxq->mac_flow));
-	assert(RTE_DIM(hash_rxq->mac_flow[mac_index]) ==
-	       RTE_DIM(priv->vlan_filter));
-	/* Add a MAC address for each VLAN filter, or at least once. */
-	do {
-		ret = hash_rxq_add_mac_flow(hash_rxq, mac_index, i);
-		if (ret) {
-			/* Failure, rollback. */
-			while (i != 0)
-				hash_rxq_del_mac_flow(hash_rxq, mac_index,
-						      --i);
-			return ret;
-		}
-	} while (++i < priv->vlan_filter_n);
-	return 0;
-}
-
-/**
- * Register all MAC addresses in a hash RX queue.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-hash_rxq_mac_addrs_add(struct hash_rxq *hash_rxq)
-{
-	struct priv *priv = hash_rxq->priv;
-	unsigned int i;
-	int ret;
-
-	assert(RTE_DIM(priv->mac) == RTE_DIM(hash_rxq->mac_flow));
-	for (i = 0; (i != RTE_DIM(priv->mac)); ++i) {
-		if (!BITFIELD_ISSET(priv->mac_configured, i))
-			continue;
-		ret = hash_rxq_mac_addr_add(hash_rxq, i);
-		if (!ret)
-			continue;
-		/* Failure, rollback. */
-		while (i != 0)
-			hash_rxq_mac_addr_del(hash_rxq, --i);
-		assert(ret > 0);
-		return ret;
-	}
-	return 0;
-}
-
-/**
- * Register a MAC address.
- *
- * This is done for each hash RX queue.
- *
- * @param priv
- *   Pointer to private structure.
- * @param mac_index
- *   MAC address index to use.
- * @param mac
- *   MAC address to register.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-priv_mac_addr_add(struct priv *priv, unsigned int mac_index,
-		  const uint8_t (*mac)[ETHER_ADDR_LEN])
-{
-	unsigned int i;
-	int ret;
-
-	assert(mac_index < RTE_DIM(priv->mac));
-	/* First, make sure this address isn't already configured. */
-	for (i = 0; (i != RTE_DIM(priv->mac)); ++i) {
-		/* Skip this index, it's going to be reconfigured. */
-		if (i == mac_index)
-			continue;
-		if (!BITFIELD_ISSET(priv->mac_configured, i))
-			continue;
-		if (memcmp(priv->mac[i].addr_bytes, *mac, sizeof(*mac)))
-			continue;
-		/* Address already configured elsewhere, return with error. */
-		return EADDRINUSE;
-	}
-	if (BITFIELD_ISSET(priv->mac_configured, mac_index))
-		priv_mac_addr_del(priv, mac_index);
-	priv->mac[mac_index] = (struct ether_addr){
-		{
-			(*mac)[0], (*mac)[1], (*mac)[2],
-			(*mac)[3], (*mac)[4], (*mac)[5]
-		}
-	};
-	if (!priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
-		goto end;
-	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
-		ret = hash_rxq_mac_addr_add(&(*priv->hash_rxqs)[i], mac_index);
-		if (!ret)
-			continue;
-		/* Failure, rollback. */
-		while (i != 0)
-			hash_rxq_mac_addr_del(&(*priv->hash_rxqs)[--i],
-					      mac_index);
-		return ret;
-	}
-end:
-	BITFIELD_SET(priv->mac_configured, mac_index);
-	return 0;
-}
-
-/**
- * Register all MAC addresses in all hash RX queues.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-priv_mac_addrs_enable(struct priv *priv)
-{
-	unsigned int i;
-	int ret;
-
-	if (priv->isolated)
-		return 0;
-	if (!priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
-		return 0;
-	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
-		ret = hash_rxq_mac_addrs_add(&(*priv->hash_rxqs)[i]);
-		if (!ret)
-			continue;
-		/* Failure, rollback. */
-		while (i != 0)
-			hash_rxq_mac_addrs_del(&(*priv->hash_rxqs)[--i]);
-		assert(ret > 0);
-		return ret;
-	}
-	return 0;
+	assert(index < MLX5_MAX_MAC_ADDRESSES);
+	memset(&dev->data->mac_addrs[index], 0, sizeof(struct ether_addr));
+	if (!dev->data->promiscuous && !dev->data->all_multicast)
+		mlx5_traffic_restart(dev);
 }
 
 /**
@@ -471,31 +112,35 @@ priv_mac_addrs_enable(struct priv *priv)
  *   MAC address index.
  * @param vmdq
  *   VMDq pool index to associate address with (ignored).
+ *
+ * @return
+ *   0 on success.
  */
 int
-mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
+mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
 		  uint32_t index, uint32_t vmdq)
 {
-	struct priv *priv = dev->data->dev_private;
-	int re;
-
-	if (mlx5_is_secondary())
-		return -ENOTSUP;
+	unsigned int i;
+	int ret = 0;
 
 	(void)vmdq;
-	priv_lock(priv);
-	DEBUG("%p: adding MAC address at index %" PRIu32,
-	      (void *)dev, index);
-	if (index >= RTE_DIM(priv->mac)) {
-		re = EINVAL;
-		goto end;
+	if (mlx5_is_secondary())
+		return 0;
+	assert(index < MLX5_MAX_MAC_ADDRESSES);
+	/* First, make sure this address isn't already configured. */
+	for (i = 0; (i != MLX5_MAX_MAC_ADDRESSES); ++i) {
+		/* Skip this index, it's going to be reconfigured. */
+		if (i == index)
+			continue;
+		if (memcmp(&dev->data->mac_addrs[i], mac, sizeof(*mac)))
+			continue;
+		/* Address already configured elsewhere, return with error. */
+		return EADDRINUSE;
 	}
-	re = priv_mac_addr_add(priv, index,
-			       (const uint8_t (*)[ETHER_ADDR_LEN])
-			       mac_addr->addr_bytes);
-end:
-	priv_unlock(priv);
-	return -re;
+	dev->data->mac_addrs[index] = *mac;
+	if (!dev->data->promiscuous && !dev->data->all_multicast)
+		mlx5_traffic_restart(dev);
+	return ret;
 }
 
 /**
@@ -509,7 +154,8 @@ end:
 void
 mlx5_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
 {
+	if (mlx5_is_secondary())
+		return;
 	DEBUG("%p: setting primary MAC address", (void *)dev);
-	mlx5_mac_addr_remove(dev, 0);
 	mlx5_mac_addr_add(dev, mac_addr, 0, 0);
 }
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 28733517..6b29eed5 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -41,14 +41,8 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_mempool.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
+#include <rte_malloc.h>
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
@@ -118,59 +112,13 @@ static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
 }
 
 /**
- * Register mempool as a memory region.
- *
- * @param pd
- *   Pointer to protection domain.
- * @param mp
- *   Pointer to memory pool.
- *
- * @return
- *   Memory region pointer, NULL in case of error.
- */
-struct ibv_mr *
-mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	uintptr_t start;
-	uintptr_t end;
-	unsigned int i;
-
-	if (mlx5_check_mempool(mp, &start, &end) != 0) {
-		ERROR("mempool %p: not virtually contiguous",
-		      (void *)mp);
-		return NULL;
-	}
-
-	DEBUG("mempool %p area start=%p end=%p size=%zu",
-	      (void *)mp, (void *)start, (void *)end,
-	      (size_t)(end - start));
-	/* Round start and end to page boundary if found in memory segments. */
-	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
-		uintptr_t addr = (uintptr_t)ms[i].addr;
-		size_t len = ms[i].len;
-		unsigned int align = ms[i].hugepage_sz;
-
-		if ((start > addr) && (start < addr + len))
-			start = RTE_ALIGN_FLOOR(start, align);
-		if ((end > addr) && (end < addr + len))
-			end = RTE_ALIGN_CEIL(end, align);
-	}
-	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
-	      (void *)mp, (void *)start, (void *)end,
-	      (size_t)(end - start));
-	return ibv_reg_mr(pd,
-			  (void *)start,
-			  end - start,
-			  IBV_ACCESS_LOCAL_WRITE);
-}
-
-/**
  * Register a Memory Region (MR) <-> Memory Pool (MP) association in
  * txq->mp2mr[]. If mp2mr[] is full, remove an entry first.
  *
  * This function should only be called by txq_mp2mr().
  *
+ * @param priv
+ *   Pointer to private structure.
  * @param txq
  *   Pointer to TX queue structure.
  * @param[in] mp
@@ -179,45 +127,75 @@ mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
  *   Index of the next available entry.
  *
  * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
+ *   mr on success, NULL on failure.
  */
-uint32_t
-txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
+struct mlx5_mr*
+priv_txq_mp2mr_reg(struct priv *priv, struct mlx5_txq_data *txq,
+		   struct rte_mempool *mp, unsigned int idx)
 {
-	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
-	struct ibv_mr *mr;
+	struct mlx5_txq_ctrl *txq_ctrl =
+		container_of(txq, struct mlx5_txq_ctrl, txq);
+	struct mlx5_mr *mr;
 
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq_ctrl, mp->name, (void *)mp);
-	mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp);
+	mr = priv_mr_get(priv, mp);
+	if (mr == NULL)
+		mr = priv_mr_new(priv, mp);
 	if (unlikely(mr == NULL)) {
 		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
 		      (void *)txq_ctrl);
-		return (uint32_t)-1;
+		return NULL;
 	}
-	if (unlikely(idx == RTE_DIM(txq_ctrl->txq.mp2mr))) {
+	if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
 		/* Table is full, remove oldest entry. */
 		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
 		      (void *)txq_ctrl);
 		--idx;
-		claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[0].mr));
-		memmove(&txq_ctrl->txq.mp2mr[0], &txq_ctrl->txq.mp2mr[1],
-			(sizeof(txq_ctrl->txq.mp2mr) -
-			 sizeof(txq_ctrl->txq.mp2mr[0])));
+		priv_mr_release(priv, txq->mp2mr[0]);
+		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
 	}
 	/* Store the new entry. */
-	txq_ctrl->txq.mp2mr[idx].start = (uintptr_t)mr->addr;
-	txq_ctrl->txq.mp2mr[idx].end = (uintptr_t)mr->addr + mr->length;
-	txq_ctrl->txq.mp2mr[idx].mr = mr;
-	txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey);
+	txq_ctrl->txq.mp2mr[idx] = mr;
 	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
 	      (void *)txq_ctrl, mp->name, (void *)mp,
-	      txq_ctrl->txq.mp2mr[idx].lkey);
-	return txq_ctrl->txq.mp2mr[idx].lkey;
+	      txq_ctrl->txq.mp2mr[idx]->lkey);
+	return mr;
+}
+
+/**
+ * Register a Memory Region (MR) <-> Memory Pool (MP) association in
+ * txq->mp2mr[]. If mp2mr[] is full, remove an entry first.
+ *
+ * This function should only be called by txq_mp2mr().
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] mp
+ *   Memory Pool for which a Memory Region lkey must be returned.
+ * @param idx
+ *   Index of the next available entry.
+ *
+ * @return
+ *   mr on success, NULL on failure.
+ */
+struct mlx5_mr*
+mlx5_txq_mp2mr_reg(struct mlx5_txq_data *txq, struct rte_mempool *mp,
+		   unsigned int idx)
+{
+	struct mlx5_txq_ctrl *txq_ctrl =
+		container_of(txq, struct mlx5_txq_ctrl, txq);
+	struct mlx5_mr *mr;
+
+	priv_lock(txq_ctrl->priv);
+	mr = priv_txq_mp2mr_reg(txq_ctrl->priv, txq, mp, idx);
+	priv_unlock(txq_ctrl->priv);
+	return mr;
 }
 
-struct txq_mp2mr_mbuf_check_data {
+struct mlx5_mp2mr_mbuf_check_data {
 	int ret;
 };
 
@@ -239,7 +217,7 @@ static void
 txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
 	uint32_t index __rte_unused)
 {
-	struct txq_mp2mr_mbuf_check_data *data = arg;
+	struct mlx5_mp2mr_mbuf_check_data *data = arg;
 	struct rte_mbuf *buf = obj;
 
 	/*
@@ -260,35 +238,158 @@ txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
  *   Pointer to TX queue structure.
  */
 void
-txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
+mlx5_mp2mr_iter(struct rte_mempool *mp, void *arg)
 {
-	struct txq_ctrl *txq_ctrl = arg;
-	struct txq_mp2mr_mbuf_check_data data = {
+	struct priv *priv = (struct priv *)arg;
+	struct mlx5_mp2mr_mbuf_check_data data = {
 		.ret = 0,
 	};
-	uintptr_t start;
-	uintptr_t end;
-	unsigned int i;
+	struct mlx5_mr *mr;
 
 	/* Register mempool only if the first element looks like a mbuf. */
 	if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
 			data.ret == -1)
 		return;
+	mr = priv_mr_get(priv, mp);
+	if (mr) {
+		priv_mr_release(priv, mr);
+		return;
+	}
+	priv_mr_new(priv, mp);
+}
+
+/**
+ * Register a new memory region from the mempool and store it in the memory
+ * region list.
+ *
+ * @param  priv
+ *   Pointer to private structure.
+ * @param mp
+ *   Pointer to the memory pool to register.
+ * @return
+ *   The memory region on success.
+ */
+struct mlx5_mr*
+priv_mr_new(struct priv *priv, struct rte_mempool *mp)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	uintptr_t start;
+	uintptr_t end;
+	unsigned int i;
+	struct mlx5_mr *mr;
+
+	mr = rte_zmalloc_socket(__func__, sizeof(*mr), 0, mp->socket_id);
+	if (!mr) {
+		DEBUG("unable to configure MR, ibv_reg_mr() failed.");
+		return NULL;
+	}
 	if (mlx5_check_mempool(mp, &start, &end) != 0) {
 		ERROR("mempool %p: not virtually contiguous",
 		      (void *)mp);
-		return;
+		return NULL;
 	}
-	for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
-		struct ibv_mr *mr = txq_ctrl->txq.mp2mr[i].mr;
+	DEBUG("mempool %p area start=%p end=%p size=%zu",
+	      (void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	/* Round start and end to page boundary if found in memory segments. */
+	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+		uintptr_t addr = (uintptr_t)ms[i].addr;
+		size_t len = ms[i].len;
+		unsigned int align = ms[i].hugepage_sz;
 
-		if (unlikely(mr == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
+		if ((start > addr) && (start < addr + len))
+			start = RTE_ALIGN_FLOOR(start, align);
+		if ((end > addr) && (end < addr + len))
+			end = RTE_ALIGN_CEIL(end, align);
+	}
+	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+	      (void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	mr->mr = ibv_reg_mr(priv->pd, (void *)start, end - start,
+			    IBV_ACCESS_LOCAL_WRITE);
+	mr->mp = mp;
+	mr->lkey = rte_cpu_to_be_32(mr->mr->lkey);
+	mr->start = start;
+	mr->end = (uintptr_t)mr->mr->addr + mr->mr->length;
+	rte_atomic32_inc(&mr->refcnt);
+	DEBUG("%p: new Memory Region %p refcnt: %d", (void *)priv,
+	      (void *)mr, rte_atomic32_read(&mr->refcnt));
+	LIST_INSERT_HEAD(&priv->mr, mr, next);
+	return mr;
+}
+
+/**
+ * Search the memory region object in the memory region list.
+ *
+ * @param  priv
+ *   Pointer to private structure.
+ * @param mp
+ *   Pointer to the memory pool to register.
+ * @return
+ *   The memory region on success.
+ */
+struct mlx5_mr*
+priv_mr_get(struct priv *priv, struct rte_mempool *mp)
+{
+	struct mlx5_mr *mr;
+
+	assert(mp);
+	if (LIST_EMPTY(&priv->mr))
+		return NULL;
+	LIST_FOREACH(mr, &priv->mr, next) {
+		if (mr->mp == mp) {
+			rte_atomic32_inc(&mr->refcnt);
+			DEBUG("Memory Region %p refcnt: %d",
+			      (void *)mr, rte_atomic32_read(&mr->refcnt));
+			return mr;
 		}
-		if (start >= (uintptr_t)mr->addr &&
-		    end <= (uintptr_t)mr->addr + mr->length)
-			return;
 	}
-	txq_mp2mr_reg(&txq_ctrl->txq, mp, i);
+	return NULL;
+}
+
+/**
+ * Release the memory region object.
+ *
+ * @param  mr
+ *   Pointer to memory region to release.
+ *
+ * @return
+ *   0 on success, errno on failure.
+ */
+int
+priv_mr_release(struct priv *priv, struct mlx5_mr *mr)
+{
+	(void)priv;
+	assert(mr);
+	DEBUG("Memory Region %p refcnt: %d",
+	      (void *)mr, rte_atomic32_read(&mr->refcnt));
+	if (rte_atomic32_dec_and_test(&mr->refcnt)) {
+		claim_zero(ibv_dereg_mr(mr->mr));
+		LIST_REMOVE(mr, next);
+		rte_free(mr);
+		return 0;
+	}
+	return EBUSY;
+}
+
+/**
+ * Verify the flow list is empty
+ *
+ * @param priv
+ *  Pointer to private structure.
+ *
+ * @return the number of object not released.
+ */
+int
+priv_mr_verify(struct priv *priv)
+{
+	int ret = 0;
+	struct mlx5_mr *mr;
+
+	LIST_FOREACH(mr, &priv->mr, next) {
+		DEBUG("%p: mr %p still referenced", (void *)priv,
+		      (void *)mr);
+		++ret;
+	}
+	return ret;
 }
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 608072f7..2de310bc 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -41,7 +41,7 @@
 #ifdef PEDANTIC
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
-#include <infiniband/mlx5_hw.h>
+#include <infiniband/mlx5dv.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
@@ -89,9 +89,6 @@
 /* Default max packet length to be inlined. */
 #define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
 
-#ifndef HAVE_VERBS_MLX5_OPCODE_TSO
-#define MLX5_OPCODE_TSO MLX5_OPCODE_LSO_MPW /* Compat with OFED 3.3. */
-#endif
 
 #define MLX5_OPC_MOD_ENHANCED_MPSW 0
 #define MLX5_OPCODE_ENHANCED_MPSW 0x29
@@ -154,6 +151,9 @@
 /* Default mark value used when none is provided. */
 #define MLX5_FLOW_MARK_DEFAULT 0xffffff
 
+/* Maximum number of DS in WQE. */
+#define MLX5_DSEG_MAX 63
+
 /* Subset of struct mlx5_wqe_eth_seg. */
 struct mlx5_wqe_eth_seg_small {
 	uint32_t rsvd0;
@@ -244,6 +244,46 @@ struct mlx5_cqe {
 	uint8_t op_own;
 };
 
+/* Adding direct verbs to data-path. */
+
+/* CQ sequence number mask. */
+#define MLX5_CQ_SQN_MASK 0x3
+
+/* CQ sequence number index. */
+#define MLX5_CQ_SQN_OFFSET 28
+
+/* CQ doorbell index mask. */
+#define MLX5_CI_MASK 0xffffff
+
+/* CQ doorbell offset. */
+#define MLX5_CQ_ARM_DB 1
+
+/* CQ doorbell offset*/
+#define MLX5_CQ_DOORBELL 0x20
+
+/* CQE format value. */
+#define MLX5_COMPRESSED 0x3
+
+/* CQE format mask. */
+#define MLX5E_CQE_FORMAT_MASK 0xc
+
+/* MPW opcode. */
+#define MLX5_OPC_MOD_MPW 0x01
+
+/* Compressed Rx CQE structure. */
+struct mlx5_mini_cqe8 {
+	union {
+		uint32_t rx_hash_result;
+		uint32_t checksum;
+		struct {
+			uint16_t wqe_counter;
+			uint8_t  s_wqe_opcode;
+			uint8_t  reserved;
+		} s_wqe_info;
+	};
+	uint32_t byte_cnt;
+};
+
 /**
  * Convert a user mark to flow mark.
  *
diff --git a/drivers/net/mlx5/mlx5_rss.c b/drivers/net/mlx5/mlx5_rss.c
index a2dd7d17..f3de46de 100644
--- a/drivers/net/mlx5/mlx5_rss.c
+++ b/drivers/net/mlx5/mlx5_rss.c
@@ -47,88 +47,13 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_malloc.h>
 #include <rte_ethdev.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
 
 /**
- * Get a RSS configuration hash key.
- *
- * @param priv
- *   Pointer to private structure.
- * @param rss_hf
- *   RSS hash functions configuration must be retrieved for.
- *
- * @return
- *   Pointer to a RSS configuration structure or NULL if rss_hf cannot
- *   be matched.
- */
-static struct rte_eth_rss_conf *
-rss_hash_get(struct priv *priv, uint64_t rss_hf)
-{
-	unsigned int i;
-
-	for (i = 0; (i != hash_rxq_init_n); ++i) {
-		uint64_t dpdk_rss_hf = hash_rxq_init[i].dpdk_rss_hf;
-
-		if (!(dpdk_rss_hf & rss_hf))
-			continue;
-		return (*priv->rss_conf)[i];
-	}
-	return NULL;
-}
-
-/**
- * Register a RSS key.
- *
- * @param priv
- *   Pointer to private structure.
- * @param key
- *   Hash key to register.
- * @param key_len
- *   Hash key length in bytes.
- * @param rss_hf
- *   RSS hash functions the provided key applies to.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-rss_hash_rss_conf_new_key(struct priv *priv, const uint8_t *key,
-			  unsigned int key_len, uint64_t rss_hf)
-{
-	unsigned int i;
-
-	for (i = 0; (i != hash_rxq_init_n); ++i) {
-		struct rte_eth_rss_conf *rss_conf;
-		uint64_t dpdk_rss_hf = hash_rxq_init[i].dpdk_rss_hf;
-
-		if (!(dpdk_rss_hf & rss_hf))
-			continue;
-		rss_conf = rte_realloc((*priv->rss_conf)[i],
-				       (sizeof(*rss_conf) + key_len),
-				       0);
-		if (!rss_conf)
-			return ENOMEM;
-		rss_conf->rss_key = (void *)(rss_conf + 1);
-		rss_conf->rss_key_len = key_len;
-		rss_conf->rss_hf = dpdk_rss_hf;
-		memcpy(rss_conf->rss_key, key, key_len);
-		(*priv->rss_conf)[i] = rss_conf;
-	}
-	return 0;
-}
-
-/**
  * DPDK callback to update the RSS hash configuration.
  *
  * @param dev
@@ -144,23 +69,24 @@ mlx5_rss_hash_update(struct rte_eth_dev *dev,
 		     struct rte_eth_rss_conf *rss_conf)
 {
 	struct priv *priv = dev->data->dev_private;
-	int err = 0;
+	int ret = 0;
 
 	priv_lock(priv);
-
-	assert(priv->rss_conf != NULL);
-
-	/* Apply configuration. */
-	if (rss_conf->rss_key)
-		err = rss_hash_rss_conf_new_key(priv,
-						rss_conf->rss_key,
-						rss_conf->rss_key_len,
-						rss_conf->rss_hf);
-	/* Store protocols for which RSS is enabled. */
-	priv->rss_hf = rss_conf->rss_hf;
+	if (rss_conf->rss_key && rss_conf->rss_key_len) {
+		priv->rss_conf.rss_key = rte_realloc(priv->rss_conf.rss_key,
+						     rss_conf->rss_key_len, 0);
+		if (!priv->rss_conf.rss_key) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
+		       rss_conf->rss_key_len);
+		priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
+	}
+	priv->rss_conf.rss_hf = rss_conf->rss_hf;
+out:
 	priv_unlock(priv);
-	assert(err >= 0);
-	return -err;
+	return ret;
 }
 
 /**
@@ -179,26 +105,17 @@ mlx5_rss_hash_conf_get(struct rte_eth_dev *dev,
 		       struct rte_eth_rss_conf *rss_conf)
 {
 	struct priv *priv = dev->data->dev_private;
-	struct rte_eth_rss_conf *priv_rss_conf;
 
-	priv_lock(priv);
-
-	assert(priv->rss_conf != NULL);
-
-	priv_rss_conf = rss_hash_get(priv, rss_conf->rss_hf);
-	if (!priv_rss_conf) {
-		rss_conf->rss_hf = 0;
-		priv_unlock(priv);
+	if (!rss_conf)
 		return -EINVAL;
-	}
+	priv_lock(priv);
 	if (rss_conf->rss_key &&
-	    rss_conf->rss_key_len >= priv_rss_conf->rss_key_len)
-		memcpy(rss_conf->rss_key,
-		       priv_rss_conf->rss_key,
-		       priv_rss_conf->rss_key_len);
-	rss_conf->rss_key_len = priv_rss_conf->rss_key_len;
-	rss_conf->rss_hf = priv_rss_conf->rss_hf;
-
+	    (rss_conf->rss_key_len >= priv->rss_conf.rss_key_len)) {
+		memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
+		       priv->rss_conf.rss_key_len);
+	}
+	rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
+	rss_conf->rss_hf = priv->rss_conf.rss_hf;
 	priv_unlock(priv);
 	return 0;
 }
@@ -357,11 +274,13 @@ mlx5_dev_rss_reta_update(struct rte_eth_dev *dev,
 	int ret;
 	struct priv *priv = dev->data->dev_private;
 
-	mlx5_dev_stop(dev);
+	assert(!mlx5_is_secondary());
 	priv_lock(priv);
 	ret = priv_dev_rss_reta_update(priv, reta_conf, reta_size);
 	priv_unlock(priv);
-	if (ret)
-		return -ret;
-	return mlx5_dev_start(dev);
+	if (dev->data->dev_started) {
+		mlx5_dev_stop(dev);
+		mlx5_dev_start(dev);
+	}
+	return -ret;
 }
diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c
index a67e5426..0ef2cdf0 100644
--- a/drivers/net/mlx5/mlx5_rxmode.c
+++ b/drivers/net/mlx5/mlx5_rxmode.c
@@ -45,343 +45,12 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_ethdev.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
 #include "mlx5_utils.h"
 
-/* Initialization data for special flows. */
-static const struct special_flow_init special_flow_init[] = {
-	[HASH_RXQ_FLOW_TYPE_PROMISC] = {
-		.dst_mac_val = "\x00\x00\x00\x00\x00\x00",
-		.dst_mac_mask = "\x00\x00\x00\x00\x00\x00",
-		.hash_types =
-			1 << HASH_RXQ_TCPV4 |
-			1 << HASH_RXQ_UDPV4 |
-			1 << HASH_RXQ_IPV4 |
-			1 << HASH_RXQ_TCPV6 |
-			1 << HASH_RXQ_UDPV6 |
-			1 << HASH_RXQ_IPV6 |
-			1 << HASH_RXQ_ETH |
-			0,
-		.per_vlan = 0,
-	},
-	[HASH_RXQ_FLOW_TYPE_ALLMULTI] = {
-		.dst_mac_val = "\x01\x00\x00\x00\x00\x00",
-		.dst_mac_mask = "\x01\x00\x00\x00\x00\x00",
-		.hash_types =
-			1 << HASH_RXQ_UDPV4 |
-			1 << HASH_RXQ_IPV4 |
-			1 << HASH_RXQ_UDPV6 |
-			1 << HASH_RXQ_IPV6 |
-			1 << HASH_RXQ_ETH |
-			0,
-		.per_vlan = 0,
-	},
-	[HASH_RXQ_FLOW_TYPE_BROADCAST] = {
-		.dst_mac_val = "\xff\xff\xff\xff\xff\xff",
-		.dst_mac_mask = "\xff\xff\xff\xff\xff\xff",
-		.hash_types =
-			1 << HASH_RXQ_UDPV4 |
-			1 << HASH_RXQ_IPV4 |
-			1 << HASH_RXQ_UDPV6 |
-			1 << HASH_RXQ_IPV6 |
-			1 << HASH_RXQ_ETH |
-			0,
-		.per_vlan = 1,
-	},
-	[HASH_RXQ_FLOW_TYPE_IPV6MULTI] = {
-		.dst_mac_val = "\x33\x33\x00\x00\x00\x00",
-		.dst_mac_mask = "\xff\xff\x00\x00\x00\x00",
-		.hash_types =
-			1 << HASH_RXQ_UDPV6 |
-			1 << HASH_RXQ_IPV6 |
-			1 << HASH_RXQ_ETH |
-			0,
-		.per_vlan = 1,
-	},
-};
-
-/**
- * Enable a special flow in a hash RX queue for a given VLAN index.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- * @param flow_type
- *   Special flow type.
- * @param vlan_index
- *   VLAN index to use.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-hash_rxq_special_flow_enable_vlan(struct hash_rxq *hash_rxq,
-				  enum hash_rxq_flow_type flow_type,
-				  unsigned int vlan_index)
-{
-	struct priv *priv = hash_rxq->priv;
-	struct ibv_exp_flow *flow;
-	FLOW_ATTR_SPEC_ETH(data, priv_flow_attr(priv, NULL, 0, hash_rxq->type));
-	struct ibv_exp_flow_attr *attr = &data->attr;
-	struct ibv_exp_flow_spec_eth *spec = &data->spec;
-	const uint8_t *mac;
-	const uint8_t *mask;
-	unsigned int vlan_enabled = (priv->vlan_filter_n &&
-				     special_flow_init[flow_type].per_vlan);
-	unsigned int vlan_id = priv->vlan_filter[vlan_index];
-
-	/* Check if flow is relevant for this hash_rxq. */
-	if (!(special_flow_init[flow_type].hash_types & (1 << hash_rxq->type)))
-		return 0;
-	/* Check if flow already exists. */
-	if (hash_rxq->special_flow[flow_type][vlan_index] != NULL)
-		return 0;
-
-	/*
-	 * No padding must be inserted by the compiler between attr and spec.
-	 * This layout is expected by libibverbs.
-	 */
-	assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec);
-	priv_flow_attr(priv, attr, sizeof(data), hash_rxq->type);
-	/* The first specification must be Ethernet. */
-	assert(spec->type == IBV_EXP_FLOW_SPEC_ETH);
-	assert(spec->size == sizeof(*spec));
-
-	mac = special_flow_init[flow_type].dst_mac_val;
-	mask = special_flow_init[flow_type].dst_mac_mask;
-	*spec = (struct ibv_exp_flow_spec_eth){
-		.type = IBV_EXP_FLOW_SPEC_ETH,
-		.size = sizeof(*spec),
-		.val = {
-			.dst_mac = {
-				mac[0], mac[1], mac[2],
-				mac[3], mac[4], mac[5],
-			},
-			.vlan_tag = (vlan_enabled ? htons(vlan_id) : 0),
-		},
-		.mask = {
-			.dst_mac = {
-				mask[0], mask[1], mask[2],
-				mask[3], mask[4], mask[5],
-			},
-			.vlan_tag = (vlan_enabled ? htons(0xfff) : 0),
-		},
-	};
-
-	errno = 0;
-	flow = ibv_exp_create_flow(hash_rxq->qp, attr);
-	if (flow == NULL) {
-		/* It's not clear whether errno is always set in this case. */
-		ERROR("%p: flow configuration failed, errno=%d: %s",
-		      (void *)hash_rxq, errno,
-		      (errno ? strerror(errno) : "Unknown error"));
-		if (errno)
-			return errno;
-		return EINVAL;
-	}
-	hash_rxq->special_flow[flow_type][vlan_index] = flow;
-	DEBUG("%p: special flow %s (index %d) VLAN %u (index %u) enabled",
-	      (void *)hash_rxq, hash_rxq_flow_type_str(flow_type), flow_type,
-	      vlan_id, vlan_index);
-	return 0;
-}
-
-/**
- * Disable a special flow in a hash RX queue for a given VLAN index.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- * @param flow_type
- *   Special flow type.
- * @param vlan_index
- *   VLAN index to use.
- */
-static void
-hash_rxq_special_flow_disable_vlan(struct hash_rxq *hash_rxq,
-				   enum hash_rxq_flow_type flow_type,
-				   unsigned int vlan_index)
-{
-	struct ibv_exp_flow *flow =
-		hash_rxq->special_flow[flow_type][vlan_index];
-
-	if (flow == NULL)
-		return;
-	claim_zero(ibv_exp_destroy_flow(flow));
-	hash_rxq->special_flow[flow_type][vlan_index] = NULL;
-	DEBUG("%p: special flow %s (index %d) VLAN %u (index %u) disabled",
-	      (void *)hash_rxq, hash_rxq_flow_type_str(flow_type), flow_type,
-	      hash_rxq->priv->vlan_filter[vlan_index], vlan_index);
-}
-
-/**
- * Enable a special flow in a hash RX queue.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- * @param flow_type
- *   Special flow type.
- * @param vlan_index
- *   VLAN index to use.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-hash_rxq_special_flow_enable(struct hash_rxq *hash_rxq,
-			     enum hash_rxq_flow_type flow_type)
-{
-	struct priv *priv = hash_rxq->priv;
-	unsigned int i = 0;
-	int ret;
-
-	assert((unsigned int)flow_type < RTE_DIM(hash_rxq->special_flow));
-	assert(RTE_DIM(hash_rxq->special_flow[flow_type]) ==
-	       RTE_DIM(priv->vlan_filter));
-	/* Add a special flow for each VLAN filter when relevant. */
-	do {
-		ret = hash_rxq_special_flow_enable_vlan(hash_rxq, flow_type, i);
-		if (ret) {
-			/* Failure, rollback. */
-			while (i != 0)
-				hash_rxq_special_flow_disable_vlan(hash_rxq,
-								   flow_type,
-								   --i);
-			return ret;
-		}
-	} while (special_flow_init[flow_type].per_vlan &&
-		 ++i < priv->vlan_filter_n);
-	return 0;
-}
-
-/**
- * Disable a special flow in a hash RX queue.
- *
- * @param hash_rxq
- *   Pointer to hash RX queue structure.
- * @param flow_type
- *   Special flow type.
- */
-static void
-hash_rxq_special_flow_disable(struct hash_rxq *hash_rxq,
-			      enum hash_rxq_flow_type flow_type)
-{
-	unsigned int i;
-
-	assert((unsigned int)flow_type < RTE_DIM(hash_rxq->special_flow));
-	for (i = 0; (i != RTE_DIM(hash_rxq->special_flow[flow_type])); ++i)
-		hash_rxq_special_flow_disable_vlan(hash_rxq, flow_type, i);
-}
-
-/**
- * Enable a special flow in all hash RX queues.
- *
- * @param priv
- *   Private structure.
- * @param flow_type
- *   Special flow type.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-priv_special_flow_enable(struct priv *priv, enum hash_rxq_flow_type flow_type)
-{
-	unsigned int i;
-
-	if (!priv_allow_flow_type(priv, flow_type))
-		return 0;
-	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
-		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
-		int ret;
-
-		ret = hash_rxq_special_flow_enable(hash_rxq, flow_type);
-		if (!ret)
-			continue;
-		/* Failure, rollback. */
-		while (i != 0) {
-			hash_rxq = &(*priv->hash_rxqs)[--i];
-			hash_rxq_special_flow_disable(hash_rxq, flow_type);
-		}
-		return ret;
-	}
-	return 0;
-}
-
-/**
- * Disable a special flow in all hash RX queues.
- *
- * @param priv
- *   Private structure.
- * @param flow_type
- *   Special flow type.
- */
-void
-priv_special_flow_disable(struct priv *priv, enum hash_rxq_flow_type flow_type)
-{
-	unsigned int i;
-
-	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
-		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
-
-		hash_rxq_special_flow_disable(hash_rxq, flow_type);
-	}
-}
-
-/**
- * Enable all special flows in all hash RX queues.
- *
- * @param priv
- *   Private structure.
- */
-int
-priv_special_flow_enable_all(struct priv *priv)
-{
-	enum hash_rxq_flow_type flow_type;
-
-	if (priv->isolated)
-		return 0;
-	for (flow_type = HASH_RXQ_FLOW_TYPE_PROMISC;
-			flow_type != HASH_RXQ_FLOW_TYPE_MAC;
-			++flow_type) {
-		int ret;
-
-		ret = priv_special_flow_enable(priv, flow_type);
-		if (!ret)
-			continue;
-		/* Failure, rollback. */
-		while (flow_type)
-			priv_special_flow_disable(priv, --flow_type);
-		return ret;
-	}
-	return 0;
-}
-
-/**
- * Disable all special flows in all hash RX queues.
- *
- * @param priv
- *   Private structure.
- */
-void
-priv_special_flow_disable_all(struct priv *priv)
-{
-	enum hash_rxq_flow_type flow_type;
-
-	for (flow_type = HASH_RXQ_FLOW_TYPE_PROMISC;
-			flow_type != HASH_RXQ_FLOW_TYPE_MAC;
-			++flow_type)
-		priv_special_flow_disable(priv, flow_type);
-}
-
 /**
  * DPDK callback to enable promiscuous mode.
  *
@@ -391,19 +60,10 @@ priv_special_flow_disable_all(struct priv *priv)
 void
 mlx5_promiscuous_enable(struct rte_eth_dev *dev)
 {
-	struct priv *priv = dev->data->dev_private;
-	int ret;
-
 	if (mlx5_is_secondary())
 		return;
-
-	priv_lock(priv);
-	priv->promisc_req = 1;
-	ret = priv_rehash_flows(priv);
-	if (ret)
-		ERROR("error while enabling promiscuous mode: %s",
-		      strerror(ret));
-	priv_unlock(priv);
+	dev->data->promiscuous = 1;
+	mlx5_traffic_restart(dev);
 }
 
 /**
@@ -415,19 +75,10 @@ mlx5_promiscuous_enable(struct rte_eth_dev *dev)
 void
 mlx5_promiscuous_disable(struct rte_eth_dev *dev)
 {
-	struct priv *priv = dev->data->dev_private;
-	int ret;
-
 	if (mlx5_is_secondary())
 		return;
-
-	priv_lock(priv);
-	priv->promisc_req = 0;
-	ret = priv_rehash_flows(priv);
-	if (ret)
-		ERROR("error while disabling promiscuous mode: %s",
-		      strerror(ret));
-	priv_unlock(priv);
+	dev->data->promiscuous = 0;
+	mlx5_traffic_restart(dev);
 }
 
 /**
@@ -439,19 +90,10 @@ mlx5_promiscuous_disable(struct rte_eth_dev *dev)
 void
 mlx5_allmulticast_enable(struct rte_eth_dev *dev)
 {
-	struct priv *priv = dev->data->dev_private;
-	int ret;
-
 	if (mlx5_is_secondary())
 		return;
-
-	priv_lock(priv);
-	priv->allmulti_req = 1;
-	ret = priv_rehash_flows(priv);
-	if (ret)
-		ERROR("error while enabling allmulticast mode: %s",
-		      strerror(ret));
-	priv_unlock(priv);
+	dev->data->all_multicast = 1;
+	mlx5_traffic_restart(dev);
 }
 
 /**
@@ -463,17 +105,8 @@ mlx5_allmulticast_enable(struct rte_eth_dev *dev)
 void
 mlx5_allmulticast_disable(struct rte_eth_dev *dev)
 {
-	struct priv *priv = dev->data->dev_private;
-	int ret;
-
 	if (mlx5_is_secondary())
 		return;
-
-	priv_lock(priv);
-	priv->allmulti_req = 0;
-	ret = priv_rehash_flows(priv);
-	if (ret)
-		ERROR("error while disabling allmulticast mode: %s",
-		      strerror(ret));
-	priv_unlock(priv);
+	dev->data->all_multicast = 0;
+	mlx5_traffic_restart(dev);
 }
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 74387a79..6b29aaee 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -37,6 +37,7 @@
 #include <string.h>
 #include <stdint.h>
 #include <fcntl.h>
+#include <sys/queue.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -44,25 +45,18 @@
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
 #include <infiniband/verbs.h>
-#include <infiniband/arch.h>
-#include <infiniband/mlx5_hw.h>
+#include <infiniband/mlx5dv.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_mbuf.h>
 #include <rte_malloc.h>
 #include <rte_ethdev.h>
 #include <rte_common.h>
 #include <rte_interrupts.h>
 #include <rte_debug.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
+#include <rte_io.h>
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
@@ -70,122 +64,6 @@
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
 
-/* Initialization data for hash RX queues. */
-const struct hash_rxq_init hash_rxq_init[] = {
-	[HASH_RXQ_TCPV4] = {
-		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
-				IBV_EXP_RX_HASH_DST_IPV4 |
-				IBV_EXP_RX_HASH_SRC_PORT_TCP |
-				IBV_EXP_RX_HASH_DST_PORT_TCP),
-		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
-		.flow_priority = 0,
-		.flow_spec.tcp_udp = {
-			.type = IBV_EXP_FLOW_SPEC_TCP,
-			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
-		},
-		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
-	},
-	[HASH_RXQ_UDPV4] = {
-		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
-				IBV_EXP_RX_HASH_DST_IPV4 |
-				IBV_EXP_RX_HASH_SRC_PORT_UDP |
-				IBV_EXP_RX_HASH_DST_PORT_UDP),
-		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
-		.flow_priority = 0,
-		.flow_spec.tcp_udp = {
-			.type = IBV_EXP_FLOW_SPEC_UDP,
-			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
-		},
-		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
-	},
-	[HASH_RXQ_IPV4] = {
-		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
-				IBV_EXP_RX_HASH_DST_IPV4),
-		.dpdk_rss_hf = (ETH_RSS_IPV4 |
-				ETH_RSS_FRAG_IPV4),
-		.flow_priority = 1,
-		.flow_spec.ipv4 = {
-			.type = IBV_EXP_FLOW_SPEC_IPV4,
-			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
-		},
-		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
-	},
-	[HASH_RXQ_TCPV6] = {
-		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
-				IBV_EXP_RX_HASH_DST_IPV6 |
-				IBV_EXP_RX_HASH_SRC_PORT_TCP |
-				IBV_EXP_RX_HASH_DST_PORT_TCP),
-		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
-		.flow_priority = 0,
-		.flow_spec.tcp_udp = {
-			.type = IBV_EXP_FLOW_SPEC_TCP,
-			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
-		},
-		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
-	},
-	[HASH_RXQ_UDPV6] = {
-		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
-				IBV_EXP_RX_HASH_DST_IPV6 |
-				IBV_EXP_RX_HASH_SRC_PORT_UDP |
-				IBV_EXP_RX_HASH_DST_PORT_UDP),
-		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
-		.flow_priority = 0,
-		.flow_spec.tcp_udp = {
-			.type = IBV_EXP_FLOW_SPEC_UDP,
-			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
-		},
-		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
-	},
-	[HASH_RXQ_IPV6] = {
-		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
-				IBV_EXP_RX_HASH_DST_IPV6),
-		.dpdk_rss_hf = (ETH_RSS_IPV6 |
-				ETH_RSS_FRAG_IPV6),
-		.flow_priority = 1,
-		.flow_spec.ipv6 = {
-			.type = IBV_EXP_FLOW_SPEC_IPV6,
-			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
-		},
-		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
-	},
-	[HASH_RXQ_ETH] = {
-		.hash_fields = 0,
-		.dpdk_rss_hf = 0,
-		.flow_priority = 2,
-		.flow_spec.eth = {
-			.type = IBV_EXP_FLOW_SPEC_ETH,
-			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
-		},
-		.underlayer = NULL,
-	},
-};
-
-/* Number of entries in hash_rxq_init[]. */
-const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
-
-/* Initialization data for hash RX queue indirection tables. */
-static const struct ind_table_init ind_table_init[] = {
-	{
-		.max_size = -1u, /* Superseded by HW limitations. */
-		.hash_types =
-			1 << HASH_RXQ_TCPV4 |
-			1 << HASH_RXQ_UDPV4 |
-			1 << HASH_RXQ_IPV4 |
-			1 << HASH_RXQ_TCPV6 |
-			1 << HASH_RXQ_UDPV6 |
-			1 << HASH_RXQ_IPV6 |
-			0,
-		.hash_types_n = 6,
-	},
-	{
-		.max_size = 1,
-		.hash_types = 1 << HASH_RXQ_ETH,
-		.hash_types_n = 1,
-	},
-};
-
-#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
-
 /* Default RSS hash key also used for ConnectX-3. */
 uint8_t rss_hash_default_key[] = {
 	0x2c, 0xc6, 0x81, 0xd1,
@@ -204,495 +82,27 @@ uint8_t rss_hash_default_key[] = {
 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
 
 /**
- * Populate flow steering rule for a given hash RX queue type using
- * information from hash_rxq_init[]. Nothing is written to flow_attr when
- * flow_attr_size is not large enough, but the required size is still returned.
- *
- * @param priv
- *   Pointer to private structure.
- * @param[out] flow_attr
- *   Pointer to flow attribute structure to fill. Note that the allocated
- *   area must be larger and large enough to hold all flow specifications.
- * @param flow_attr_size
- *   Entire size of flow_attr and trailing room for flow specifications.
- * @param type
- *   Hash RX queue type to use for flow steering rule.
- *
- * @return
- *   Total size of the flow attribute buffer. No errors are defined.
- */
-size_t
-priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
-	       size_t flow_attr_size, enum hash_rxq_type type)
-{
-	size_t offset = sizeof(*flow_attr);
-	const struct hash_rxq_init *init = &hash_rxq_init[type];
-
-	assert(priv != NULL);
-	assert((size_t)type < RTE_DIM(hash_rxq_init));
-	do {
-		offset += init->flow_spec.hdr.size;
-		init = init->underlayer;
-	} while (init != NULL);
-	if (offset > flow_attr_size)
-		return offset;
-	flow_attr_size = offset;
-	init = &hash_rxq_init[type];
-	*flow_attr = (struct ibv_exp_flow_attr){
-		.type = IBV_EXP_FLOW_ATTR_NORMAL,
-		/* Priorities < 3 are reserved for flow director. */
-		.priority = init->flow_priority + 3,
-		.num_of_specs = 0,
-		.port = priv->port,
-		.flags = 0,
-	};
-	do {
-		offset -= init->flow_spec.hdr.size;
-		memcpy((void *)((uintptr_t)flow_attr + offset),
-		       &init->flow_spec,
-		       init->flow_spec.hdr.size);
-		++flow_attr->num_of_specs;
-		init = init->underlayer;
-	} while (init != NULL);
-	return flow_attr_size;
-}
-
-/**
- * Convert hash type position in indirection table initializer to
- * hash RX queue type.
- *
- * @param table
- *   Indirection table initializer.
- * @param pos
- *   Hash type position.
- *
- * @return
- *   Hash RX queue type.
- */
-static enum hash_rxq_type
-hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
-{
-	enum hash_rxq_type type = HASH_RXQ_TCPV4;
-
-	assert(pos < table->hash_types_n);
-	do {
-		if ((table->hash_types & (1 << type)) && (pos-- == 0))
-			break;
-		++type;
-	} while (1);
-	return type;
-}
-
-/**
- * Filter out disabled hash RX queue types from ind_table_init[].
- *
- * @param priv
- *   Pointer to private structure.
- * @param[out] table
- *   Output table.
- *
- * @return
- *   Number of table entries.
- */
-static unsigned int
-priv_make_ind_table_init(struct priv *priv,
-			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
-{
-	uint64_t rss_hf;
-	unsigned int i;
-	unsigned int j;
-	unsigned int table_n = 0;
-	/* Mandatory to receive frames not handled by normal hash RX queues. */
-	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
-
-	rss_hf = priv->rss_hf;
-	/* Process other protocols only if more than one queue. */
-	if (priv->rxqs_n > 1)
-		for (i = 0; (i != hash_rxq_init_n); ++i)
-			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
-				hash_types_sup |= (1 << i);
-
-	/* Filter out entries whose protocols are not in the set. */
-	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
-		unsigned int nb;
-		unsigned int h;
-
-		/* j is increased only if the table has valid protocols. */
-		assert(j <= i);
-		(*table)[j] = ind_table_init[i];
-		(*table)[j].hash_types &= hash_types_sup;
-		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
-			if (((*table)[j].hash_types >> h) & 0x1)
-				++nb;
-		(*table)[i].hash_types_n = nb;
-		if (nb) {
-			++table_n;
-			++j;
-		}
-	}
-	return table_n;
-}
-
-/**
- * Initialize hash RX queues and indirection table.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-priv_create_hash_rxqs(struct priv *priv)
-{
-	struct ibv_exp_wq *wqs[priv->reta_idx_n];
-	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
-	unsigned int ind_tables_n =
-		priv_make_ind_table_init(priv, &ind_table_init);
-	unsigned int hash_rxqs_n = 0;
-	struct hash_rxq (*hash_rxqs)[] = NULL;
-	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
-	unsigned int i;
-	unsigned int j;
-	unsigned int k;
-	int err = 0;
-
-	assert(priv->ind_tables == NULL);
-	assert(priv->ind_tables_n == 0);
-	assert(priv->hash_rxqs == NULL);
-	assert(priv->hash_rxqs_n == 0);
-	assert(priv->pd != NULL);
-	assert(priv->ctx != NULL);
-	if (priv->isolated)
-		return 0;
-	if (priv->rxqs_n == 0)
-		return EINVAL;
-	assert(priv->rxqs != NULL);
-	if (ind_tables_n == 0) {
-		ERROR("all hash RX queue types have been filtered out,"
-		      " indirection table cannot be created");
-		return EINVAL;
-	}
-	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
-		INFO("%u RX queues are configured, consider rounding this"
-		     " number to the next power of two for better balancing",
-		     priv->rxqs_n);
-		DEBUG("indirection table extended to assume %u WQs",
-		      priv->reta_idx_n);
-	}
-	for (i = 0; (i != priv->reta_idx_n); ++i) {
-		struct rxq_ctrl *rxq_ctrl;
-
-		rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
-					struct rxq_ctrl, rxq);
-		wqs[i] = rxq_ctrl->wq;
-	}
-	/* Get number of hash RX queues to configure. */
-	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
-		hash_rxqs_n += ind_table_init[i].hash_types_n;
-	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
-	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
-	/* Create indirection tables. */
-	ind_tables = rte_calloc(__func__, ind_tables_n,
-				sizeof((*ind_tables)[0]), 0);
-	if (ind_tables == NULL) {
-		err = ENOMEM;
-		ERROR("cannot allocate indirection tables container: %s",
-		      strerror(err));
-		goto error;
-	}
-	for (i = 0; (i != ind_tables_n); ++i) {
-		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
-			.pd = priv->pd,
-			.log_ind_tbl_size = 0, /* Set below. */
-			.ind_tbl = wqs,
-			.comp_mask = 0,
-		};
-		unsigned int ind_tbl_size = ind_table_init[i].max_size;
-		struct ibv_exp_rwq_ind_table *ind_table;
-
-		if (priv->reta_idx_n < ind_tbl_size)
-			ind_tbl_size = priv->reta_idx_n;
-		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
-		errno = 0;
-		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
-							 &ind_init_attr);
-		if (ind_table != NULL) {
-			(*ind_tables)[i] = ind_table;
-			continue;
-		}
-		/* Not clear whether errno is set. */
-		err = (errno ? errno : EINVAL);
-		ERROR("RX indirection table creation failed with error %d: %s",
-		      err, strerror(err));
-		goto error;
-	}
-	/* Allocate array that holds hash RX queues and related data. */
-	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
-			       sizeof((*hash_rxqs)[0]), 0);
-	if (hash_rxqs == NULL) {
-		err = ENOMEM;
-		ERROR("cannot allocate hash RX queues container: %s",
-		      strerror(err));
-		goto error;
-	}
-	for (i = 0, j = 0, k = 0;
-	     ((i != hash_rxqs_n) && (j != ind_tables_n));
-	     ++i) {
-		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
-		enum hash_rxq_type type =
-			hash_rxq_type_from_pos(&ind_table_init[j], k);
-		struct rte_eth_rss_conf *priv_rss_conf =
-			(*priv->rss_conf)[type];
-		struct ibv_exp_rx_hash_conf hash_conf = {
-			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
-			.rx_hash_key_len = (priv_rss_conf ?
-					    priv_rss_conf->rss_key_len :
-					    rss_hash_default_key_len),
-			.rx_hash_key = (priv_rss_conf ?
-					priv_rss_conf->rss_key :
-					rss_hash_default_key),
-			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
-			.rwq_ind_tbl = (*ind_tables)[j],
-		};
-		struct ibv_exp_qp_init_attr qp_init_attr = {
-			.max_inl_recv = 0, /* Currently not supported. */
-			.qp_type = IBV_QPT_RAW_PACKET,
-			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
-				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
-			.pd = priv->pd,
-			.rx_hash_conf = &hash_conf,
-			.port_num = priv->port,
-		};
-
-		DEBUG("using indirection table %u for hash RX queue %u type %d",
-		      j, i, type);
-		*hash_rxq = (struct hash_rxq){
-			.priv = priv,
-			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
-			.type = type,
-		};
-		if (hash_rxq->qp == NULL) {
-			err = (errno ? errno : EINVAL);
-			ERROR("Hash RX QP creation failure: %s",
-			      strerror(err));
-			goto error;
-		}
-		if (++k < ind_table_init[j].hash_types_n)
-			continue;
-		/* Switch to the next indirection table and reset hash RX
-		 * queue type array index. */
-		++j;
-		k = 0;
-	}
-	priv->ind_tables = ind_tables;
-	priv->ind_tables_n = ind_tables_n;
-	priv->hash_rxqs = hash_rxqs;
-	priv->hash_rxqs_n = hash_rxqs_n;
-	assert(err == 0);
-	return 0;
-error:
-	if (hash_rxqs != NULL) {
-		for (i = 0; (i != hash_rxqs_n); ++i) {
-			struct ibv_qp *qp = (*hash_rxqs)[i].qp;
-
-			if (qp == NULL)
-				continue;
-			claim_zero(ibv_destroy_qp(qp));
-		}
-		rte_free(hash_rxqs);
-	}
-	if (ind_tables != NULL) {
-		for (j = 0; (j != ind_tables_n); ++j) {
-			struct ibv_exp_rwq_ind_table *ind_table =
-				(*ind_tables)[j];
-
-			if (ind_table == NULL)
-				continue;
-			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
-		}
-		rte_free(ind_tables);
-	}
-	return err;
-}
-
-/**
- * Clean up hash RX queues and indirection table.
- *
- * @param priv
- *   Pointer to private structure.
- */
-void
-priv_destroy_hash_rxqs(struct priv *priv)
-{
-	unsigned int i;
-
-	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
-	if (priv->hash_rxqs_n == 0) {
-		assert(priv->hash_rxqs == NULL);
-		assert(priv->ind_tables == NULL);
-		return;
-	}
-	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
-		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
-		unsigned int j, k;
-
-		assert(hash_rxq->priv == priv);
-		assert(hash_rxq->qp != NULL);
-		/* Also check that there are no remaining flows. */
-		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
-			for (k = 0;
-			     (k != RTE_DIM(hash_rxq->special_flow[j]));
-			     ++k)
-				assert(hash_rxq->special_flow[j][k] == NULL);
-		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
-			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
-				assert(hash_rxq->mac_flow[j][k] == NULL);
-		claim_zero(ibv_destroy_qp(hash_rxq->qp));
-	}
-	priv->hash_rxqs_n = 0;
-	rte_free(priv->hash_rxqs);
-	priv->hash_rxqs = NULL;
-	for (i = 0; (i != priv->ind_tables_n); ++i) {
-		struct ibv_exp_rwq_ind_table *ind_table =
-			(*priv->ind_tables)[i];
-
-		assert(ind_table != NULL);
-		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
-	}
-	priv->ind_tables_n = 0;
-	rte_free(priv->ind_tables);
-	priv->ind_tables = NULL;
-}
-
-/**
- * Check whether a given flow type is allowed.
- *
- * @param priv
- *   Pointer to private structure.
- * @param type
- *   Flow type to check.
- *
- * @return
- *   Nonzero if the given flow type is allowed.
- */
-int
-priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
-{
-	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
-	 * has been requested. */
-	if (priv->promisc_req)
-		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
-	switch (type) {
-	case HASH_RXQ_FLOW_TYPE_PROMISC:
-		return !!priv->promisc_req;
-	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
-		return !!priv->allmulti_req;
-	case HASH_RXQ_FLOW_TYPE_BROADCAST:
-	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
-		/* If allmulti is enabled, broadcast and ipv6multi
-		 * are unnecessary. */
-		return !priv->allmulti_req;
-	case HASH_RXQ_FLOW_TYPE_MAC:
-		return 1;
-	default:
-		/* Unsupported flow type is not allowed. */
-		return 0;
-	}
-	return 0;
-}
-
-/**
- * Automatically enable/disable flows according to configuration.
- *
- * @param priv
- *   Private structure.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-int
-priv_rehash_flows(struct priv *priv)
-{
-	enum hash_rxq_flow_type i;
-
-	for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
-			i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
-			++i)
-		if (!priv_allow_flow_type(priv, i)) {
-			priv_special_flow_disable(priv, i);
-		} else {
-			int ret = priv_special_flow_enable(priv, i);
-
-			if (ret)
-				return ret;
-		}
-	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
-		return priv_mac_addrs_enable(priv);
-	priv_mac_addrs_disable(priv);
-	return 0;
-}
-
-/**
- * Unlike regular Rx function, vPMD Rx doesn't replace mbufs immediately when
- * receiving packets. Instead it replaces later in bulk. In rxq->elts[], entries
- * from rq_pi to rq_ci are owned by device but the rest is already delivered to
- * application. In order not to reuse those mbufs by rxq_alloc_elts(), this
- * function must be called to replace used mbufs.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- */
-static void
-rxq_trim_elts(struct rxq *rxq)
-{
-	const uint16_t q_n = (1 << rxq->elts_n);
-	const uint16_t q_mask = q_n - 1;
-	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
-	uint16_t i;
-
-	if (!rxq->trim_elts)
-		return;
-	for (i = 0; i < used; ++i)
-		(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
-	rxq->trim_elts = 0;
-	return;
-}
-
-/**
  * Allocate RX queue elements.
  *
  * @param rxq_ctrl
  *   Pointer to RX queue structure.
- * @param elts_n
- *   Number of elements to allocate.
- * @param[in] pool
- *   If not NULL, fetch buffers from this array instead of allocating them
- *   with rte_pktmbuf_alloc().
  *
  * @return
  *   0 on success, errno value on failure.
  */
-static int
-rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
-	       struct rte_mbuf *(*pool)[])
+int
+rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
 	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
+	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
 	unsigned int i;
 	int ret = 0;
 
 	/* Iterate on segments. */
 	for (i = 0; (i != elts_n); ++i) {
 		struct rte_mbuf *buf;
-		volatile struct mlx5_wqe_data_seg *scat =
-			&(*rxq_ctrl->rxq.wqes)[i];
-
-		buf = (pool != NULL) ? (*pool)[i] : NULL;
-		if (buf != NULL) {
-			rte_pktmbuf_reset(buf);
-			rte_pktmbuf_refcnt_update(buf, 1);
-		} else
-			buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
+
+		buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
 		if (buf == NULL) {
 			ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
 			ret = ENOMEM;
@@ -711,21 +121,35 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
 		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
 		PKT_LEN(buf) = DATA_LEN(buf);
 		NB_SEGS(buf) = 1;
-		/* scat->addr must be able to store a pointer. */
-		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
-		*scat = (struct mlx5_wqe_data_seg){
-			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
-			.byte_count = htonl(DATA_LEN(buf)),
-			.lkey = htonl(rxq_ctrl->mr->lkey),
-		};
 		(*rxq_ctrl->rxq.elts)[i] = buf;
 	}
+	/* If Rx vector is activated. */
+	if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
+		struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
+		struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
+		int j;
+
+		/* Initialize default rearm_data for vPMD. */
+		mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
+		rte_mbuf_refcnt_set(mbuf_init, 1);
+		mbuf_init->nb_segs = 1;
+		mbuf_init->port = rxq->port_id;
+		/*
+		 * prevent compiler reordering:
+		 * rearm_data covers previous fields.
+		 */
+		rte_compiler_barrier();
+		rxq->mbuf_initializer =
+			*(uint64_t *)&mbuf_init->rearm_data;
+		/* Padding with a fake mbuf for vectorized Rx. */
+		for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
+			(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
+	}
 	DEBUG("%p: allocated and configured %u segments (max %u packets)",
 	      (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
 	assert(ret == 0);
 	return 0;
 error:
-	assert(pool == NULL);
 	elts_n = i;
 	for (i = 0; (i != elts_n); ++i) {
 		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
@@ -744,19 +168,30 @@ error:
  *   Pointer to RX queue structure.
  */
 static void
-rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
+rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
-	unsigned int i;
+	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
+	const uint16_t q_n = (1 << rxq->elts_n);
+	const uint16_t q_mask = q_n - 1;
+	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
+	uint16_t i;
 
-	rxq_trim_elts(&rxq_ctrl->rxq);
 	DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
-	if (rxq_ctrl->rxq.elts == NULL)
+	if (rxq->elts == NULL)
 		return;
-
-	for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
-		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
-			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
-		(*rxq_ctrl->rxq.elts)[i] = NULL;
+	/**
+	 * Some mbuf in the Ring belongs to the application.  They cannot be
+	 * freed.
+	 */
+	if (rxq_check_vec_support(rxq) > 0) {
+		for (i = 0; i < used; ++i)
+			(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
+		rxq->rq_pi = rxq->rq_ci;
+	}
+	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
+		if ((*rxq->elts)[i] != NULL)
+			rte_pktmbuf_free_seg((*rxq->elts)[i]);
+		(*rxq->elts)[i] = NULL;
 	}
 }
 
@@ -769,343 +204,15 @@ rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
  *   Pointer to RX queue structure.
  */
 void
-rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
+mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
 	DEBUG("cleaning up %p", (void *)rxq_ctrl);
-	rxq_free_elts(rxq_ctrl);
-	if (rxq_ctrl->fdir_queue != NULL)
-		priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
-	if (rxq_ctrl->wq != NULL)
-		claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
-	if (rxq_ctrl->cq != NULL)
-		claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
-	if (rxq_ctrl->channel != NULL)
-		claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
-	if (rxq_ctrl->mr != NULL)
-		claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
+	if (rxq_ctrl->ibv)
+		mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
 	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
 }
 
 /**
- * Initialize RX queue.
- *
- * @param tmpl
- *   Pointer to RX queue control template.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static inline int
-rxq_setup(struct rxq_ctrl *tmpl)
-{
-	struct ibv_cq *ibcq = tmpl->cq;
-	struct ibv_mlx5_cq_info cq_info;
-	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
-	struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
-		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
-
-	if (ibv_mlx5_exp_get_cq_info(ibcq, &cq_info)) {
-		ERROR("Unable to query CQ info. check your OFED.");
-		return ENOTSUP;
-	}
-	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
-		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
-		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
-		return EINVAL;
-	}
-	if (elts == NULL)
-		return ENOMEM;
-	tmpl->rxq.rq_db = rwq->rq.db;
-	tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt);
-	tmpl->rxq.cq_ci = 0;
-	tmpl->rxq.rq_ci = 0;
-	tmpl->rxq.rq_pi = 0;
-	tmpl->rxq.cq_db = cq_info.dbrec;
-	tmpl->rxq.wqes =
-		(volatile struct mlx5_wqe_data_seg (*)[])
-		(uintptr_t)rwq->rq.buff;
-	tmpl->rxq.cqes =
-		(volatile struct mlx5_cqe (*)[])
-		(uintptr_t)cq_info.buf;
-	tmpl->rxq.elts = elts;
-	return 0;
-}
-
-/**
- * Configure a RX queue.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param rxq_ctrl
- *   Pointer to RX queue structure.
- * @param desc
- *   Number of descriptors to configure in queue.
- * @param socket
- *   NUMA socket on which memory must be allocated.
- * @param[in] conf
- *   Thresholds parameters.
- * @param mp
- *   Memory pool for buffer allocations.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static int
-rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
-	       uint16_t desc, unsigned int socket,
-	       const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
-{
-	struct priv *priv = dev->data->dev_private;
-	struct rxq_ctrl tmpl = {
-		.priv = priv,
-		.socket = socket,
-		.rxq = {
-			.elts_n = log2above(desc),
-			.mp = mp,
-			.rss_hash = priv->rxqs_n > 1,
-		},
-	};
-	struct ibv_exp_wq_attr mod;
-	union {
-		struct ibv_exp_cq_init_attr cq;
-		struct ibv_exp_wq_init_attr wq;
-		struct ibv_exp_cq_attr cq_attr;
-	} attr;
-	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
-	unsigned int cqe_n = desc - 1;
-	struct rte_mbuf *(*elts)[desc] = NULL;
-	int ret = 0;
-
-	(void)conf; /* Thresholds configuration (ignored). */
-	/* Enable scattered packets support for this queue if necessary. */
-	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
-	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
-	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
-		tmpl.rxq.sges_n = 0;
-	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
-		unsigned int size =
-			RTE_PKTMBUF_HEADROOM +
-			dev->data->dev_conf.rxmode.max_rx_pkt_len;
-		unsigned int sges_n;
-
-		/*
-		 * Determine the number of SGEs needed for a full packet
-		 * and round it to the next power of two.
-		 */
-		sges_n = log2above((size / mb_len) + !!(size % mb_len));
-		tmpl.rxq.sges_n = sges_n;
-		/* Make sure rxq.sges_n did not overflow. */
-		size = mb_len * (1 << tmpl.rxq.sges_n);
-		size -= RTE_PKTMBUF_HEADROOM;
-		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
-			ERROR("%p: too many SGEs (%u) needed to handle"
-			      " requested maximum packet size %u",
-			      (void *)dev,
-			      1 << sges_n,
-			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
-			return EOVERFLOW;
-		}
-	} else {
-		WARN("%p: the requested maximum Rx packet size (%u) is"
-		     " larger than a single mbuf (%u) and scattered"
-		     " mode has not been requested",
-		     (void *)dev,
-		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
-		     mb_len - RTE_PKTMBUF_HEADROOM);
-	}
-	DEBUG("%p: maximum number of segments per packet: %u",
-	      (void *)dev, 1 << tmpl.rxq.sges_n);
-	if (desc % (1 << tmpl.rxq.sges_n)) {
-		ERROR("%p: number of RX queue descriptors (%u) is not a"
-		      " multiple of SGEs per packet (%u)",
-		      (void *)dev,
-		      desc,
-		      1 << tmpl.rxq.sges_n);
-		return EINVAL;
-	}
-	/* Toggle RX checksum offload if hardware supports it. */
-	if (priv->hw_csum)
-		tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
-	if (priv->hw_csum_l2tun)
-		tmpl.rxq.csum_l2tun =
-			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
-	/* Use the entire RX mempool as the memory region. */
-	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
-	if (tmpl.mr == NULL) {
-		ret = EINVAL;
-		ERROR("%p: MR creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	if (dev->data->dev_conf.intr_conf.rxq) {
-		tmpl.channel = ibv_create_comp_channel(priv->ctx);
-		if (tmpl.channel == NULL) {
-			ret = ENOMEM;
-			ERROR("%p: Rx interrupt completion channel creation"
-			      " failure: %s",
-			      (void *)dev, strerror(ret));
-			goto error;
-		}
-	}
-	attr.cq = (struct ibv_exp_cq_init_attr){
-		.comp_mask = 0,
-	};
-	if (priv->cqe_comp) {
-		attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
-		attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
-		/*
-		 * For vectorized Rx, it must not be doubled in order to
-		 * make cq_ci and rq_ci aligned.
-		 */
-		if (rxq_check_vec_support(&tmpl.rxq) < 0)
-			cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
-	}
-	tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0,
-				    &attr.cq);
-	if (tmpl.cq == NULL) {
-		ret = ENOMEM;
-		ERROR("%p: CQ creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	DEBUG("priv->device_attr.max_qp_wr is %d",
-	      priv->device_attr.max_qp_wr);
-	DEBUG("priv->device_attr.max_sge is %d",
-	      priv->device_attr.max_sge);
-	/* Configure VLAN stripping. */
-	tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
-			       !!dev->data->dev_conf.rxmode.hw_vlan_strip);
-	attr.wq = (struct ibv_exp_wq_init_attr){
-		.wq_context = NULL, /* Could be useful in the future. */
-		.wq_type = IBV_EXP_WQT_RQ,
-		/* Max number of outstanding WRs. */
-		.max_recv_wr = desc >> tmpl.rxq.sges_n,
-		/* Max number of scatter/gather elements in a WR. */
-		.max_recv_sge = 1 << tmpl.rxq.sges_n,
-		.pd = priv->pd,
-		.cq = tmpl.cq,
-		.comp_mask =
-			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
-			0,
-		.vlan_offloads = (tmpl.rxq.vlan_strip ?
-				  IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
-				  0),
-	};
-	/* By default, FCS (CRC) is stripped by hardware. */
-	if (dev->data->dev_conf.rxmode.hw_strip_crc) {
-		tmpl.rxq.crc_present = 0;
-	} else if (priv->hw_fcs_strip) {
-		/* Ask HW/Verbs to leave CRC in place when supported. */
-		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
-		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
-		tmpl.rxq.crc_present = 1;
-	} else {
-		WARN("%p: CRC stripping has been disabled but will still"
-		     " be performed by hardware, make sure MLNX_OFED and"
-		     " firmware are up to date",
-		     (void *)dev);
-		tmpl.rxq.crc_present = 0;
-	}
-	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
-	      " incoming frames to hide it",
-	      (void *)dev,
-	      tmpl.rxq.crc_present ? "disabled" : "enabled",
-	      tmpl.rxq.crc_present << 2);
-	if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
-		; /* Nothing else to do. */
-	else if (priv->hw_padding) {
-		INFO("%p: enabling packet padding on queue %p",
-		     (void *)dev, (void *)rxq_ctrl);
-		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
-		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
-	} else
-		WARN("%p: packet padding has been requested but is not"
-		     " supported, make sure MLNX_OFED and firmware are"
-		     " up to date",
-		     (void *)dev);
-
-	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
-	if (tmpl.wq == NULL) {
-		ret = (errno ? errno : EINVAL);
-		ERROR("%p: WQ creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	/*
-	 * Make sure number of WRs*SGEs match expectations since a queue
-	 * cannot allocate more than "desc" buffers.
-	 */
-	if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
-	    ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
-		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
-		      (void *)dev,
-		      (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
-		      attr.wq.max_recv_wr, attr.wq.max_recv_sge);
-		ret = EINVAL;
-		goto error;
-	}
-	/* Save port ID. */
-	tmpl.rxq.port_id = dev->data->port_id;
-	DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
-	/* Change queue state to ready. */
-	mod = (struct ibv_exp_wq_attr){
-		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
-		.wq_state = IBV_EXP_WQS_RDY,
-	};
-	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
-	if (ret) {
-		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	ret = rxq_setup(&tmpl);
-	if (ret) {
-		ERROR("%p: cannot initialize RX queue structure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	/* Reuse buffers from original queue if possible. */
-	if (rxq_ctrl->rxq.elts_n) {
-		assert(1 << rxq_ctrl->rxq.elts_n == desc);
-		assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
-		rxq_trim_elts(&rxq_ctrl->rxq);
-		ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
-	} else
-		ret = rxq_alloc_elts(&tmpl, desc, NULL);
-	if (ret) {
-		ERROR("%p: RXQ allocation failed: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	/* Clean up rxq in case we're reinitializing it. */
-	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
-	rxq_cleanup(rxq_ctrl);
-	/* Move mbuf pointers to dedicated storage area in RX queue. */
-	elts = (void *)(rxq_ctrl + 1);
-	rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
-#ifndef NDEBUG
-	memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
-#endif
-	rte_free(tmpl.rxq.elts);
-	tmpl.rxq.elts = elts;
-	*rxq_ctrl = tmpl;
-	/* Update doorbell counter. */
-	rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
-	rte_wmb();
-	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
-	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
-	assert(ret == 0);
-	return 0;
-error:
-	elts = tmpl.rxq.elts;
-	rxq_cleanup(&tmpl);
-	rte_free(elts);
-	assert(ret > 0);
-	return ret;
-}
-
-/**
- * DPDK callback to configure a RX queue.
  *
  * @param dev
  *   Pointer to Ethernet device structure.
@@ -1129,14 +236,14 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		    struct rte_mempool *mp)
 {
 	struct priv *priv = dev->data->dev_private;
-	struct rxq *rxq = (*priv->rxqs)[idx];
-	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-	const uint16_t desc_pad = MLX5_VPMD_DESCS_PER_LOOP; /* For vPMD. */
-	int ret;
+	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	int ret = 0;
 
+	(void)conf;
 	if (mlx5_is_secondary())
 		return -E_RTE_SECONDARY;
-
 	priv_lock(priv);
 	if (!rte_is_power_of_2(desc)) {
 		desc = 1 << log2above(desc);
@@ -1152,50 +259,24 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		priv_unlock(priv);
 		return -EOVERFLOW;
 	}
-	if (rxq != NULL) {
-		DEBUG("%p: reusing already allocated queue index %u (%p)",
-		      (void *)dev, idx, (void *)rxq);
-		if (priv->started) {
-			priv_unlock(priv);
-			return -EEXIST;
-		}
-		(*priv->rxqs)[idx] = NULL;
-		rxq_cleanup(rxq_ctrl);
-		/* Resize if rxq size is changed. */
-		if (rxq_ctrl->rxq.elts_n != log2above(desc)) {
-			rxq_ctrl = rte_realloc(rxq_ctrl,
-					       sizeof(*rxq_ctrl) +
-					       (desc + desc_pad) *
-						sizeof(struct rte_mbuf *),
-					       RTE_CACHE_LINE_SIZE);
-			if (!rxq_ctrl) {
-				ERROR("%p: unable to reallocate queue index %u",
-					(void *)dev, idx);
-				priv_unlock(priv);
-				return -ENOMEM;
-			}
-		}
-	} else {
-		rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
-					     (desc + desc_pad) *
-					      sizeof(struct rte_mbuf *),
-					     0, socket);
-		if (rxq_ctrl == NULL) {
-			ERROR("%p: unable to allocate queue index %u",
-			      (void *)dev, idx);
-			priv_unlock(priv);
-			return -ENOMEM;
-		}
+	if (!mlx5_priv_rxq_releasable(priv, idx)) {
+		ret = EBUSY;
+		ERROR("%p: unable to release queue index %u",
+		      (void *)dev, idx);
+		goto out;
 	}
-	ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
-	if (ret)
-		rte_free(rxq_ctrl);
-	else {
-		rxq_ctrl->rxq.stats.idx = idx;
-		DEBUG("%p: adding RX queue %p to list",
-		      (void *)dev, (void *)rxq_ctrl);
-		(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+	mlx5_priv_rxq_release(priv, idx);
+	rxq_ctrl = mlx5_priv_rxq_new(priv, idx, desc, socket, mp);
+	if (!rxq_ctrl) {
+		ERROR("%p: unable to allocate queue index %u",
+		      (void *)dev, idx);
+		ret = ENOMEM;
+		goto out;
 	}
+	DEBUG("%p: adding RX queue %p to list",
+	      (void *)dev, (void *)rxq_ctrl);
+	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+out:
 	priv_unlock(priv);
 	return -ret;
 }
@@ -1209,76 +290,26 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 void
 mlx5_rx_queue_release(void *dpdk_rxq)
 {
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_ctrl *rxq_ctrl;
+	struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
+	struct mlx5_rxq_ctrl *rxq_ctrl;
 	struct priv *priv;
-	unsigned int i;
 
 	if (mlx5_is_secondary())
 		return;
 
 	if (rxq == NULL)
 		return;
-	rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 	priv = rxq_ctrl->priv;
 	priv_lock(priv);
-	if (priv_flow_rxq_in_use(priv, rxq))
+	if (!mlx5_priv_rxq_releasable(priv, rxq_ctrl->rxq.stats.idx))
 		rte_panic("Rx queue %p is still used by a flow and cannot be"
 			  " removed\n", (void *)rxq_ctrl);
-	for (i = 0; (i != priv->rxqs_n); ++i)
-		if ((*priv->rxqs)[i] == rxq) {
-			DEBUG("%p: removing RX queue %p from list",
-			      (void *)priv->dev, (void *)rxq_ctrl);
-			(*priv->rxqs)[i] = NULL;
-			break;
-		}
-	rxq_cleanup(rxq_ctrl);
-	rte_free(rxq_ctrl);
+	mlx5_priv_rxq_release(priv, rxq_ctrl->rxq.stats.idx);
 	priv_unlock(priv);
 }
 
 /**
- * DPDK callback for RX in secondary processes.
- *
- * This function configures all queues from primary process information
- * if necessary before reverting to the normal RX burst callback.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-uint16_t
-mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
-			      uint16_t pkts_n)
-{
-	struct rxq *rxq = dpdk_rxq;
-	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-	struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
-	struct priv *primary_priv;
-	unsigned int index;
-
-	if (priv == NULL)
-		return 0;
-	primary_priv =
-		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
-	/* Look for queue index in both private structures. */
-	for (index = 0; index != priv->rxqs_n; ++index)
-		if (((*primary_priv->rxqs)[index] == rxq) ||
-		    ((*priv->rxqs)[index] == rxq))
-			break;
-	if (index == priv->rxqs_n)
-		return 0;
-	rxq = (*priv->rxqs)[index];
-	return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
-}
-
-/**
  * Allocate queue vector and fill epoll fd list for Rx interrupts.
  *
  * @param priv
@@ -1296,6 +327,7 @@ priv_rx_intr_vec_enable(struct priv *priv)
 	unsigned int count = 0;
 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
 
+	assert(!mlx5_is_secondary());
 	if (!priv->dev->data->dev_conf.intr_conf.rxq)
 		return 0;
 	priv_rx_intr_vec_disable(priv);
@@ -1307,15 +339,14 @@ priv_rx_intr_vec_enable(struct priv *priv)
 	}
 	intr_handle->type = RTE_INTR_HANDLE_EXT;
 	for (i = 0; i != n; ++i) {
-		struct rxq *rxq = (*priv->rxqs)[i];
-		struct rxq_ctrl *rxq_ctrl =
-			container_of(rxq, struct rxq_ctrl, rxq);
+		/* This rxq ibv must not be released in this function. */
+		struct mlx5_rxq_ibv *rxq_ibv = mlx5_priv_rxq_ibv_get(priv, i);
 		int fd;
 		int flags;
 		int rc;
 
 		/* Skip queues that cannot request interrupts. */
-		if (!rxq || !rxq_ctrl->channel) {
+		if (!rxq_ibv || !rxq_ibv->channel) {
 			/* Use invalid intr_vec[] index to disable entry. */
 			intr_handle->intr_vec[i] =
 				RTE_INTR_VEC_RXTX_OFFSET +
@@ -1329,7 +360,7 @@ priv_rx_intr_vec_enable(struct priv *priv)
 			priv_rx_intr_vec_disable(priv);
 			return -1;
 		}
-		fd = rxq_ctrl->channel->fd;
+		fd = rxq_ibv->channel->fd;
 		flags = fcntl(fd, F_GETFL);
 		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
 		if (rc < 0) {
@@ -1359,14 +390,61 @@ void
 priv_rx_intr_vec_disable(struct priv *priv)
 {
 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
+	unsigned int i;
+	unsigned int rxqs_n = priv->rxqs_n;
+	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
 
+	if (!priv->dev->data->dev_conf.intr_conf.rxq)
+		return;
+	if (!intr_handle->intr_vec)
+		goto free;
+	for (i = 0; i != n; ++i) {
+		struct mlx5_rxq_ctrl *rxq_ctrl;
+		struct mlx5_rxq_data *rxq_data;
+
+		if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
+		    RTE_MAX_RXTX_INTR_VEC_ID)
+			continue;
+		/**
+		 * Need to access directly the queue to release the reference
+		 * kept in priv_rx_intr_vec_enable().
+		 */
+		rxq_data = (*priv->rxqs)[i];
+		rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+		mlx5_priv_rxq_ibv_release(priv, rxq_ctrl->ibv);
+	}
+free:
 	rte_intr_free_epoll_fd(intr_handle);
-	free(intr_handle->intr_vec);
+	if (intr_handle->intr_vec)
+		free(intr_handle->intr_vec);
 	intr_handle->nb_efd = 0;
 	intr_handle->intr_vec = NULL;
 }
 
-#ifdef HAVE_UPDATE_CQ_CI
+/**
+ *  MLX5 CQ notification .
+ *
+ *  @param rxq
+ *     Pointer to receive queue structure.
+ *  @param sq_n_rxq
+ *     Sequence number per receive queue .
+ */
+static inline void
+mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
+{
+	int sq_n = 0;
+	uint32_t doorbell_hi;
+	uint64_t doorbell;
+	void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL;
+
+	sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK;
+	doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
+	doorbell = (uint64_t)doorbell_hi << 32;
+	doorbell |=  rxq->cqn;
+	rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
+	rte_wmb();
+	rte_write64(rte_cpu_to_be_64(doorbell), cq_db_reg);
+}
 
 /**
  * DPDK callback for Rx queue interrupt enable.
@@ -1383,16 +461,30 @@ int
 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
 {
 	struct priv *priv = mlx5_get_priv(dev);
-	struct rxq *rxq = (*priv->rxqs)[rx_queue_id];
-	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-	int ret;
+	struct mlx5_rxq_data *rxq_data;
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+	int ret = 0;
 
-	if (!rxq || !rxq_ctrl->channel) {
+	priv_lock(priv);
+	rxq_data = (*priv->rxqs)[rx_queue_id];
+	if (!rxq_data) {
 		ret = EINVAL;
-	} else {
-		ibv_mlx5_exp_update_cq_ci(rxq_ctrl->cq, rxq->cq_ci);
-		ret = ibv_req_notify_cq(rxq_ctrl->cq, 0);
+		goto exit;
+	}
+	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	if (rxq_ctrl->irq) {
+		struct mlx5_rxq_ibv *rxq_ibv;
+
+		rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
+		if (!rxq_ibv) {
+			ret = EINVAL;
+			goto exit;
+		}
+		mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn);
+		mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
 	}
+exit:
+	priv_unlock(priv);
 	if (ret)
 		WARN("unable to arm interrupt on rx queue %d", rx_queue_id);
 	return -ret;
@@ -1413,25 +505,920 @@ int
 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
 {
 	struct priv *priv = mlx5_get_priv(dev);
-	struct rxq *rxq = (*priv->rxqs)[rx_queue_id];
-	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+	struct mlx5_rxq_data *rxq_data;
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+	struct mlx5_rxq_ibv *rxq_ibv = NULL;
 	struct ibv_cq *ev_cq;
 	void *ev_ctx;
-	int ret;
+	int ret = 0;
 
-	if (!rxq || !rxq_ctrl->channel) {
+	priv_lock(priv);
+	rxq_data = (*priv->rxqs)[rx_queue_id];
+	if (!rxq_data) {
 		ret = EINVAL;
-	} else {
-		ret = ibv_get_cq_event(rxq_ctrl->cq->channel, &ev_cq, &ev_ctx);
-		if (ret || ev_cq != rxq_ctrl->cq)
-			ret = EINVAL;
+		goto exit;
+	}
+	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	if (!rxq_ctrl->irq)
+		goto exit;
+	rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
+	if (!rxq_ibv) {
+		ret = EINVAL;
+		goto exit;
+	}
+	ret = ibv_get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx);
+	if (ret || ev_cq != rxq_ibv->cq) {
+		ret = EINVAL;
+		goto exit;
 	}
+	rxq_data->cq_arm_sn++;
+	ibv_ack_cq_events(rxq_ibv->cq, 1);
+exit:
+	if (rxq_ibv)
+		mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
+	priv_unlock(priv);
 	if (ret)
 		WARN("unable to disable interrupt on rx queue %d",
 		     rx_queue_id);
-	else
-		ibv_ack_cq_events(rxq_ctrl->cq, 1);
 	return -ret;
 }
 
-#endif /* HAVE_UPDATE_CQ_CI */
+/**
+ * Create the Rx queue Verbs object.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   Queue index in DPDK Rx queue array
+ *
+ * @return
+ *   The Verbs object initialised if it can be created.
+ */
+struct mlx5_rxq_ibv*
+mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	struct ibv_wq_attr mod;
+	union {
+		struct {
+			struct ibv_cq_init_attr_ex ibv;
+			struct mlx5dv_cq_init_attr mlx5;
+		} cq;
+		struct ibv_wq_init_attr wq;
+		struct ibv_cq_ex cq_attr;
+	} attr;
+	unsigned int cqe_n = (1 << rxq_data->elts_n) - 1;
+	struct mlx5_rxq_ibv *tmpl;
+	struct mlx5dv_cq cq_info;
+	struct mlx5dv_rwq rwq;
+	unsigned int i;
+	int ret = 0;
+	struct mlx5dv_obj obj;
+
+	assert(rxq_data);
+	assert(!rxq_ctrl->ibv);
+	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+				 rxq_ctrl->socket);
+	if (!tmpl) {
+		ERROR("%p: cannot allocate verbs resources",
+		       (void *)rxq_ctrl);
+		goto error;
+	}
+	tmpl->rxq_ctrl = rxq_ctrl;
+	/* Use the entire RX mempool as the memory region. */
+	tmpl->mr = priv_mr_get(priv, rxq_data->mp);
+	if (!tmpl->mr) {
+		tmpl->mr = priv_mr_new(priv, rxq_data->mp);
+		if (!tmpl->mr) {
+			ERROR("%p: MR creation failure", (void *)rxq_ctrl);
+			goto error;
+		}
+	}
+	if (rxq_ctrl->irq) {
+		tmpl->channel = ibv_create_comp_channel(priv->ctx);
+		if (!tmpl->channel) {
+			ERROR("%p: Comp Channel creation failure",
+			      (void *)rxq_ctrl);
+			goto error;
+		}
+	}
+	attr.cq.ibv = (struct ibv_cq_init_attr_ex){
+		.cqe = cqe_n,
+		.channel = tmpl->channel,
+		.comp_mask = 0,
+	};
+	attr.cq.mlx5 = (struct mlx5dv_cq_init_attr){
+		.comp_mask = 0,
+	};
+	if (priv->cqe_comp && !rxq_data->hw_timestamp) {
+		attr.cq.mlx5.comp_mask |=
+			MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
+		attr.cq.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
+		/*
+		 * For vectorized Rx, it must not be doubled in order to
+		 * make cq_ci and rq_ci aligned.
+		 */
+		if (rxq_check_vec_support(rxq_data) < 0)
+			attr.cq.ibv.cqe *= 2;
+	} else if (priv->cqe_comp && rxq_data->hw_timestamp) {
+		DEBUG("Rx CQE compression is disabled for HW timestamp");
+	}
+	tmpl->cq = ibv_cq_ex_to_cq(mlx5dv_create_cq(priv->ctx, &attr.cq.ibv,
+						    &attr.cq.mlx5));
+	if (tmpl->cq == NULL) {
+		ERROR("%p: CQ creation failure", (void *)rxq_ctrl);
+		goto error;
+	}
+	DEBUG("priv->device_attr.max_qp_wr is %d",
+	      priv->device_attr.orig_attr.max_qp_wr);
+	DEBUG("priv->device_attr.max_sge is %d",
+	      priv->device_attr.orig_attr.max_sge);
+	attr.wq = (struct ibv_wq_init_attr){
+		.wq_context = NULL, /* Could be useful in the future. */
+		.wq_type = IBV_WQT_RQ,
+		/* Max number of outstanding WRs. */
+		.max_wr = (1 << rxq_data->elts_n) >> rxq_data->sges_n,
+		/* Max number of scatter/gather elements in a WR. */
+		.max_sge = 1 << rxq_data->sges_n,
+		.pd = priv->pd,
+		.cq = tmpl->cq,
+		.comp_mask =
+			IBV_WQ_FLAGS_CVLAN_STRIPPING |
+			0,
+		.create_flags = (rxq_data->vlan_strip ?
+				 IBV_WQ_FLAGS_CVLAN_STRIPPING :
+				 0),
+	};
+	/* By default, FCS (CRC) is stripped by hardware. */
+	if (rxq_data->crc_present) {
+		attr.wq.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
+		attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+	}
+#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
+	if (priv->hw_padding) {
+		attr.wq.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
+		attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+	}
+#endif
+	tmpl->wq = ibv_create_wq(priv->ctx, &attr.wq);
+	if (tmpl->wq == NULL) {
+		ERROR("%p: WQ creation failure", (void *)rxq_ctrl);
+		goto error;
+	}
+	/*
+	 * Make sure number of WRs*SGEs match expectations since a queue
+	 * cannot allocate more than "desc" buffers.
+	 */
+	if (((int)attr.wq.max_wr !=
+	     ((1 << rxq_data->elts_n) >> rxq_data->sges_n)) ||
+	    ((int)attr.wq.max_sge != (1 << rxq_data->sges_n))) {
+		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
+		      (void *)rxq_ctrl,
+		      ((1 << rxq_data->elts_n) >> rxq_data->sges_n),
+		      (1 << rxq_data->sges_n),
+		      attr.wq.max_wr, attr.wq.max_sge);
+		goto error;
+	}
+	/* Change queue state to ready. */
+	mod = (struct ibv_wq_attr){
+		.attr_mask = IBV_WQ_ATTR_STATE,
+		.wq_state = IBV_WQS_RDY,
+	};
+	ret = ibv_modify_wq(tmpl->wq, &mod);
+	if (ret) {
+		ERROR("%p: WQ state to IBV_WQS_RDY failed",
+		      (void *)rxq_ctrl);
+		goto error;
+	}
+	obj.cq.in = tmpl->cq;
+	obj.cq.out = &cq_info;
+	obj.rwq.in = tmpl->wq;
+	obj.rwq.out = &rwq;
+	ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ);
+	if (ret != 0)
+		goto error;
+	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
+		goto error;
+	}
+	/* Fill the rings. */
+	rxq_data->wqes = (volatile struct mlx5_wqe_data_seg (*)[])
+		(uintptr_t)rwq.buf;
+	for (i = 0; (i != (unsigned int)(1 << rxq_data->elts_n)); ++i) {
+		struct rte_mbuf *buf = (*rxq_data->elts)[i];
+		volatile struct mlx5_wqe_data_seg *scat = &(*rxq_data->wqes)[i];
+
+		/* scat->addr must be able to store a pointer. */
+		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+		*scat = (struct mlx5_wqe_data_seg){
+			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
+								  uintptr_t)),
+			.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
+			.lkey = tmpl->mr->lkey,
+		};
+	}
+	rxq_data->rq_db = rwq.dbrec;
+	rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
+	rxq_data->cq_ci = 0;
+	rxq_data->rq_ci = 0;
+	rxq_data->rq_pi = 0;
+	rxq_data->zip = (struct rxq_zip){
+		.ai = 0,
+	};
+	rxq_data->cq_db = cq_info.dbrec;
+	rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
+	rxq_data->cq_uar = cq_info.cq_uar;
+	rxq_data->cqn = cq_info.cqn;
+	rxq_data->cq_arm_sn = 0;
+	/* Update doorbell counter. */
+	rxq_data->rq_ci = (1 << rxq_data->elts_n) >> rxq_data->sges_n;
+	rte_wmb();
+	*rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci);
+	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
+	rte_atomic32_inc(&tmpl->refcnt);
+	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
+	      (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
+	LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next);
+	return tmpl;
+error:
+	if (tmpl->wq)
+		claim_zero(ibv_destroy_wq(tmpl->wq));
+	if (tmpl->cq)
+		claim_zero(ibv_destroy_cq(tmpl->cq));
+	if (tmpl->channel)
+		claim_zero(ibv_destroy_comp_channel(tmpl->channel));
+	if (tmpl->mr)
+		priv_mr_release(priv, tmpl->mr);
+	return NULL;
+}
+
+/**
+ * Get an Rx queue Verbs object.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   Queue index in DPDK Rx queue array
+ *
+ * @return
+ *   The Verbs object if it exists.
+ */
+struct mlx5_rxq_ibv*
+mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+
+	if (idx >= priv->rxqs_n)
+		return NULL;
+	if (!rxq_data)
+		return NULL;
+	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	if (rxq_ctrl->ibv) {
+		priv_mr_get(priv, rxq_data->mp);
+		rte_atomic32_inc(&rxq_ctrl->ibv->refcnt);
+		DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
+		      (void *)rxq_ctrl->ibv,
+		      rte_atomic32_read(&rxq_ctrl->ibv->refcnt));
+	}
+	return rxq_ctrl->ibv;
+}
+
+/**
+ * Release an Rx verbs queue object.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param rxq_ibv
+ *   Verbs Rx queue object.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+int
+mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv)
+{
+	int ret;
+
+	assert(rxq_ibv);
+	assert(rxq_ibv->wq);
+	assert(rxq_ibv->cq);
+	assert(rxq_ibv->mr);
+	ret = priv_mr_release(priv, rxq_ibv->mr);
+	if (!ret)
+		rxq_ibv->mr = NULL;
+	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
+	      (void *)rxq_ibv, rte_atomic32_read(&rxq_ibv->refcnt));
+	if (rte_atomic32_dec_and_test(&rxq_ibv->refcnt)) {
+		rxq_free_elts(rxq_ibv->rxq_ctrl);
+		claim_zero(ibv_destroy_wq(rxq_ibv->wq));
+		claim_zero(ibv_destroy_cq(rxq_ibv->cq));
+		if (rxq_ibv->channel)
+			claim_zero(ibv_destroy_comp_channel(rxq_ibv->channel));
+		LIST_REMOVE(rxq_ibv, next);
+		rte_free(rxq_ibv);
+		return 0;
+	}
+	return EBUSY;
+}
+
+/**
+ * Verify the Verbs Rx queue list is empty
+ *
+ * @param priv
+ *  Pointer to private structure.
+ *
+ * @return the number of object not released.
+ */
+int
+mlx5_priv_rxq_ibv_verify(struct priv *priv)
+{
+	int ret = 0;
+	struct mlx5_rxq_ibv *rxq_ibv;
+
+	LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) {
+		DEBUG("%p: Verbs Rx queue %p still referenced", (void *)priv,
+		      (void *)rxq_ibv);
+		++ret;
+	}
+	return ret;
+}
+
+/**
+ * Return true if a single reference exists on the object.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param rxq_ibv
+ *   Verbs Rx queue object.
+ */
+int
+mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv)
+{
+	(void)priv;
+	assert(rxq_ibv);
+	return (rte_atomic32_read(&rxq_ibv->refcnt) == 1);
+}
+
+/**
+ * Create a DPDK Rx queue.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   TX queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue.
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ *
+ * @return
+ *   A DPDK queue object on success.
+ */
+struct mlx5_rxq_ctrl*
+mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc,
+		  unsigned int socket, struct rte_mempool *mp)
+{
+	struct rte_eth_dev *dev = priv->dev;
+	struct mlx5_rxq_ctrl *tmpl;
+	const uint16_t desc_n =
+		desc + priv->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
+	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+
+	tmpl = rte_calloc_socket("RXQ", 1,
+				 sizeof(*tmpl) +
+				 desc_n * sizeof(struct rte_mbuf *),
+				 0, socket);
+	if (!tmpl)
+		return NULL;
+	tmpl->socket = socket;
+	if (priv->dev->data->dev_conf.intr_conf.rxq)
+		tmpl->irq = 1;
+	/* Enable scattered packets support for this queue if necessary. */
+	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
+	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
+		tmpl->rxq.sges_n = 0;
+	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
+		unsigned int size =
+			RTE_PKTMBUF_HEADROOM +
+			dev->data->dev_conf.rxmode.max_rx_pkt_len;
+		unsigned int sges_n;
+
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 */
+		sges_n = log2above((size / mb_len) + !!(size % mb_len));
+		tmpl->rxq.sges_n = sges_n;
+		/* Make sure rxq.sges_n did not overflow. */
+		size = mb_len * (1 << tmpl->rxq.sges_n);
+		size -= RTE_PKTMBUF_HEADROOM;
+		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+			ERROR("%p: too many SGEs (%u) needed to handle"
+			      " requested maximum packet size %u",
+			      (void *)dev,
+			      1 << sges_n,
+			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
+			goto error;
+		}
+	} else {
+		WARN("%p: the requested maximum Rx packet size (%u) is"
+		     " larger than a single mbuf (%u) and scattered"
+		     " mode has not been requested",
+		     (void *)dev,
+		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
+		     mb_len - RTE_PKTMBUF_HEADROOM);
+	}
+	DEBUG("%p: maximum number of segments per packet: %u",
+	      (void *)dev, 1 << tmpl->rxq.sges_n);
+	if (desc % (1 << tmpl->rxq.sges_n)) {
+		ERROR("%p: number of RX queue descriptors (%u) is not a"
+		      " multiple of SGEs per packet (%u)",
+		      (void *)dev,
+		      desc,
+		      1 << tmpl->rxq.sges_n);
+		goto error;
+	}
+	/* Toggle RX checksum offload if hardware supports it. */
+	if (priv->hw_csum)
+		tmpl->rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+	if (priv->hw_csum_l2tun)
+		tmpl->rxq.csum_l2tun =
+			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
+	tmpl->rxq.hw_timestamp =
+			!!dev->data->dev_conf.rxmode.hw_timestamp;
+	/* Configure VLAN stripping. */
+	tmpl->rxq.vlan_strip = (priv->hw_vlan_strip &&
+			       !!dev->data->dev_conf.rxmode.hw_vlan_strip);
+	/* By default, FCS (CRC) is stripped by hardware. */
+	if (dev->data->dev_conf.rxmode.hw_strip_crc) {
+		tmpl->rxq.crc_present = 0;
+	} else if (priv->hw_fcs_strip) {
+		tmpl->rxq.crc_present = 1;
+	} else {
+		WARN("%p: CRC stripping has been disabled but will still"
+		     " be performed by hardware, make sure MLNX_OFED and"
+		     " firmware are up to date",
+		     (void *)dev);
+		tmpl->rxq.crc_present = 0;
+	}
+	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
+	      " incoming frames to hide it",
+	      (void *)dev,
+	      tmpl->rxq.crc_present ? "disabled" : "enabled",
+	      tmpl->rxq.crc_present << 2);
+	/* Save port ID. */
+	tmpl->rxq.rss_hash = priv->rxqs_n > 1;
+	tmpl->rxq.port_id = dev->data->port_id;
+	tmpl->priv = priv;
+	tmpl->rxq.mp = mp;
+	tmpl->rxq.stats.idx = idx;
+	tmpl->rxq.elts_n = log2above(desc);
+	tmpl->rxq.elts =
+		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
+	rte_atomic32_inc(&tmpl->refcnt);
+	DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
+	      (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
+	LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
+	return tmpl;
+error:
+	rte_free(tmpl);
+	return NULL;
+}
+
+/**
+ * Get a Rx queue.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   TX queue index.
+ *
+ * @return
+ *   A pointer to the queue if it exists.
+ */
+struct mlx5_rxq_ctrl*
+mlx5_priv_rxq_get(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
+
+	if ((*priv->rxqs)[idx]) {
+		rxq_ctrl = container_of((*priv->rxqs)[idx],
+					struct mlx5_rxq_ctrl,
+					rxq);
+
+		mlx5_priv_rxq_ibv_get(priv, idx);
+		rte_atomic32_inc(&rxq_ctrl->refcnt);
+		DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
+		      (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt));
+	}
+	return rxq_ctrl;
+}
+
+/**
+ * Release a Rx queue.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   TX queue index.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+int
+mlx5_priv_rxq_release(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+
+	if (!(*priv->rxqs)[idx])
+		return 0;
+	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
+	assert(rxq_ctrl->priv);
+	if (rxq_ctrl->ibv) {
+		int ret;
+
+		ret = mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
+		if (!ret)
+			rxq_ctrl->ibv = NULL;
+	}
+	DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
+	      (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt));
+	if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) {
+		LIST_REMOVE(rxq_ctrl, next);
+		rte_free(rxq_ctrl);
+		(*priv->rxqs)[idx] = NULL;
+		return 0;
+	}
+	return EBUSY;
+}
+
+/**
+ * Verify if the queue can be released.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   TX queue index.
+ *
+ * @return
+ *   1 if the queue can be released.
+ */
+int
+mlx5_priv_rxq_releasable(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+
+	if (!(*priv->rxqs)[idx])
+		return -1;
+	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
+	return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
+}
+
+/**
+ * Verify the Rx Queue list is empty
+ *
+ * @param priv
+ *  Pointer to private structure.
+ *
+ * @return the number of object not released.
+ */
+int
+mlx5_priv_rxq_verify(struct priv *priv)
+{
+	struct mlx5_rxq_ctrl *rxq_ctrl;
+	int ret = 0;
+
+	LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) {
+		DEBUG("%p: Rx Queue %p still referenced", (void *)priv,
+		      (void *)rxq_ctrl);
+		++ret;
+	}
+	return ret;
+}
+
+/**
+ * Create an indirection table.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param queues
+ *   Queues entering in the indirection table.
+ * @param queues_n
+ *   Number of queues in the array.
+ *
+ * @return
+ *   A new indirection table.
+ */
+struct mlx5_ind_table_ibv*
+mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[],
+			    uint16_t queues_n)
+{
+	struct mlx5_ind_table_ibv *ind_tbl;
+	const unsigned int wq_n = rte_is_power_of_2(queues_n) ?
+		log2above(queues_n) :
+		log2above(priv->ind_table_max_size);
+	struct ibv_wq *wq[1 << wq_n];
+	unsigned int i;
+	unsigned int j;
+
+	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) +
+			     queues_n * sizeof(uint16_t), 0);
+	if (!ind_tbl)
+		return NULL;
+	for (i = 0; i != queues_n; ++i) {
+		struct mlx5_rxq_ctrl *rxq =
+			mlx5_priv_rxq_get(priv, queues[i]);
+
+		if (!rxq)
+			goto error;
+		wq[i] = rxq->ibv->wq;
+		ind_tbl->queues[i] = queues[i];
+	}
+	ind_tbl->queues_n = queues_n;
+	/* Finalise indirection table. */
+	for (j = 0; i != (unsigned int)(1 << wq_n); ++i, ++j)
+		wq[i] = wq[j];
+	ind_tbl->ind_table = ibv_create_rwq_ind_table(
+		priv->ctx,
+		&(struct ibv_rwq_ind_table_init_attr){
+			.log_ind_tbl_size = wq_n,
+			.ind_tbl = wq,
+			.comp_mask = 0,
+		});
+	if (!ind_tbl->ind_table)
+		goto error;
+	rte_atomic32_inc(&ind_tbl->refcnt);
+	LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next);
+	DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
+	      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
+	return ind_tbl;
+error:
+	rte_free(ind_tbl);
+	DEBUG("%p cannot create indirection table", (void *)priv);
+	return NULL;
+}
+
+/**
+ * Get an indirection table.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param queues
+ *   Queues entering in the indirection table.
+ * @param queues_n
+ *   Number of queues in the array.
+ *
+ * @return
+ *   An indirection table if found.
+ */
+struct mlx5_ind_table_ibv*
+mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[],
+			    uint16_t queues_n)
+{
+	struct mlx5_ind_table_ibv *ind_tbl;
+
+	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
+		if ((ind_tbl->queues_n == queues_n) &&
+		    (memcmp(ind_tbl->queues, queues,
+			    ind_tbl->queues_n * sizeof(ind_tbl->queues[0]))
+		     == 0))
+			break;
+	}
+	if (ind_tbl) {
+		unsigned int i;
+
+		rte_atomic32_inc(&ind_tbl->refcnt);
+		DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
+		      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
+		for (i = 0; i != ind_tbl->queues_n; ++i)
+			mlx5_priv_rxq_get(priv, ind_tbl->queues[i]);
+	}
+	return ind_tbl;
+}
+
+/**
+ * Release an indirection table.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param ind_table
+ *   Indirection table to release.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+int
+mlx5_priv_ind_table_ibv_release(struct priv *priv,
+				struct mlx5_ind_table_ibv *ind_tbl)
+{
+	unsigned int i;
+
+	DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
+	      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
+	if (rte_atomic32_dec_and_test(&ind_tbl->refcnt))
+		claim_zero(ibv_destroy_rwq_ind_table(ind_tbl->ind_table));
+	for (i = 0; i != ind_tbl->queues_n; ++i)
+		claim_nonzero(mlx5_priv_rxq_release(priv, ind_tbl->queues[i]));
+	if (!rte_atomic32_read(&ind_tbl->refcnt)) {
+		LIST_REMOVE(ind_tbl, next);
+		rte_free(ind_tbl);
+		return 0;
+	}
+	return EBUSY;
+}
+
+/**
+ * Verify the Rx Queue list is empty
+ *
+ * @param priv
+ *  Pointer to private structure.
+ *
+ * @return the number of object not released.
+ */
+int
+mlx5_priv_ind_table_ibv_verify(struct priv *priv)
+{
+	struct mlx5_ind_table_ibv *ind_tbl;
+	int ret = 0;
+
+	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
+		DEBUG("%p: Verbs indirection table %p still referenced",
+		      (void *)priv, (void *)ind_tbl);
+		++ret;
+	}
+	return ret;
+}
+
+/**
+ * Create an Rx Hash queue.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param rss_key
+ *   RSS key for the Rx hash queue.
+ * @param rss_key_len
+ *   RSS key length.
+ * @param hash_fields
+ *   Verbs protocol hash field to make the RSS on.
+ * @param queues
+ *   Queues entering in hash queue. In case of empty hash_fields only the
+ *   first queue index will be taken for the indirection table.
+ * @param queues_n
+ *   Number of queues.
+ *
+ * @return
+ *   An hash Rx queue on success.
+ */
+struct mlx5_hrxq*
+mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len,
+		   uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+{
+	struct mlx5_hrxq *hrxq;
+	struct mlx5_ind_table_ibv *ind_tbl;
+	struct ibv_qp *qp;
+
+	queues_n = hash_fields ? queues_n : 1;
+	ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n);
+	if (!ind_tbl)
+		ind_tbl = mlx5_priv_ind_table_ibv_new(priv, queues, queues_n);
+	if (!ind_tbl)
+		return NULL;
+	qp = ibv_create_qp_ex(
+		priv->ctx,
+		&(struct ibv_qp_init_attr_ex){
+			.qp_type = IBV_QPT_RAW_PACKET,
+			.comp_mask =
+				IBV_QP_INIT_ATTR_PD |
+				IBV_QP_INIT_ATTR_IND_TABLE |
+				IBV_QP_INIT_ATTR_RX_HASH,
+			.rx_hash_conf = (struct ibv_rx_hash_conf){
+				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+				.rx_hash_key_len = rss_key_len,
+				.rx_hash_key = rss_key,
+				.rx_hash_fields_mask = hash_fields,
+			},
+			.rwq_ind_tbl = ind_tbl->ind_table,
+			.pd = priv->pd,
+		});
+	if (!qp)
+		goto error;
+	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0);
+	if (!hrxq)
+		goto error;
+	hrxq->ind_table = ind_tbl;
+	hrxq->qp = qp;
+	hrxq->rss_key_len = rss_key_len;
+	hrxq->hash_fields = hash_fields;
+	memcpy(hrxq->rss_key, rss_key, rss_key_len);
+	rte_atomic32_inc(&hrxq->refcnt);
+	LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next);
+	DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
+	      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
+	return hrxq;
+error:
+	mlx5_priv_ind_table_ibv_release(priv, ind_tbl);
+	if (qp)
+		claim_zero(ibv_destroy_qp(qp));
+	return NULL;
+}
+
+/**
+ * Get an Rx Hash queue.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param rss_conf
+ *   RSS configuration for the Rx hash queue.
+ * @param queues
+ *   Queues entering in hash queue. In case of empty hash_fields only the
+ *   first queue index will be taken for the indirection table.
+ * @param queues_n
+ *   Number of queues.
+ *
+ * @return
+ *   An hash Rx queue on success.
+ */
+struct mlx5_hrxq*
+mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len,
+		   uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+{
+	struct mlx5_hrxq *hrxq;
+
+	queues_n = hash_fields ? queues_n : 1;
+	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
+		struct mlx5_ind_table_ibv *ind_tbl;
+
+		if (hrxq->rss_key_len != rss_key_len)
+			continue;
+		if (memcmp(hrxq->rss_key, rss_key, rss_key_len))
+			continue;
+		if (hrxq->hash_fields != hash_fields)
+			continue;
+		ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n);
+		if (!ind_tbl)
+			continue;
+		if (ind_tbl != hrxq->ind_table) {
+			mlx5_priv_ind_table_ibv_release(priv, ind_tbl);
+			continue;
+		}
+		rte_atomic32_inc(&hrxq->refcnt);
+		DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
+		      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
+		return hrxq;
+	}
+	return NULL;
+}
+
+/**
+ * Release the hash Rx queue.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param hrxq
+ *   Pointer to Hash Rx queue to release.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+int
+mlx5_priv_hrxq_release(struct priv *priv, struct mlx5_hrxq *hrxq)
+{
+	DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
+	      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
+	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
+		claim_zero(ibv_destroy_qp(hrxq->qp));
+		mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table);
+		LIST_REMOVE(hrxq, next);
+		rte_free(hrxq);
+		return 0;
+	}
+	claim_nonzero(mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table));
+	return EBUSY;
+}
+
+/**
+ * Verify the Rx Queue list is empty
+ *
+ * @param priv
+ *  Pointer to private structure.
+ *
+ * @return the number of object not released.
+ */
+int
+mlx5_priv_hrxq_ibv_verify(struct priv *priv)
+{
+	struct mlx5_hrxq *hrxq;
+	int ret = 0;
+
+	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
+		DEBUG("%p: Verbs Hash Rx queue %p still referenced",
+		      (void *)priv, (void *)hrxq);
+		++ret;
+	}
+	return ret;
+}
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index b07bcd11..9658b378 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -42,25 +42,17 @@
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
 #include <infiniband/verbs.h>
-#include <infiniband/mlx5_hw.h>
-#include <infiniband/arch.h>
+#include <infiniband/mlx5dv.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_mbuf.h>
 #include <rte_mempool.h>
 #include <rte_prefetch.h>
 #include <rte_common.h>
 #include <rte_branch_prediction.h>
 #include <rte_ether.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -73,11 +65,11 @@ static __rte_always_inline uint32_t
 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe);
 
 static __rte_always_inline int
-mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
+mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 		 uint16_t cqe_cnt, uint32_t *rss_hash);
 
 static __rte_always_inline uint32_t
-rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe);
+rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
 
 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
@@ -105,6 +97,8 @@ mlx5_set_ptype_table(void)
 	 * bit[6] = tunneled
 	 * bit[7] = outer_l3_type
 	 */
+	/* L2 */
+	(*p)[0x00] = RTE_PTYPE_L2_ETHER;
 	/* L3 */
 	(*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 		     RTE_PTYPE_L4_NONFRAG;
@@ -171,29 +165,29 @@ mlx5_set_ptype_table(void)
 	/* Tunneled - TCP */
 	(*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
-		     RTE_PTYPE_L4_TCP;
+		     RTE_PTYPE_INNER_L4_TCP;
 	(*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
-		     RTE_PTYPE_L4_TCP;
+		     RTE_PTYPE_INNER_L4_TCP;
 	(*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
-		     RTE_PTYPE_L4_TCP;
+		     RTE_PTYPE_INNER_L4_TCP;
 	(*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
-		     RTE_PTYPE_L4_TCP;
+		     RTE_PTYPE_INNER_L4_TCP;
 	/* Tunneled - UDP */
 	(*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
-		     RTE_PTYPE_L4_UDP;
+		     RTE_PTYPE_INNER_L4_UDP;
 	(*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
-		     RTE_PTYPE_L4_UDP;
+		     RTE_PTYPE_INNER_L4_UDP;
 	(*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 		     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
-		     RTE_PTYPE_L4_UDP;
+		     RTE_PTYPE_INNER_L4_UDP;
 	(*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 		     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
-		     RTE_PTYPE_L4_UDP;
+		     RTE_PTYPE_INNER_L4_UDP;
 }
 
 /**
@@ -208,7 +202,7 @@ mlx5_set_ptype_table(void)
  *   Size of tailroom.
  */
 static inline size_t
-tx_mlx5_wq_tailroom(struct txq *txq, void *addr)
+tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
 {
 	size_t tailroom;
 	tailroom = (uintptr_t)(txq->wqes) +
@@ -266,7 +260,7 @@ mlx5_copy_to_wq(void *dst, const void *src, size_t n,
 int
 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 {
-	struct txq *txq = tx_queue;
+	struct mlx5_txq_data *txq = tx_queue;
 	uint16_t used;
 
 	mlx5_tx_complete(txq);
@@ -290,7 +284,7 @@ mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
 int
 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
 {
-	struct rxq *rxq = rx_queue;
+	struct mlx5_rxq_data *rxq = rx_queue;
 	struct rxq_zip *zip = &rxq->zip;
 	volatile struct mlx5_cqe *cqe;
 	const unsigned int cqe_n = (1 << rxq->cqe_n);
@@ -313,7 +307,7 @@ mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
 
 		op_own = cqe->op_own;
 		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
-			n = ntohl(cqe->byte_cnt);
+			n = rte_be_to_cpu_32(cqe->byte_cnt);
 		else
 			n = 1;
 		cq_ci += n;
@@ -342,7 +336,7 @@ mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
 uint16_t
 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct txq *txq = (struct txq *)dpdk_txq;
+	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
 	const uint16_t elts_n = 1 << txq->elts_n;
 	const uint16_t elts_m = elts_n - 1;
@@ -413,8 +407,10 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		total_length = length;
 #endif
-		if (length < (MLX5_WQE_DWORD_SIZE + 2))
+		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
+			txq->stats.oerrors++;
 			break;
+		}
 		/* Update element. */
 		(*txq->elts)[elts_head & elts_m] = buf;
 		/* Prefetch next buffer data. */
@@ -441,7 +437,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
 		/* Replace the Ethernet type by the VLAN if necessary. */
 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
+			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
+							 buf->vlan_tci);
 			unsigned int len = 2 * ETHER_ADDR_LEN - 2;
 
 			addr += 2;
@@ -461,6 +458,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			length -= pkt_inline_sz;
 			addr += pkt_inline_sz;
 		}
+		raw += MLX5_WQE_DWORD_SIZE;
 		if (txq->tso_en) {
 			tso = buf->ol_flags & PKT_TX_TCP_SEG;
 			if (tso) {
@@ -479,7 +477,10 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				tso_header_sz = buf->l2_len + vlan_sz +
 						buf->l3_len + buf->l4_len;
 				tso_segsz = buf->tso_segsz;
-
+				if (unlikely(tso_segsz == 0)) {
+					txq->stats.oerrors++;
+					break;
+				}
 				if (is_tunneled	&& txq->tunnel_en) {
 					tso_header_sz += buf->outer_l2_len +
 							 buf->outer_l3_len;
@@ -488,12 +489,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					cs_flags |= MLX5_ETH_WQE_L4_CSUM;
 				}
 				if (unlikely(tso_header_sz >
-					     MLX5_MAX_TSO_HEADER))
+					     MLX5_MAX_TSO_HEADER)) {
+					txq->stats.oerrors++;
 					break;
+				}
 				copy_b = tso_header_sz - pkt_inline_sz;
 				/* First seg must contain all headers. */
 				assert(copy_b <= length);
-				raw += MLX5_WQE_DWORD_SIZE;
 				if (copy_b &&
 				   ((end - (uintptr_t)raw) > copy_b)) {
 					uint16_t n = (MLX5_WQE_DS(copy_b) -
@@ -506,19 +508,18 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 						   (void *)addr, copy_b);
 					addr += copy_b;
 					length -= copy_b;
+					/* Include padding for TSO header. */
+					copy_b = MLX5_WQE_DS(copy_b) *
+						 MLX5_WQE_DWORD_SIZE;
 					pkt_inline_sz += copy_b;
-					/*
-					 * Another DWORD will be added
-					 * in the inline part.
-					 */
-					raw += MLX5_WQE_DS(copy_b) *
-					       MLX5_WQE_DWORD_SIZE -
-					       MLX5_WQE_DWORD_SIZE;
+					raw += copy_b;
 				} else {
 					/* NOP WQE. */
 					wqe->ctrl = (rte_v128u32_t){
-						     htonl(txq->wqe_ci << 8),
-						     htonl(txq->qp_num_8s | 1),
+						     rte_cpu_to_be_32(
+							txq->wqe_ci << 8),
+						     rte_cpu_to_be_32(
+							txq->qp_num_8s | 1),
 						     0,
 						     0,
 					};
@@ -531,19 +532,20 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		}
 		/* Inline if enough room. */
 		if (inline_en || tso) {
+			uint32_t inl;
 			uintptr_t end = (uintptr_t)
 				(((uintptr_t)txq->wqes) +
 				 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
 			unsigned int inline_room = max_inline *
 						   RTE_CACHE_LINE_SIZE -
-						   (pkt_inline_sz - 2);
+						   (pkt_inline_sz - 2) -
+						   !!tso * sizeof(inl);
 			uintptr_t addr_end = (addr + inline_room) &
 					     ~(RTE_CACHE_LINE_SIZE - 1);
 			unsigned int copy_b = (addr_end > addr) ?
 				RTE_MIN((addr_end - addr), length) :
 				0;
 
-			raw += MLX5_WQE_DWORD_SIZE;
 			if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
 				/*
 				 * One Dseg remains in the current WQE.  To
@@ -556,12 +558,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					break;
 				max_wqe -= n;
 				if (tso) {
-					uint32_t inl =
-						htonl(copy_b | MLX5_INLINE_SEG);
-
-					pkt_inline_sz =
-						MLX5_WQE_DS(tso_header_sz) *
-						MLX5_WQE_DWORD_SIZE;
+					inl = rte_cpu_to_be_32(copy_b |
+							       MLX5_INLINE_SEG);
 					rte_memcpy((void *)raw,
 						   (void *)&inl, sizeof(inl));
 					raw += sizeof(inl);
@@ -610,9 +608,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			ds = 3;
 use_dseg:
 			/* Add the remaining packet as a simple ds. */
-			naddr = htonll(addr);
+			naddr = rte_cpu_to_be_64(addr);
 			*dseg = (rte_v128u32_t){
-				htonl(length),
+				rte_cpu_to_be_32(length),
 				mlx5_tx_mb2mr(txq, buf),
 				naddr,
 				naddr >> 32,
@@ -649,9 +647,9 @@ next_seg:
 		total_length += length;
 #endif
 		/* Store segment information. */
-		naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+		naddr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
 		*dseg = (rte_v128u32_t){
-			htonl(length),
+			rte_cpu_to_be_32(length),
 			mlx5_tx_mb2mr(txq, buf),
 			naddr,
 			naddr >> 32,
@@ -664,27 +662,33 @@ next_seg:
 		else
 			j += sg;
 next_pkt:
+		if (ds > MLX5_DSEG_MAX) {
+			txq->stats.oerrors++;
+			break;
+		}
 		++elts_head;
 		++pkts;
 		++i;
 		/* Initialize known and common part of the WQE structure. */
 		if (tso) {
 			wqe->ctrl = (rte_v128u32_t){
-				htonl((txq->wqe_ci << 8) | MLX5_OPCODE_TSO),
-				htonl(txq->qp_num_8s | ds),
+				rte_cpu_to_be_32((txq->wqe_ci << 8) |
+						 MLX5_OPCODE_TSO),
+				rte_cpu_to_be_32(txq->qp_num_8s | ds),
 				0,
 				0,
 			};
 			wqe->eseg = (rte_v128u32_t){
 				0,
-				cs_flags | (htons(tso_segsz) << 16),
+				cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16),
 				0,
-				(ehdr << 16) | htons(tso_header_sz),
+				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
 			};
 		} else {
 			wqe->ctrl = (rte_v128u32_t){
-				htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
-				htonl(txq->qp_num_8s | ds),
+				rte_cpu_to_be_32((txq->wqe_ci << 8) |
+						 MLX5_OPCODE_SEND),
+				rte_cpu_to_be_32(txq->qp_num_8s | ds),
 				0,
 				0,
 			};
@@ -692,7 +696,7 @@ next_pkt:
 				0,
 				cs_flags,
 				0,
-				(ehdr << 16) | htons(pkt_inline_sz),
+				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
 			};
 		}
 next_wqe:
@@ -712,7 +716,7 @@ next_wqe:
 	comp = txq->elts_comp + i + j + k;
 	if (comp >= MLX5_TX_COMP_THRESH) {
 		/* Request completion on last WQE. */
-		last_wqe->ctrl2 = htonl(8);
+		last_wqe->ctrl2 = rte_cpu_to_be_32(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
 		last_wqe->ctrl3 = txq->elts_head;
 		txq->elts_comp = 0;
@@ -739,7 +743,7 @@ next_wqe:
  *   Packet length.
  */
 static inline void
-mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
 {
 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
@@ -751,13 +755,14 @@ mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 	mpw->len = length;
 	mpw->total_len = 0;
 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->eseg.mss = htons(length);
+	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
 	mpw->wqe->eseg.inline_hdr_sz = 0;
 	mpw->wqe->eseg.rsvd0 = 0;
 	mpw->wqe->eseg.rsvd1 = 0;
 	mpw->wqe->eseg.rsvd2 = 0;
-	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
-				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
+	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
+					     (txq->wqe_ci << 8) |
+					     MLX5_OPCODE_TSO);
 	mpw->wqe->ctrl[2] = 0;
 	mpw->wqe->ctrl[3] = 0;
 	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
@@ -778,7 +783,7 @@ mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
  *   Pointer to MPW session structure.
  */
 static inline void
-mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
+mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
 {
 	unsigned int num = mpw->pkts_n;
 
@@ -786,7 +791,7 @@ mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
+	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
 	if (num < 3)
 		++txq->wqe_ci;
@@ -812,7 +817,7 @@ mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
 uint16_t
 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct txq *txq = (struct txq *)dpdk_txq;
+	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
 	const uint16_t elts_n = 1 << txq->elts_n;
 	const uint16_t elts_m = elts_n - 1;
@@ -850,8 +855,10 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (max_elts < segs_n)
 			break;
 		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX)
+		if (segs_n > MLX5_MPW_DSEG_MAX) {
+			txq->stats.oerrors++;
 			break;
+		}
 		max_elts -= segs_n;
 		--pkts_n;
 		/* Should we enable HW CKSUM offload */
@@ -893,9 +900,9 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			dseg = mpw.data.dseg[mpw.pkts_n];
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			*dseg = (struct mlx5_wqe_data_seg){
-				.byte_count = htonl(DATA_LEN(buf)),
+				.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
 				.lkey = mlx5_tx_mb2mr(txq, buf),
-				.addr = htonll(addr),
+				.addr = rte_cpu_to_be_64(addr),
 			};
 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
 			length += DATA_LEN(buf);
@@ -923,7 +930,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->ctrl[2] = htonl(8);
+		wqe->ctrl[2] = rte_cpu_to_be_32(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
 		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
@@ -953,7 +960,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
  *   Packet length.
  */
 static inline void
-mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
+		    uint32_t length)
 {
 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 	struct mlx5_wqe_inl_small *inl;
@@ -963,12 +971,12 @@ mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
 	mpw->len = length;
 	mpw->total_len = 0;
 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
-				  (txq->wqe_ci << 8) |
-				  MLX5_OPCODE_TSO);
+	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
+					     (txq->wqe_ci << 8) |
+					     MLX5_OPCODE_TSO);
 	mpw->wqe->ctrl[2] = 0;
 	mpw->wqe->ctrl[3] = 0;
-	mpw->wqe->eseg.mss = htons(length);
+	mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
 	mpw->wqe->eseg.inline_hdr_sz = 0;
 	mpw->wqe->eseg.cs_flags = 0;
 	mpw->wqe->eseg.rsvd0 = 0;
@@ -988,7 +996,7 @@ mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
  *   Pointer to MPW session structure.
  */
 static inline void
-mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
+mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
 {
 	unsigned int size;
 	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
@@ -999,9 +1007,10 @@ mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
 	 * Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
+	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
+					     MLX5_WQE_DS(size));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
-	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
+	inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
 	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
 }
 
@@ -1022,7 +1031,7 @@ uint16_t
 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			 uint16_t pkts_n)
 {
-	struct txq *txq = (struct txq *)dpdk_txq;
+	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
 	const uint16_t elts_n = 1 << txq->elts_n;
 	const uint16_t elts_m = elts_n - 1;
@@ -1071,8 +1080,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		if (max_elts < segs_n)
 			break;
 		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX)
+		if (segs_n > MLX5_MPW_DSEG_MAX) {
+			txq->stats.oerrors++;
 			break;
+		}
 		max_elts -= segs_n;
 		--pkts_n;
 		/*
@@ -1139,9 +1150,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 				dseg = mpw.data.dseg[mpw.pkts_n];
 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
 				*dseg = (struct mlx5_wqe_data_seg){
-					.byte_count = htonl(DATA_LEN(buf)),
+					.byte_count =
+					       rte_cpu_to_be_32(DATA_LEN(buf)),
 					.lkey = mlx5_tx_mb2mr(txq, buf),
-					.addr = htonll(addr),
+					.addr = rte_cpu_to_be_64(addr),
 				};
 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
 				length += DATA_LEN(buf);
@@ -1213,7 +1225,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->ctrl[2] = htonl(8);
+		wqe->ctrl[2] = rte_cpu_to_be_32(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
 		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
@@ -1245,7 +1257,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
  *   Packet length.
  */
 static inline void
-mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
+mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
 {
 	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
 
@@ -1253,9 +1265,10 @@ mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
 	mpw->pkts_n = 0;
 	mpw->total_len = sizeof(struct mlx5_wqe);
 	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
-	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
-				  (txq->wqe_ci << 8) |
-				  MLX5_OPCODE_ENHANCED_MPSW);
+	mpw->wqe->ctrl[0] =
+		rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
+				 (txq->wqe_ci << 8) |
+				 MLX5_OPCODE_ENHANCED_MPSW);
 	mpw->wqe->ctrl[2] = 0;
 	mpw->wqe->ctrl[3] = 0;
 	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
@@ -1263,9 +1276,9 @@ mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
 		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
 
 		/* Pad the first 2 DWORDs with zero-length inline header. */
-		*(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
+		*(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
 		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
-			htonl(MLX5_INLINE_SEG);
+			rte_cpu_to_be_32(MLX5_INLINE_SEG);
 		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
 		/* Start from the next WQEBB. */
 		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
@@ -1286,14 +1299,15 @@ mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
  *   Number of consumed WQEs.
  */
 static inline uint16_t
-mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
+mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
 {
 	uint16_t ret;
 
 	/* Store size in multiple of 16 bytes. Control and Ethernet segments
 	 * count as 2.
 	 */
-	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
+	mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
+					     MLX5_WQE_DS(mpw->total_len));
 	mpw->state = MLX5_MPW_STATE_CLOSED;
 	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
 	txq->wqe_ci += ret;
@@ -1316,7 +1330,7 @@ mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
 uint16_t
 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct txq *txq = (struct txq *)dpdk_txq;
+	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
 	const uint16_t elts_n = 1 << txq->elts_n;
 	const uint16_t elts_m = elts_n - 1;
@@ -1360,8 +1374,10 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (max_elts - j < segs_n)
 			break;
 		/* Do not bother with large packets MPW cannot handle. */
-		if (segs_n > MLX5_MPW_DSEG_MAX)
+		if (segs_n > MLX5_MPW_DSEG_MAX) {
+			txq->stats.oerrors++;
 			break;
+		}
 		/* Should we enable HW CKSUM offload. */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -1446,9 +1462,10 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				dseg = mpw.data.dseg[mpw.pkts_n];
 				addr = rte_pktmbuf_mtod(buf, uintptr_t);
 				*dseg = (struct mlx5_wqe_data_seg){
-					.byte_count = htonl(DATA_LEN(buf)),
+					.byte_count = rte_cpu_to_be_32(
+								DATA_LEN(buf)),
 					.lkey = mlx5_tx_mb2mr(txq, buf),
-					.addr = htonll(addr),
+					.addr = rte_cpu_to_be_64(addr),
 				};
 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
 				length += DATA_LEN(buf);
@@ -1471,7 +1488,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
 			assert(length == DATA_LEN(buf));
-			inl_hdr = htonl(length | MLX5_INLINE_SEG);
+			inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			mpw.data.raw = (volatile void *)
 				((uintptr_t)mpw.data.raw + inl_pad);
@@ -1527,9 +1544,9 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
 				rte_prefetch2((void *)(addr +
 						n * RTE_CACHE_LINE_SIZE));
-			naddr = htonll(addr);
+			naddr = rte_cpu_to_be_64(addr);
 			*dseg = (rte_v128u32_t) {
-				htonl(length),
+				rte_cpu_to_be_32(length),
 				mlx5_tx_mb2mr(txq, buf),
 				naddr,
 				naddr >> 32,
@@ -1557,7 +1574,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		volatile struct mlx5_wqe *wqe = mpw.wqe;
 
 		/* Request completion on last WQE. */
-		wqe->ctrl[2] = htonl(8);
+		wqe->ctrl[2] = rte_cpu_to_be_32(8);
 		/* Save elts_head in unused "immediate" field of WQE. */
 		wqe->ctrl[3] = elts_head;
 		txq->elts_comp = 0;
@@ -1627,7 +1644,7 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
  *   with error.
  */
 static inline int
-mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
+mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 		 uint16_t cqe_cnt, uint32_t *rss_hash)
 {
 	struct rxq_zip *zip = &rxq->zip;
@@ -1641,8 +1658,8 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
 			(volatile struct mlx5_mini_cqe8 (*)[8])
 			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
 
-		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
-		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
+		len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+		*rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result);
 		if ((++zip->ai & 7) == 0) {
 			/* Invalidate consumed CQEs */
 			idx = zip->ca;
@@ -1690,7 +1707,7 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
 							  cqe_cnt].pkt_info);
 
 			/* Fix endianness. */
-			zip->cqe_cnt = ntohl(cqe->byte_cnt);
+			zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
 			/*
 			 * Current mini array position is the one returned by
 			 * check_cqe64().
@@ -1705,8 +1722,8 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
 			--rxq->cq_ci;
 			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
 			/* Get packet size to return. */
-			len = ntohl((*mc)[0].byte_cnt);
-			*rss_hash = ntohl((*mc)[0].rx_hash_result);
+			len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+			*rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result);
 			zip->ai = 1;
 			/* Prefetch all the entries to be invalidated */
 			idx = zip->ca;
@@ -1716,8 +1733,8 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
 				++idx;
 			}
 		} else {
-			len = ntohl(cqe->byte_cnt);
-			*rss_hash = ntohl(cqe->rx_hash_res);
+			len = rte_be_to_cpu_32(cqe->byte_cnt);
+			*rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res);
 		}
 		/* Error while receiving packet. */
 		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
@@ -1738,10 +1755,10 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
  *   Offload flags (ol_flags) for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
+rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
 {
 	uint32_t ol_flags = 0;
-	uint16_t flags = ntohs(cqe->hdr_type_etc);
+	uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
 
 	ol_flags =
 		TRANSPOSE(flags,
@@ -1777,7 +1794,7 @@ rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
 uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct rxq *rxq = dpdk_rxq;
+	struct mlx5_rxq_data *rxq = dpdk_rxq;
 	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
 	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
 	const unsigned int sges_n = rxq->sges_n;
@@ -1848,7 +1865,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			    MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
 				pkt->ol_flags |= PKT_RX_FDIR;
 				if (cqe->sop_drop_qpn !=
-				    htonl(MLX5_FLOW_MARK_DEFAULT)) {
+				    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
 					uint32_t mark = cqe->sop_drop_qpn;
 
 					pkt->ol_flags |= PKT_RX_FDIR_ID;
@@ -1860,10 +1877,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
 			if (rxq->vlan_strip &&
 			    (cqe->hdr_type_etc &
-			     htons(MLX5_CQE_VLAN_STRIPPED))) {
-				pkt->ol_flags |= PKT_RX_VLAN_PKT |
+			     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
+				pkt->ol_flags |= PKT_RX_VLAN |
 					PKT_RX_VLAN_STRIPPED;
-				pkt->vlan_tci = ntohs(cqe->vlan_info);
+				pkt->vlan_tci =
+					rte_be_to_cpu_16(cqe->vlan_info);
+			}
+			if (rxq->hw_timestamp) {
+				pkt->timestamp =
+					rte_be_to_cpu_64(cqe->timestamp);
+				pkt->ol_flags |= PKT_RX_TIMESTAMP;
 			}
 			if (rxq->crc_present)
 				len -= ETHER_CRC_LEN;
@@ -1879,7 +1902,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * of the buffers are already known, only the buffer address
 		 * changes.
 		 */
-		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
+		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
 		if (len > DATA_LEN(seg)) {
 			len -= DATA_LEN(seg);
 			++NB_SEGS(pkt);
@@ -1906,10 +1929,10 @@ skip:
 		return 0;
 	/* Update the consumer index. */
 	rxq->rq_ci = rq_ci >> sges_n;
-	rte_wmb();
-	*rxq->cq_db = htonl(rxq->cq_ci);
-	rte_wmb();
-	*rxq->rq_db = htonl(rxq->rq_ci);
+	rte_io_wmb();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	rte_io_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	/* Increment packets counter. */
 	rxq->stats.ipackets += i;
@@ -2016,7 +2039,7 @@ priv_check_vec_tx_support(struct priv *priv)
 }
 
 int __attribute__((weak))
-rxq_check_vec_support(struct rxq *rxq)
+rxq_check_vec_support(struct mlx5_rxq_data *rxq)
 {
 	(void)rxq;
 	return -ENOTSUP;
@@ -2028,9 +2051,3 @@ priv_check_vec_rx_support(struct priv *priv)
 	(void)priv;
 	return -ENOTSUP;
 }
-
-void __attribute__((weak))
-priv_prep_vec_rx_function(struct priv *priv)
-{
-	(void)priv;
-}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 7de1d108..d34f3cc0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -36,6 +36,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <sys/queue.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -43,21 +44,16 @@
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
 #include <infiniband/verbs.h>
-#include <infiniband/mlx5_hw.h>
+#include <infiniband/mlx5dv.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_mbuf.h>
 #include <rte_mempool.h>
 #include <rte_common.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
+#include <rte_hexdump.h>
+#include <rte_atomic.h>
 
 #include "mlx5_utils.h"
 #include "mlx5.h"
@@ -81,19 +77,22 @@ struct mlx5_txq_stats {
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
 #endif
-	uint64_t odropped; /**< Total of packets not sent when TX ring full. */
-};
-
-/* Flow director queue structure. */
-struct fdir_queue {
-	struct ibv_qp *qp; /* Associated RX QP. */
-	struct ibv_exp_rwq_ind_table *ind_table; /* Indirection table. */
-	struct ibv_exp_wq *wq; /* Work queue. */
-	struct ibv_cq *cq; /* Completion queue. */
+	uint64_t oerrors; /**< Total number of failed transmitted packets. */
 };
 
 struct priv;
 
+/* Memory region queue object. */
+struct mlx5_mr {
+	LIST_ENTRY(mlx5_mr) next; /**< Pointer to the next element. */
+	rte_atomic32_t refcnt; /*<< Reference counter. */
+	uint32_t lkey; /*<< rte_cpu_to_be_32(mr->lkey) */
+	uintptr_t start; /* Start address of MR */
+	uintptr_t end; /* End address of MR */
+	struct ibv_mr *mr; /*<< Memory Region. */
+	struct rte_mempool *mp; /*<< Memory Pool. */
+};
+
 /* Compressed CQE context. */
 struct rxq_zip {
 	uint16_t ai; /* Array index. */
@@ -104,22 +103,22 @@ struct rxq_zip {
 };
 
 /* RX queue descriptor. */
-struct rxq {
+struct mlx5_rxq_data {
 	unsigned int csum:1; /* Enable checksum offloading. */
 	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
+	unsigned int hw_timestamp:1; /* Enable HW timestamp. */
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
 	unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
 	unsigned int cqe_n:4; /* Log 2 of CQ elements. */
 	unsigned int elts_n:4; /* Log 2 of Mbufs. */
-	unsigned int port_id:8;
 	unsigned int rss_hash:1; /* RSS hash result is enabled. */
 	unsigned int mark:1; /* Marked flow available on the queue. */
 	unsigned int pending_err:1; /* CQE error needs to be handled. */
-	unsigned int trim_elts:1; /* Whether elts needs clean-up. */
-	unsigned int :6; /* Remaining bits. */
+	unsigned int :14; /* Remaining bits. */
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
+	uint16_t port_id;
 	uint16_t rq_ci;
 	uint16_t rq_pi;
 	uint16_t cq_ci;
@@ -131,122 +130,56 @@ struct rxq {
 	struct mlx5_rxq_stats stats;
 	uint64_t mbuf_initializer; /* Default rearm_data for vectorized Rx. */
 	struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */
+	void *cq_uar; /* CQ user access region. */
+	uint32_t cqn; /* CQ number. */
+	uint8_t cq_arm_sn; /* CQ arm seq number. */
 } __rte_cache_aligned;
 
-/* RX queue control descriptor. */
-struct rxq_ctrl {
-	struct priv *priv; /* Back pointer to private data. */
+/* Verbs Rx queue elements. */
+struct mlx5_rxq_ibv {
+	LIST_ENTRY(mlx5_rxq_ibv) next; /* Pointer to the next element. */
+	rte_atomic32_t refcnt; /* Reference counter. */
+	struct mlx5_rxq_ctrl *rxq_ctrl; /* Back pointer to parent. */
 	struct ibv_cq *cq; /* Completion Queue. */
-	struct ibv_exp_wq *wq; /* Work Queue. */
-	struct fdir_queue *fdir_queue; /* Flow director queue. */
-	struct ibv_mr *mr; /* Memory Region (for mp). */
+	struct ibv_wq *wq; /* Work Queue. */
 	struct ibv_comp_channel *channel;
-	unsigned int socket; /* CPU socket ID for allocations. */
-	struct rxq rxq; /* Data path structure. */
-};
-
-/* Hash RX queue types. */
-enum hash_rxq_type {
-	HASH_RXQ_TCPV4,
-	HASH_RXQ_UDPV4,
-	HASH_RXQ_IPV4,
-	HASH_RXQ_TCPV6,
-	HASH_RXQ_UDPV6,
-	HASH_RXQ_IPV6,
-	HASH_RXQ_ETH,
-};
-
-/* Flow structure with Ethernet specification. It is packed to prevent padding
- * between attr and spec as this layout is expected by libibverbs. */
-struct flow_attr_spec_eth {
-	struct ibv_exp_flow_attr attr;
-	struct ibv_exp_flow_spec_eth spec;
-} __attribute__((packed));
-
-/* Define a struct flow_attr_spec_eth object as an array of at least
- * "size" bytes. Room after the first index is normally used to store
- * extra flow specifications. */
-#define FLOW_ATTR_SPEC_ETH(name, size) \
-	struct flow_attr_spec_eth name \
-		[((size) / sizeof(struct flow_attr_spec_eth)) + \
-		 !!((size) % sizeof(struct flow_attr_spec_eth))]
-
-/* Initialization data for hash RX queue. */
-struct hash_rxq_init {
-	uint64_t hash_fields; /* Fields that participate in the hash. */
-	uint64_t dpdk_rss_hf; /* Matching DPDK RSS hash fields. */
-	unsigned int flow_priority; /* Flow priority to use. */
-	union {
-		struct {
-			enum ibv_exp_flow_spec_type type;
-			uint16_t size;
-		} hdr;
-		struct ibv_exp_flow_spec_tcp_udp tcp_udp;
-		struct ibv_exp_flow_spec_ipv4 ipv4;
-		struct ibv_exp_flow_spec_ipv6 ipv6;
-		struct ibv_exp_flow_spec_eth eth;
-	} flow_spec; /* Flow specification template. */
-	const struct hash_rxq_init *underlayer; /* Pointer to underlayer. */
+	struct mlx5_mr *mr; /* Memory Region (for mp). */
 };
 
-/* Initialization data for indirection table. */
-struct ind_table_init {
-	unsigned int max_size; /* Maximum number of WQs. */
-	/* Hash RX queues using this table. */
-	unsigned int hash_types;
-	unsigned int hash_types_n;
+/* RX queue control descriptor. */
+struct mlx5_rxq_ctrl {
+	LIST_ENTRY(mlx5_rxq_ctrl) next; /* Pointer to the next element. */
+	rte_atomic32_t refcnt; /* Reference counter. */
+	struct priv *priv; /* Back pointer to private data. */
+	struct mlx5_rxq_ibv *ibv; /* Verbs elements. */
+	struct mlx5_rxq_data rxq; /* Data path structure. */
+	unsigned int socket; /* CPU socket ID for allocations. */
+	unsigned int irq:1; /* Whether IRQ is enabled. */
 };
 
-/* Initialization data for special flows. */
-struct special_flow_init {
-	uint8_t dst_mac_val[6];
-	uint8_t dst_mac_mask[6];
-	unsigned int hash_types;
-	unsigned int per_vlan:1;
+/* Indirection table. */
+struct mlx5_ind_table_ibv {
+	LIST_ENTRY(mlx5_ind_table_ibv) next; /* Pointer to the next element. */
+	rte_atomic32_t refcnt; /* Reference counter. */
+	struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */
+	uint16_t queues_n; /**< Number of queues in the list. */
+	uint16_t queues[]; /**< Queue list. */
 };
 
-enum hash_rxq_flow_type {
-	HASH_RXQ_FLOW_TYPE_PROMISC,
-	HASH_RXQ_FLOW_TYPE_ALLMULTI,
-	HASH_RXQ_FLOW_TYPE_BROADCAST,
-	HASH_RXQ_FLOW_TYPE_IPV6MULTI,
-	HASH_RXQ_FLOW_TYPE_MAC,
-};
-
-#ifndef NDEBUG
-static inline const char *
-hash_rxq_flow_type_str(enum hash_rxq_flow_type flow_type)
-{
-	switch (flow_type) {
-	case HASH_RXQ_FLOW_TYPE_PROMISC:
-		return "promiscuous";
-	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
-		return "allmulticast";
-	case HASH_RXQ_FLOW_TYPE_BROADCAST:
-		return "broadcast";
-	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
-		return "IPv6 multicast";
-	case HASH_RXQ_FLOW_TYPE_MAC:
-		return "MAC";
-	}
-	return NULL;
-}
-#endif /* NDEBUG */
-
-struct hash_rxq {
-	struct priv *priv; /* Back pointer to private data. */
-	struct ibv_qp *qp; /* Hash RX QP. */
-	enum hash_rxq_type type; /* Hash RX queue type. */
-	/* MAC flow steering rules, one per VLAN ID. */
-	struct ibv_exp_flow *mac_flow
-		[MLX5_MAX_MAC_ADDRESSES][MLX5_MAX_VLAN_IDS];
-	struct ibv_exp_flow *special_flow
-		[MLX5_MAX_SPECIAL_FLOWS][MLX5_MAX_VLAN_IDS];
+/* Hash Rx queue. */
+struct mlx5_hrxq {
+	LIST_ENTRY(mlx5_hrxq) next; /* Pointer to the next element. */
+	rte_atomic32_t refcnt; /* Reference counter. */
+	struct mlx5_ind_table_ibv *ind_table; /* Indirection table. */
+	struct ibv_qp *qp; /* Verbs queue pair. */
+	uint64_t hash_fields; /* Verbs Hash fields. */
+	uint8_t rss_key_len; /* Hash key length in bytes. */
+	uint8_t rss_key[]; /* Hash key. */
 };
 
 /* TX queue descriptor. */
 __extension__
-struct txq {
+struct mlx5_txq_data {
 	uint16_t elts_head; /* Current counter in (*elts)[]. */
 	uint16_t elts_tail; /* Counter of first element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
@@ -265,6 +198,7 @@ struct txq {
 	uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint16_t inline_max_packet_sz; /* Max packet size for inlining. */
+	uint16_t mr_cache_idx; /* Index of last hit entry. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	uint32_t flags; /* Flags for Tx Queue. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
@@ -272,61 +206,92 @@ struct txq {
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
 	volatile void *bf_reg; /* Blueflame register. */
-	struct {
-		uintptr_t start; /* Start address of MR */
-		uintptr_t end; /* End address of MR */
-		struct ibv_mr *mr; /* Memory Region (for mp). */
-		uint32_t lkey; /* htonl(mr->lkey) */
-	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
-	uint16_t mr_cache_idx; /* Index of last hit entry. */
+	struct mlx5_mr *mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MR translation table. */
 	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
 } __rte_cache_aligned;
 
-/* TX queue control descriptor. */
-struct txq_ctrl {
-	struct priv *priv; /* Back pointer to private data. */
+/* Verbs Rx queue elements. */
+struct mlx5_txq_ibv {
+	LIST_ENTRY(mlx5_txq_ibv) next; /* Pointer to the next element. */
+	rte_atomic32_t refcnt; /* Reference counter. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_qp *qp; /* Queue Pair. */
+};
+
+/* TX queue control descriptor. */
+struct mlx5_txq_ctrl {
+	LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
+	rte_atomic32_t refcnt; /* Reference counter. */
+	struct priv *priv; /* Back pointer to private data. */
 	unsigned int socket; /* CPU socket ID for allocations. */
-	struct txq txq; /* Data path structure. */
+	unsigned int max_inline_data; /* Max inline data. */
+	unsigned int max_tso_header; /* Max TSO header size. */
+	struct mlx5_txq_ibv *ibv; /* Verbs queue object. */
+	struct mlx5_txq_data txq; /* Data path structure. */
+	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
 };
 
 /* mlx5_rxq.c */
 
-extern const struct hash_rxq_init hash_rxq_init[];
-extern const unsigned int hash_rxq_init_n;
-
 extern uint8_t rss_hash_default_key[];
 extern const size_t rss_hash_default_key_len;
 
-size_t priv_flow_attr(struct priv *, struct ibv_exp_flow_attr *,
-		      size_t, enum hash_rxq_type);
-int priv_create_hash_rxqs(struct priv *);
-void priv_destroy_hash_rxqs(struct priv *);
-int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
-int priv_rehash_flows(struct priv *);
-void rxq_cleanup(struct rxq_ctrl *);
+void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *);
 int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
 			const struct rte_eth_rxconf *, struct rte_mempool *);
 void mlx5_rx_queue_release(void *);
-uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
 int priv_rx_intr_vec_enable(struct priv *priv);
 void priv_rx_intr_vec_disable(struct priv *priv);
-#ifdef HAVE_UPDATE_CQ_CI
 int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
 int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
-#endif /* HAVE_UPDATE_CQ_CI */
+struct mlx5_rxq_ibv *mlx5_priv_rxq_ibv_new(struct priv *, uint16_t);
+struct mlx5_rxq_ibv *mlx5_priv_rxq_ibv_get(struct priv *, uint16_t);
+int mlx5_priv_rxq_ibv_release(struct priv *, struct mlx5_rxq_ibv *);
+int mlx5_priv_rxq_ibv_releasable(struct priv *, struct mlx5_rxq_ibv *);
+int mlx5_priv_rxq_ibv_verify(struct priv *);
+struct mlx5_rxq_ctrl *mlx5_priv_rxq_new(struct priv *, uint16_t,
+					uint16_t, unsigned int,
+					struct rte_mempool *);
+struct mlx5_rxq_ctrl *mlx5_priv_rxq_get(struct priv *, uint16_t);
+int mlx5_priv_rxq_release(struct priv *, uint16_t);
+int mlx5_priv_rxq_releasable(struct priv *, uint16_t);
+int mlx5_priv_rxq_verify(struct priv *);
+int rxq_alloc_elts(struct mlx5_rxq_ctrl *);
+struct mlx5_ind_table_ibv *mlx5_priv_ind_table_ibv_new(struct priv *,
+						       uint16_t [],
+						       uint16_t);
+struct mlx5_ind_table_ibv *mlx5_priv_ind_table_ibv_get(struct priv *,
+						       uint16_t [],
+						       uint16_t);
+int mlx5_priv_ind_table_ibv_release(struct priv *, struct mlx5_ind_table_ibv *);
+int mlx5_priv_ind_table_ibv_verify(struct priv *);
+struct mlx5_hrxq *mlx5_priv_hrxq_new(struct priv *, uint8_t *, uint8_t,
+				     uint64_t, uint16_t [], uint16_t);
+struct mlx5_hrxq *mlx5_priv_hrxq_get(struct priv *, uint8_t *, uint8_t,
+				     uint64_t, uint16_t [], uint16_t);
+int mlx5_priv_hrxq_release(struct priv *, struct mlx5_hrxq *);
+int mlx5_priv_hrxq_ibv_verify(struct priv *);
 
 /* mlx5_txq.c */
 
-void txq_cleanup(struct txq_ctrl *);
-int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t,
-		   unsigned int, const struct rte_eth_txconf *);
 int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
 			const struct rte_eth_txconf *);
 void mlx5_tx_queue_release(void *);
-uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
+int priv_tx_uar_remap(struct priv *priv, int fd);
+struct mlx5_txq_ibv *mlx5_priv_txq_ibv_new(struct priv *, uint16_t);
+struct mlx5_txq_ibv *mlx5_priv_txq_ibv_get(struct priv *, uint16_t);
+int mlx5_priv_txq_ibv_release(struct priv *, struct mlx5_txq_ibv *);
+int mlx5_priv_txq_ibv_releasable(struct priv *, struct mlx5_txq_ibv *);
+int mlx5_priv_txq_ibv_verify(struct priv *);
+struct mlx5_txq_ctrl *mlx5_priv_txq_new(struct priv *, uint16_t,
+					uint16_t, unsigned int,
+					const struct rte_eth_txconf *);
+struct mlx5_txq_ctrl *mlx5_priv_txq_get(struct priv *, uint16_t);
+int mlx5_priv_txq_release(struct priv *, uint16_t);
+int mlx5_priv_txq_releasable(struct priv *, uint16_t);
+int mlx5_priv_txq_verify(struct priv *);
+void txq_alloc_elts(struct mlx5_txq_ctrl *);
 
 /* mlx5_rxtx.c */
 
@@ -346,18 +311,19 @@ int mlx5_tx_descriptor_status(void *, uint16_t);
 /* Vectorized version of mlx5_rxtx.c */
 int priv_check_raw_vec_tx_support(struct priv *);
 int priv_check_vec_tx_support(struct priv *);
-int rxq_check_vec_support(struct rxq *);
+int rxq_check_vec_support(struct mlx5_rxq_data *);
 int priv_check_vec_rx_support(struct priv *);
-void priv_prep_vec_rx_function(struct priv *);
 uint16_t mlx5_tx_burst_raw_vec(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_tx_burst_vec(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_rx_burst_vec(void *, struct rte_mbuf **, uint16_t);
 
 /* mlx5_mr.c */
 
-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
-void txq_mp2mr_iter(struct rte_mempool *, void *);
-uint32_t txq_mp2mr_reg(struct txq *, struct rte_mempool *, unsigned int);
+void mlx5_mp2mr_iter(struct rte_mempool *, void *);
+struct mlx5_mr *priv_txq_mp2mr_reg(struct priv *priv, struct mlx5_txq_data *,
+				   struct rte_mempool *, unsigned int);
+struct mlx5_mr *mlx5_txq_mp2mr_reg(struct mlx5_txq_data *, struct rte_mempool *,
+				   unsigned int);
 
 #ifndef NDEBUG
 /**
@@ -419,16 +385,24 @@ check_cqe(volatile struct mlx5_cqe *cqe,
 		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
 		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
 			return 0;
-		if (!check_cqe_seen(cqe))
+		if (!check_cqe_seen(cqe)) {
 			ERROR("unexpected CQE error %u (0x%02x)"
 			      " syndrome 0x%02x",
 			      op_code, op_code, syndrome);
+			rte_hexdump(stderr, "MLX5 Error CQE:",
+				    (const void *)((uintptr_t)err_cqe),
+				    sizeof(*err_cqe));
+		}
 		return 1;
 	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
 		   (op_code != MLX5_CQE_REQ)) {
-		if (!check_cqe_seen(cqe))
+		if (!check_cqe_seen(cqe)) {
 			ERROR("unexpected CQE opcode %u (0x%02x)",
 			      op_code, op_code);
+			rte_hexdump(stderr, "MLX5 CQE:",
+				    (const void *)((uintptr_t)cqe),
+				    sizeof(*cqe));
+		}
 		return 1;
 	}
 #endif /* NDEBUG */
@@ -447,7 +421,7 @@ check_cqe(volatile struct mlx5_cqe *cqe,
  *   WQE address.
  */
 static inline uintptr_t *
-tx_mlx5_wqe(struct txq *txq, uint16_t ci)
+tx_mlx5_wqe(struct mlx5_txq_data *txq, uint16_t ci)
 {
 	ci &= ((1 << txq->wqe_n) - 1);
 	return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
@@ -462,7 +436,7 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci)
  *   Pointer to TX queue structure.
  */
 static __rte_always_inline void
-mlx5_tx_complete(struct txq *txq)
+mlx5_tx_complete(struct mlx5_txq_data *txq)
 {
 	const uint16_t elts_n = 1 << txq->elts_n;
 	const uint16_t elts_m = elts_n - 1;
@@ -483,13 +457,18 @@ mlx5_tx_complete(struct txq *txq)
 #ifndef NDEBUG
 	if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
 	    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
-		if (!check_cqe_seen(cqe))
+		if (!check_cqe_seen(cqe)) {
 			ERROR("unexpected error CQE, TX stopped");
+			rte_hexdump(stderr, "MLX5 TXQ:",
+				    (const void *)((uintptr_t)txq->wqes),
+				    ((1 << txq->wqe_n) *
+				     MLX5_WQE_SIZE));
+		}
 		return;
 	}
 #endif /* NDEBUG */
 	++cq_ci;
-	txq->wqe_pi = ntohs(cqe->wqe_counter);
+	txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter);
 	ctrl = (volatile struct mlx5_wqe_ctrl *)
 		tx_mlx5_wqe(txq, txq->wqe_pi);
 	elts_tail = ctrl->ctrl3;
@@ -526,8 +505,8 @@ mlx5_tx_complete(struct txq *txq)
 	txq->cq_ci = cq_ci;
 	txq->elts_tail = elts_tail;
 	/* Update the consumer index. */
-	rte_wmb();
-	*txq->cq_db = htonl(cq_ci);
+	rte_compiler_barrier();
+	*txq->cq_db = rte_cpu_to_be_32(cq_ci);
 }
 
 /**
@@ -562,51 +541,80 @@ mlx5_tx_mb2mp(struct rte_mbuf *buf)
  *   mr->lkey on success, (uint32_t)-1 on failure.
  */
 static __rte_always_inline uint32_t
-mlx5_tx_mb2mr(struct txq *txq, struct rte_mbuf *mb)
+mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
 {
 	uint16_t i = txq->mr_cache_idx;
 	uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t);
+	struct mlx5_mr *mr;
 
 	assert(i < RTE_DIM(txq->mp2mr));
-	if (likely(txq->mp2mr[i].start <= addr && txq->mp2mr[i].end >= addr))
-		return txq->mp2mr[i].lkey;
+	if (likely(txq->mp2mr[i]->start <= addr && txq->mp2mr[i]->end >= addr))
+		return txq->mp2mr[i]->lkey;
 	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mr == NULL)) {
+		if (unlikely(txq->mp2mr[i]->mr == NULL)) {
 			/* Unknown MP, add a new MR for it. */
 			break;
 		}
-		if (txq->mp2mr[i].start <= addr &&
-		    txq->mp2mr[i].end >= addr) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(htonl(txq->mp2mr[i].mr->lkey) ==
-			       txq->mp2mr[i].lkey);
+		if (txq->mp2mr[i]->start <= addr &&
+		    txq->mp2mr[i]->end >= addr) {
+			assert(txq->mp2mr[i]->lkey != (uint32_t)-1);
+			assert(rte_cpu_to_be_32(txq->mp2mr[i]->mr->lkey) ==
+			       txq->mp2mr[i]->lkey);
 			txq->mr_cache_idx = i;
-			return txq->mp2mr[i].lkey;
+			return txq->mp2mr[i]->lkey;
 		}
 	}
 	txq->mr_cache_idx = 0;
-	return txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i);
+	mr = mlx5_txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i);
+	/*
+	 * Request the reference to use in this queue, the original one is
+	 * kept by the control plane.
+	 */
+	if (mr) {
+		rte_atomic32_inc(&mr->refcnt);
+		return mr->lkey;
+	}
+	return (uint32_t)-1;
 }
 
 /**
- * Ring TX queue doorbell.
+ * Ring TX queue doorbell and flush the update if requested.
  *
  * @param txq
  *   Pointer to TX queue structure.
  * @param wqe
  *   Pointer to the last WQE posted in the NIC.
+ * @param cond
+ *   Request for write memory barrier after BlueFlame update.
  */
 static __rte_always_inline void
-mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
+mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe,
+		       int cond)
 {
 	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
 
-	rte_wmb();
-	*txq->qp_db = htonl(txq->wqe_ci);
+	rte_io_wmb();
+	*txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci);
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
 	*dst = *src;
+	if (cond)
+		rte_wmb();
+}
+
+/**
+ * Ring TX queue doorbell and flush the update by write memory barrier.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param wqe
+ *   Pointer to the last WQE posted in the NIC.
+ */
+static __rte_always_inline void
+mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
+{
+	mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
new file mode 100644
index 00000000..ba6c8cef
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -0,0 +1,388 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
+#include "mlx5_autoconf.h"
+#include "mlx5_defs.h"
+#include "mlx5_prm.h"
+
+#if defined RTE_ARCH_X86_64
+#include "mlx5_rxtx_vec_sse.h"
+#elif defined RTE_ARCH_ARM64
+#include "mlx5_rxtx_vec_neon.h"
+#else
+#error "This should not be compiled if SIMD instructions are not supported."
+#endif
+
+/**
+ * Count the number of continuous single segment packets.
+ *
+ * @param pkts
+ *   Pointer to array of packets.
+ * @param pkts_n
+ *   Number of packets.
+ *
+ * @return
+ *   Number of continuous single segment packets.
+ */
+static inline unsigned int
+txq_check_multiseg(struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	unsigned int pos;
+
+	if (!pkts_n)
+		return 0;
+	/* Count the number of continuous single segment packets. */
+	for (pos = 0; pos < pkts_n; ++pos)
+		if (NB_SEGS(pkts[pos]) > 1)
+			break;
+	return pos;
+}
+
+/**
+ * Count the number of packets having same ol_flags and calculate cs_flags.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param pkts
+ *   Pointer to array of packets.
+ * @param pkts_n
+ *   Number of packets.
+ * @param cs_flags
+ *   Pointer of flags to be returned.
+ *
+ * @return
+ *   Number of packets having same ol_flags.
+ */
+static inline unsigned int
+txq_calc_offload(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint8_t *cs_flags)
+{
+	unsigned int pos;
+	const uint64_t ol_mask =
+		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
+		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
+		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
+
+	if (!pkts_n)
+		return 0;
+	/* Count the number of packets having same ol_flags. */
+	for (pos = 1; pos < pkts_n; ++pos)
+		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
+			break;
+	/* Should open another MPW session for the rest. */
+	if (pkts[0]->ol_flags &
+	    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+		const uint64_t is_tunneled =
+			pkts[0]->ol_flags &
+			(PKT_TX_TUNNEL_GRE |
+			 PKT_TX_TUNNEL_VXLAN);
+
+		if (is_tunneled && txq->tunnel_en) {
+			*cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
+				    MLX5_ETH_WQE_L4_INNER_CSUM;
+			if (pkts[0]->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+				*cs_flags |= MLX5_ETH_WQE_L3_CSUM;
+		} else {
+			*cs_flags = MLX5_ETH_WQE_L3_CSUM |
+				    MLX5_ETH_WQE_L4_CSUM;
+		}
+	}
+	return pos;
+}
+
+/**
+ * DPDK callback for vectorized TX.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
+		      uint16_t pkts_n)
+{
+	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
+	uint16_t nb_tx = 0;
+
+	while (pkts_n > nb_tx) {
+		uint16_t n;
+		uint16_t ret;
+
+		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
+		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
+		nb_tx += ret;
+		if (!ret)
+			break;
+	}
+	return nb_tx;
+}
+
+/**
+ * DPDK callback for vectorized TX with multi-seg packets and offload.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
+	uint16_t nb_tx = 0;
+
+	while (pkts_n > nb_tx) {
+		uint8_t cs_flags = 0;
+		uint16_t n;
+		uint16_t ret;
+
+		/* Transmit multi-seg packets in the head of pkts list. */
+		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) &&
+		    NB_SEGS(pkts[nb_tx]) > 1)
+			nb_tx += txq_scatter_v(txq,
+					       &pkts[nb_tx],
+					       pkts_n - nb_tx);
+		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
+		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS))
+			n = txq_check_multiseg(&pkts[nb_tx], n);
+		if (!(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
+			n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags);
+		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
+		nb_tx += ret;
+		if (!ret)
+			break;
+	}
+	return nb_tx;
+}
+
+/**
+ * Skip error packets.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+static uint16_t
+rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+			 uint16_t pkts_n)
+{
+	uint16_t n = 0;
+	unsigned int i;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	uint32_t err_bytes = 0;
+#endif
+
+	for (i = 0; i < pkts_n; ++i) {
+		struct rte_mbuf *pkt = pkts[i];
+
+		if (pkt->packet_type == RTE_PTYPE_ALL_MASK) {
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			err_bytes += PKT_LEN(pkt);
+#endif
+			rte_pktmbuf_free_seg(pkt);
+		} else {
+			pkts[n++] = pkt;
+		}
+	}
+	rxq->stats.idropped += (pkts_n - n);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Correct counters of errored completions. */
+	rxq->stats.ipackets -= (pkts_n - n);
+	rxq->stats.ibytes -= err_bytes;
+#endif
+	rxq->pending_err = 0;
+	return n;
+}
+
+/**
+ * DPDK callback for vectorized RX.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct mlx5_rxq_data *rxq = dpdk_rxq;
+	uint16_t nb_rx;
+
+	nb_rx = rxq_burst_v(rxq, pkts, pkts_n);
+	if (unlikely(rxq->pending_err))
+		nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
+	return nb_rx;
+}
+
+/**
+ * Check Tx queue flags are set for raw vectorized Tx.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   1 if supported, negative errno value if not.
+ */
+int __attribute__((cold))
+priv_check_raw_vec_tx_support(struct priv *priv)
+{
+	uint16_t i;
+
+	/* All the configured queues should support. */
+	for (i = 0; i < priv->txqs_n; ++i) {
+		struct mlx5_txq_data *txq = (*priv->txqs)[i];
+
+		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) ||
+		    !(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
+			break;
+	}
+	if (i != priv->txqs_n)
+		return -ENOTSUP;
+	return 1;
+}
+
+/**
+ * Check a device can support vectorized TX.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   1 if supported, negative errno value if not.
+ */
+int __attribute__((cold))
+priv_check_vec_tx_support(struct priv *priv)
+{
+	if (!priv->tx_vec_en ||
+	    priv->txqs_n > MLX5_VPMD_MIN_TXQS ||
+	    priv->mps != MLX5_MPW_ENHANCED ||
+	    priv->tso)
+		return -ENOTSUP;
+	return 1;
+}
+
+/**
+ * Check a RX queue can support vectorized RX.
+ *
+ * @param rxq
+ *   Pointer to RX queue.
+ *
+ * @return
+ *   1 if supported, negative errno value if not.
+ */
+int __attribute__((cold))
+rxq_check_vec_support(struct mlx5_rxq_data *rxq)
+{
+	struct mlx5_rxq_ctrl *ctrl =
+		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+	if (!ctrl->priv->rx_vec_en || rxq->sges_n != 0)
+		return -ENOTSUP;
+	return 1;
+}
+
+/**
+ * Check a device can support vectorized RX.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   1 if supported, negative errno value if not.
+ */
+int __attribute__((cold))
+priv_check_vec_rx_support(struct priv *priv)
+{
+	uint16_t i;
+
+	if (!priv->rx_vec_en)
+		return -ENOTSUP;
+	/* All the configured queues should support. */
+	for (i = 0; i < priv->rxqs_n; ++i) {
+		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+
+		if (!rxq)
+			continue;
+		if (rxq_check_vec_support(rxq) < 0)
+			break;
+	}
+	if (i != priv->rxqs_n)
+		return -ENOTSUP;
+	return 1;
+}
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
new file mode 100644
index 00000000..1f08ed0b
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -0,0 +1,130 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_VEC_H_
+#define RTE_PMD_MLX5_RXTX_VEC_H_
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+
+#include "mlx5_autoconf.h"
+#include "mlx5_prm.h"
+
+/*
+ * Compile time sanity check for vectorized functions.
+ */
+
+#define S_ASSERT_RTE_MBUF(s) \
+	static_assert(s, "A field of struct rte_mbuf is changed")
+#define S_ASSERT_MLX5_CQE(s) \
+	static_assert(s, "A field of struct mlx5_cqe is changed")
+
+/* rxq_cq_decompress_v() */
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, pkt_len) ==
+		  offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, data_len) ==
+		  offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, hash) ==
+		  offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+/* rxq_cq_to_ptype_oflags_v() */
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, ol_flags) ==
+		  offsetof(struct rte_mbuf, rearm_data) + 8);
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, rearm_data) ==
+		  RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+
+/* rxq_burst_v() */
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, pkt_len) ==
+		  offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+S_ASSERT_RTE_MBUF(offsetof(struct rte_mbuf, data_len) ==
+		  offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+#if (RTE_CACHE_LINE_SIZE == 128)
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, pkt_info) == 64);
+#else
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, pkt_info) == 0);
+#endif
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rx_hash_res) ==
+		  offsetof(struct mlx5_cqe, pkt_info) + 12);
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rsvd1) +
+		  sizeof(((struct mlx5_cqe *)0)->rsvd1) ==
+		  offsetof(struct mlx5_cqe, hdr_type_etc));
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, vlan_info) ==
+		  offsetof(struct mlx5_cqe, hdr_type_etc) + 2);
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, rsvd2) +
+		  sizeof(((struct mlx5_cqe *)0)->rsvd2) ==
+		  offsetof(struct mlx5_cqe, byte_cnt));
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, sop_drop_qpn) ==
+		  RTE_ALIGN(offsetof(struct mlx5_cqe, sop_drop_qpn), 8));
+S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) ==
+		  offsetof(struct mlx5_cqe, sop_drop_qpn) + 7);
+
+/**
+ * Replenish buffers for RX in bulk.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param n
+ *   Number of buffers to be replenished.
+ */
+static inline void
+mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
+{
+	const uint16_t q_n = 1 << rxq->elts_n;
+	const uint16_t q_mask = q_n - 1;
+	uint16_t elts_idx = rxq->rq_ci & q_mask;
+	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+	volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx];
+	unsigned int i;
+
+	assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH);
+	assert(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
+	assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP);
+	/* Not to cross queue end. */
+	n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
+	if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+		rxq->stats.rx_nombuf += n;
+		return;
+	}
+	for (i = 0; i < n; ++i)
+		wq[i].addr = rte_cpu_to_be_64((uintptr_t)elts[i]->buf_addr +
+					      RTE_PKTMBUF_HEADROOM);
+	rxq->rq_ci += n;
+	/* Prevent overflowing into consumed mbufs. */
+	elts_idx = rxq->rq_ci & q_mask;
+	for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+		(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;
+	rte_io_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+}
+
+#endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
new file mode 100644
index 00000000..c721d80e
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -0,0 +1,1039 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_VEC_NEON_H_
+#define RTE_PMD_MLX5_RXTX_VEC_NEON_H_
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
+#include "mlx5_autoconf.h"
+#include "mlx5_defs.h"
+#include "mlx5_prm.h"
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+/**
+ * Fill in buffer descriptors in a multi-packet send descriptor.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param dseg
+ *   Pointer to buffer descriptor to be written.
+ * @param pkts
+ *   Pointer to array of packets to be sent.
+ * @param n
+ *   Number of packets to be filled.
+ */
+static inline void
+txq_wr_dseg_v(struct mlx5_txq_data *txq, uint8_t *dseg,
+	      struct rte_mbuf **pkts, unsigned int n)
+{
+	unsigned int pos;
+	uintptr_t addr;
+	const uint8x16_t dseg_shuf_m = {
+		 3,  2,  1,  0, /* length, bswap32 */
+		 4,  5,  6,  7, /* lkey */
+		15, 14, 13, 12, /* addr, bswap64 */
+		11, 10,  9,  8
+	};
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	uint32_t tx_byte = 0;
+#endif
+
+	for (pos = 0; pos < n; ++pos, dseg += MLX5_WQE_DWORD_SIZE) {
+		uint8x16_t desc;
+		struct rte_mbuf *pkt = pkts[pos];
+
+		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
+		desc = vreinterpretq_u8_u32((uint32x4_t) {
+				DATA_LEN(pkt),
+				mlx5_tx_mb2mr(txq, pkt),
+				addr,
+				addr >> 32 });
+		desc = vqtbl1q_u8(desc, dseg_shuf_m);
+		vst1q_u8(dseg, desc);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		tx_byte += DATA_LEN(pkt);
+#endif
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	txq->stats.obytes += tx_byte;
+#endif
+}
+
+/**
+ * Send multi-segmented packets until it encounters a single segment packet in
+ * the pkts list.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be sent.
+ * @param pkts_n
+ *   Number of packets to be sent.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static uint16_t
+txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
+	      uint16_t pkts_n)
+{
+	uint16_t elts_head = txq->elts_head;
+	const uint16_t elts_n = 1 << txq->elts_n;
+	const uint16_t elts_m = elts_n - 1;
+	const uint16_t wq_n = 1 << txq->wqe_n;
+	const uint16_t wq_mask = wq_n - 1;
+	const unsigned int nb_dword_per_wqebb =
+		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
+	const unsigned int nb_dword_in_hdr =
+		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
+	unsigned int n;
+	volatile struct mlx5_wqe *wqe = NULL;
+
+	assert(elts_n > pkts_n);
+	mlx5_tx_complete(txq);
+	if (unlikely(!pkts_n))
+		return 0;
+	for (n = 0; n < pkts_n; ++n) {
+		struct rte_mbuf *buf = pkts[n];
+		unsigned int segs_n = buf->nb_segs;
+		unsigned int ds = nb_dword_in_hdr;
+		unsigned int len = PKT_LEN(buf);
+		uint16_t wqe_ci = txq->wqe_ci;
+		const uint8x16_t ctrl_shuf_m = {
+			3,  2,  1,  0, /* bswap32 */
+			7,  6,  5,  4, /* bswap32 */
+			11, 10,  9,  8, /* bswap32 */
+			12, 13, 14, 15
+		};
+		uint8_t cs_flags = 0;
+		uint16_t max_elts;
+		uint16_t max_wqe;
+		uint8x16_t *t_wqe;
+		uint8_t *dseg;
+		uint8x16_t ctrl;
+
+		assert(segs_n);
+		max_elts = elts_n - (elts_head - txq->elts_tail);
+		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
+		/*
+		 * A MPW session consumes 2 WQEs at most to
+		 * include MLX5_MPW_DSEG_MAX pointers.
+		 */
+		if (segs_n == 1 ||
+		    max_elts < segs_n || max_wqe < 2)
+			break;
+		wqe = &((volatile struct mlx5_wqe64 *)
+			 txq->wqes)[wqe_ci & wq_mask].hdr;
+		if (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+			const uint64_t is_tunneled =
+				buf->ol_flags & (PKT_TX_TUNNEL_GRE |
+						 PKT_TX_TUNNEL_VXLAN);
+
+			if (is_tunneled && txq->tunnel_en) {
+				cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
+					   MLX5_ETH_WQE_L4_INNER_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					cs_flags |= MLX5_ETH_WQE_L3_CSUM;
+			} else {
+				cs_flags = MLX5_ETH_WQE_L3_CSUM |
+					   MLX5_ETH_WQE_L4_CSUM;
+			}
+		}
+		/* Title WQEBB pointer. */
+		t_wqe = (uint8x16_t *)wqe;
+		dseg = (uint8_t *)(wqe + 1);
+		do {
+			if (!(ds++ % nb_dword_per_wqebb)) {
+				dseg = (uint8_t *)
+					&((volatile struct mlx5_wqe64 *)
+					   txq->wqes)[++wqe_ci & wq_mask];
+			}
+			txq_wr_dseg_v(txq, dseg, &buf, 1);
+			dseg += MLX5_WQE_DWORD_SIZE;
+			(*txq->elts)[elts_head++ & elts_m] = buf;
+			buf = buf->next;
+		} while (--segs_n);
+		++wqe_ci;
+		/* Fill CTRL in the header. */
+		ctrl = vreinterpretq_u8_u32((uint32x4_t) {
+				MLX5_OPC_MOD_MPW << 24 |
+				txq->wqe_ci << 8 | MLX5_OPCODE_TSO,
+				txq->qp_num_8s | ds, 0, 0});
+		ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
+		vst1q_u8((void *)t_wqe, ctrl);
+		/* Fill ESEG in the header. */
+		vst1q_u16((void *)(t_wqe + 1),
+			  (uint16x8_t) { 0, 0, cs_flags, rte_cpu_to_be_16(len),
+					 0, 0, 0, 0 });
+		txq->wqe_ci = wqe_ci;
+	}
+	if (!n)
+		return 0;
+	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
+	txq->elts_head = elts_head;
+	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
+		wqe->ctrl[2] = rte_cpu_to_be_32(8);
+		wqe->ctrl[3] = txq->elts_head;
+		txq->elts_comp = 0;
+		++txq->cq_pi;
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	txq->stats.opackets += n;
+#endif
+	mlx5_tx_dbrec(txq, wqe);
+	return n;
+}
+
+/**
+ * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
+ * it returns to make it processed by txq_scatter_v(). All the packets in
+ * the pkts list should be single segment packets having same offload flags.
+ * This must be checked by txq_check_multiseg() and txq_calc_offload().
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be sent.
+ * @param pkts_n
+ *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
+ * @param cs_flags
+ *   Checksum offload flags to be written in the descriptor.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+static inline uint16_t
+txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
+	    uint8_t cs_flags)
+{
+	struct rte_mbuf **elts;
+	uint16_t elts_head = txq->elts_head;
+	const uint16_t elts_n = 1 << txq->elts_n;
+	const uint16_t elts_m = elts_n - 1;
+	const unsigned int nb_dword_per_wqebb =
+		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
+	const unsigned int nb_dword_in_hdr =
+		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
+	unsigned int n = 0;
+	unsigned int pos;
+	uint16_t max_elts;
+	uint16_t max_wqe;
+	uint32_t comp_req = 0;
+	const uint16_t wq_n = 1 << txq->wqe_n;
+	const uint16_t wq_mask = wq_n - 1;
+	uint16_t wq_idx = txq->wqe_ci & wq_mask;
+	volatile struct mlx5_wqe64 *wq =
+		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
+	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
+	const uint8x16_t ctrl_shuf_m = {
+		 3,  2,  1,  0, /* bswap32 */
+		 7,  6,  5,  4, /* bswap32 */
+		11, 10,  9,  8, /* bswap32 */
+		12, 13, 14, 15
+	};
+	uint8x16_t *t_wqe;
+	uint8_t *dseg;
+	uint8x16_t ctrl;
+
+	/* Make sure all packets can fit into a single WQE. */
+	assert(elts_n > pkts_n);
+	mlx5_tx_complete(txq);
+	max_elts = (elts_n - (elts_head - txq->elts_tail));
+	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
+	if (unlikely(!pkts_n))
+		return 0;
+	elts = &(*txq->elts)[elts_head & elts_m];
+	/* Loop for available tailroom first. */
+	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
+	for (pos = 0; pos < (n & -2); pos += 2)
+		vst1q_u64((void *)&elts[pos], vld1q_u64((void *)&pkts[pos]));
+	if (n & 1)
+		elts[pos] = pkts[pos];
+	/* Check if it crosses the end of the queue. */
+	if (unlikely(n < pkts_n)) {
+		elts = &(*txq->elts)[0];
+		for (pos = 0; pos < pkts_n - n; ++pos)
+			elts[pos] = pkts[n + pos];
+	}
+	txq->elts_head += pkts_n;
+	/* Save title WQEBB pointer. */
+	t_wqe = (uint8x16_t *)wqe;
+	dseg = (uint8_t *)(wqe + 1);
+	/* Calculate the number of entries to the end. */
+	n = RTE_MIN(
+		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
+		pkts_n);
+	/* Fill DSEGs. */
+	txq_wr_dseg_v(txq, dseg, pkts, n);
+	/* Check if it crosses the end of the queue. */
+	if (n < pkts_n) {
+		dseg = (uint8_t *)txq->wqes;
+		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
+	}
+	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
+		txq->elts_comp += pkts_n;
+	} else {
+		/* Request a completion. */
+		txq->elts_comp = 0;
+		++txq->cq_pi;
+		comp_req = 8;
+	}
+	/* Fill CTRL in the header. */
+	ctrl = vreinterpretq_u8_u32((uint32x4_t) {
+			MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
+			txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW,
+			txq->qp_num_8s | (pkts_n + 2),
+			comp_req,
+			txq->elts_head });
+	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
+	vst1q_u8((void *)t_wqe, ctrl);
+	/* Fill ESEG in the header. */
+	vst1q_u8((void *)(t_wqe + 1),
+		 (uint8x16_t) { 0, 0, 0, 0,
+				cs_flags, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0 });
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	txq->stats.opackets += pkts_n;
+#endif
+	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
+		       nb_dword_per_wqebb;
+	/* Ring QP doorbell. */
+	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
+	return pkts_n;
+}
+
+/**
+ * Store free buffers to RX SW ring.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
+ */
+static inline void
+rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
+{
+	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
+	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
+	unsigned int pos;
+	uint16_t p = n & -2;
+
+	for (pos = 0; pos < p; pos += 2) {
+		uint64x2_t mbp;
+
+		mbp = vld1q_u64((void *)&elts[pos]);
+		vst1q_u64((void *)&pkts[pos], mbp);
+	}
+	if (n & 1)
+		pkts[pos] = elts[pos];
+}
+
+/**
+ * Decompress a compressed completion and fill in mbufs in RX SW ring with data
+ * extracted from the title completion descriptor.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param cq
+ *   Pointer to completion array having a compressed completion at first.
+ * @param elts
+ *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
+ *   the title completion descriptor to be copied to the rest of mbufs.
+ */
+static inline void
+rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
+		    struct rte_mbuf **elts)
+{
+	volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info;
+	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+	unsigned int pos;
+	unsigned int i;
+	unsigned int inv = 0;
+	/* Mask to shuffle from extracted mini CQE to mbuf. */
+	const uint8x16_t mcqe_shuf_m1 = {
+		-1, -1, -1, -1, /* skip packet_type */
+		 7,  6, -1, -1, /* pkt_len, bswap16 */
+		 7,  6,         /* data_len, bswap16 */
+		-1, -1,         /* skip vlan_tci */
+		 3,  2,  1,  0  /* hash.rss, bswap32 */
+	};
+	const uint8x16_t mcqe_shuf_m2 = {
+		-1, -1, -1, -1, /* skip packet_type */
+		15, 14, -1, -1, /* pkt_len, bswap16 */
+		15, 14,         /* data_len, bswap16 */
+		-1, -1,         /* skip vlan_tci */
+		11, 10,  9,  8  /* hash.rss, bswap32 */
+	};
+	/* Restore the compressed count. Must be 16 bits. */
+	const uint16_t mcqe_n = t_pkt->data_len +
+				(rxq->crc_present * ETHER_CRC_LEN);
+	const uint64x2_t rearm =
+		vld1q_u64((void *)&t_pkt->rearm_data);
+	const uint32x4_t rxdf_mask = {
+		0xffffffff, /* packet_type */
+		0,          /* skip pkt_len */
+		0xffff0000, /* vlan_tci, skip data_len */
+		0,          /* skip hash.rss */
+	};
+	const uint8x16_t rxdf =
+		vandq_u8(vld1q_u8((void *)&t_pkt->rx_descriptor_fields1),
+			 vreinterpretq_u8_u32(rxdf_mask));
+	const uint16x8_t crc_adj = {
+		0, 0,
+		rxq->crc_present * ETHER_CRC_LEN, 0,
+		rxq->crc_present * ETHER_CRC_LEN, 0,
+		0, 0
+	};
+	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	uint32_t rcvd_byte = 0;
+#endif
+	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+	const uint8x8_t len_shuf_m = {
+		 7,  6,         /* 1st mCQE */
+		15, 14,         /* 2nd mCQE */
+		23, 22,         /* 3rd mCQE */
+		31, 30          /* 4th mCQE */
+	};
+
+	/*
+	 * A. load mCQEs into a 128bit register.
+	 * B. store rearm data to mbuf.
+	 * C. combine data from mCQEs with rx_descriptor_fields1.
+	 * D. store rx_descriptor_fields1.
+	 * E. store flow tag (rte_flow mark).
+	 */
+	for (pos = 0; pos < mcqe_n; ) {
+		uint8_t *p = (void *)&mcq[pos % 8];
+		uint8_t *e0 = (void *)&elts[pos]->rearm_data;
+		uint8_t *e1 = (void *)&elts[pos + 1]->rearm_data;
+		uint8_t *e2 = (void *)&elts[pos + 2]->rearm_data;
+		uint8_t *e3 = (void *)&elts[pos + 3]->rearm_data;
+		uint16x4_t byte_cnt;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		uint16x4_t invalid_mask =
+			vcreate_u16(mcqe_n - pos < MLX5_VPMD_DESCS_PER_LOOP ?
+				    -1UL << ((mcqe_n - pos) *
+					     sizeof(uint16_t) * 8) : 0);
+#endif
+
+		if (!(pos & 0x7) && pos + 8 < mcqe_n)
+			rte_prefetch0((void *)(cq + pos + 8));
+		__asm__ volatile (
+		/* A.1 load mCQEs into a 128bit register. */
+		"ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t"
+		/* B.1 store rearm data to mbuf. */
+		"st1 {%[rearm].2d}, [%[e0]] \n\t"
+		"add %[e0], %[e0], #16 \n\t"
+		"st1 {%[rearm].2d}, [%[e1]] \n\t"
+		"add %[e1], %[e1], #16 \n\t"
+		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+		"tbl v18.16b, {v16.16b}, %[mcqe_shuf_m1].16b \n\t"
+		"tbl v19.16b, {v16.16b}, %[mcqe_shuf_m2].16b \n\t"
+		"sub v18.8h, v18.8h, %[crc_adj].8h \n\t"
+		"sub v19.8h, v19.8h, %[crc_adj].8h \n\t"
+		"orr v18.16b, v18.16b, %[rxdf].16b \n\t"
+		"orr v19.16b, v19.16b, %[rxdf].16b \n\t"
+		/* D.1 store rx_descriptor_fields1. */
+		"st1 {v18.2d}, [%[e0]] \n\t"
+		"st1 {v19.2d}, [%[e1]] \n\t"
+		/* B.1 store rearm data to mbuf. */
+		"st1 {%[rearm].2d}, [%[e2]] \n\t"
+		"add %[e2], %[e2], #16 \n\t"
+		"st1 {%[rearm].2d}, [%[e3]] \n\t"
+		"add %[e3], %[e3], #16 \n\t"
+		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+		"tbl v18.16b, {v17.16b}, %[mcqe_shuf_m1].16b \n\t"
+		"tbl v19.16b, {v17.16b}, %[mcqe_shuf_m2].16b \n\t"
+		"sub v18.8h, v18.8h, %[crc_adj].8h \n\t"
+		"sub v19.8h, v19.8h, %[crc_adj].8h \n\t"
+		"orr v18.16b, v18.16b, %[rxdf].16b \n\t"
+		"orr v19.16b, v19.16b, %[rxdf].16b \n\t"
+		/* D.1 store rx_descriptor_fields1. */
+		"st1 {v18.2d}, [%[e2]] \n\t"
+		"st1 {v19.2d}, [%[e3]] \n\t"
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		"tbl %[byte_cnt].8b, {v16.16b - v17.16b}, %[len_shuf_m].8b \n\t"
+#endif
+		:[byte_cnt]"=&w"(byte_cnt)
+		:[mcq]"r"(p),
+		 [rxdf]"w"(rxdf),
+		 [rearm]"w"(rearm),
+		 [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+		 [mcqe_shuf_m1]"w"(mcqe_shuf_m1),
+		 [mcqe_shuf_m2]"w"(mcqe_shuf_m2),
+		 [crc_adj]"w"(crc_adj),
+		 [len_shuf_m]"w"(len_shuf_m)
+		:"memory", "v16", "v17", "v18", "v19");
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+		rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+		if (rxq->mark) {
+			/* E.1 store flow tag (rte_flow mark). */
+			elts[pos]->hash.fdir.hi = flow_tag;
+			elts[pos + 1]->hash.fdir.hi = flow_tag;
+			elts[pos + 2]->hash.fdir.hi = flow_tag;
+			elts[pos + 3]->hash.fdir.hi = flow_tag;
+		}
+		pos += MLX5_VPMD_DESCS_PER_LOOP;
+		/* Move to next CQE and invalidate consumed CQEs. */
+		if (!(pos & 0x7) && pos < mcqe_n) {
+			mcq = (void *)&(cq + pos)->pkt_info;
+			for (i = 0; i < 8; ++i)
+				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+		}
+	}
+	/* Invalidate the rest of CQEs. */
+	for (; inv < mcqe_n; ++inv)
+		cq[inv].op_own = MLX5_CQE_INVALIDATE;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	rxq->stats.ipackets += mcqe_n;
+	rxq->stats.ibytes += rcvd_byte;
+#endif
+	rxq->cq_ci += mcqe_n;
+}
+
+/**
+ * Calculate packet type and offload flag for mbuf and store it.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param ptype_info
+ *   Array of four 4bytes packet type info extracted from the original
+ *   completion descriptor.
+ * @param flow_tag
+ *   Array of four 4bytes flow ID extracted from the original completion
+ *   descriptor.
+ * @param op_err
+ *   Opcode vector having responder error status. Each field is 4B.
+ * @param pkts
+ *   Pointer to array of packets to be filled.
+ */
+static inline void
+rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
+			 uint32x4_t ptype_info, uint32x4_t flow_tag,
+			 uint16x4_t op_err, struct rte_mbuf **pkts)
+{
+	uint16x4_t ptype;
+	uint32x4_t pinfo, cv_flags;
+	uint32x4_t ol_flags =
+		vdupq_n_u32(rxq->rss_hash * PKT_RX_RSS_HASH |
+			    rxq->hw_timestamp * PKT_RX_TIMESTAMP);
+	const uint32x4_t ptype_ol_mask = { 0x106, 0x106, 0x106, 0x106 };
+	const uint8x16_t cv_flag_sel = {
+		0,
+		(uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+		(uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
+		0,
+		(uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
+		0,
+		(uint8_t)((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+		0, 0, 0, 0, 0, 0, 0, 0, 0
+	};
+	const uint32x4_t cv_mask =
+		vdupq_n_u32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+			    PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
+	const uint64x1_t mbuf_init = vld1_u64(&rxq->mbuf_initializer);
+	const uint64x1_t r32_mask = vcreate_u64(0xffffffff);
+	uint64x2_t rearm0, rearm1, rearm2, rearm3;
+
+	if (rxq->mark) {
+		const uint32x4_t ft_def = vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT);
+		const uint32x4_t fdir_flags = vdupq_n_u32(PKT_RX_FDIR);
+		const uint32x4_t fdir_id_flags = vdupq_n_u32(PKT_RX_FDIR_ID);
+
+		/* Check if flow tag is non-zero then set PKT_RX_FDIR. */
+		ol_flags = vorrq_u32(ol_flags, vbicq_u32(fdir_flags,
+							 vceqzq_u32(flow_tag)));
+		/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+		ol_flags = vorrq_u32(ol_flags,
+				     vbicq_u32(fdir_id_flags,
+					       vceqq_u32(flow_tag, ft_def)));
+	}
+	/*
+	 * ptype_info has the following:
+	 * bit[1]     = l3_ok
+	 * bit[2]     = l4_ok
+	 * bit[8]     = cv
+	 * bit[11:10] = l3_hdr_type
+	 * bit[14:12] = l4_hdr_type
+	 * bit[15]    = ip_frag
+	 * bit[16]    = tunneled
+	 * bit[17]    = outer_l3_type
+	 */
+	ptype = vshrn_n_u32(ptype_info, 10);
+	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
+	ptype = vorr_u16(ptype, op_err);
+	pkts[0]->packet_type =
+		mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 6)];
+	pkts[1]->packet_type =
+		mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 4)];
+	pkts[2]->packet_type =
+		mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 2)];
+	pkts[3]->packet_type =
+		mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 0)];
+	/* Fill flags for checksum and VLAN. */
+	pinfo = vandq_u32(ptype_info, ptype_ol_mask);
+	pinfo = vreinterpretq_u32_u8(
+		vqtbl1q_u8(cv_flag_sel, vreinterpretq_u8_u32(pinfo)));
+	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
+	cv_flags = vshlq_n_u32(pinfo, 9);
+	cv_flags = vorrq_u32(pinfo, cv_flags);
+	/* Move back flags to start from byte[0]. */
+	cv_flags = vshrq_n_u32(cv_flags, 8);
+	/* Mask out garbage bits. */
+	cv_flags = vandq_u32(cv_flags, cv_mask);
+	/* Merge to ol_flags. */
+	ol_flags = vorrq_u32(ol_flags, cv_flags);
+	/* Merge mbuf_init and ol_flags, and store. */
+	rearm0 = vcombine_u64(mbuf_init,
+			      vshr_n_u64(vget_high_u64(vreinterpretq_u64_u32(
+						       ol_flags)), 32));
+	rearm1 = vcombine_u64(mbuf_init,
+			      vand_u64(vget_high_u64(vreinterpretq_u64_u32(
+						     ol_flags)), r32_mask));
+	rearm2 = vcombine_u64(mbuf_init,
+			      vshr_n_u64(vget_low_u64(vreinterpretq_u64_u32(
+						      ol_flags)), 32));
+	rearm3 = vcombine_u64(mbuf_init,
+			      vand_u64(vget_low_u64(vreinterpretq_u64_u32(
+						    ol_flags)), r32_mask));
+	vst1q_u64((void *)&pkts[0]->rearm_data, rearm0);
+	vst1q_u64((void *)&pkts[1]->rearm_data, rearm1);
+	vst1q_u64((void *)&pkts[2]->rearm_data, rearm2);
+	vst1q_u64((void *)&pkts[3]->rearm_data, rearm3);
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	const uint16_t q_n = 1 << rxq->cqe_n;
+	const uint16_t q_mask = q_n - 1;
+	volatile struct mlx5_cqe *cq;
+	struct rte_mbuf **elts;
+	unsigned int pos;
+	uint64_t n;
+	uint16_t repl_n;
+	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+	uint16_t nocmp_n = 0;
+	uint16_t rcvd_pkt = 0;
+	unsigned int cq_idx = rxq->cq_ci & q_mask;
+	unsigned int elts_idx;
+	const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));
+	const uint16x4_t owner_check = vcreate_u16(0x0001000100010001);
+	const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);
+	const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c);
+	const uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	uint32_t rcvd_byte = 0;
+#endif
+	/* Mask to generate 16B length vector. */
+	const uint8x8_t len_shuf_m = {
+		52, 53,         /* 4th CQE */
+		36, 37,         /* 3rd CQE */
+		20, 21,         /* 2nd CQE */
+		 4,  5          /* 1st CQE */
+	};
+	/* Mask to extract 16B data from a 64B CQE. */
+	const uint8x16_t cqe_shuf_m = {
+		28, 29,         /* hdr_type_etc */
+		 0,             /* pkt_info */
+		-1,             /* null */
+		47, 46,         /* byte_cnt, bswap16 */
+		31, 30,         /* vlan_info, bswap16 */
+		15, 14, 13, 12, /* rx_hash_res, bswap32 */
+		57, 58, 59,     /* flow_tag */
+		63              /* op_own */
+	};
+	/* Mask to generate 16B data for mbuf. */
+	const uint8x16_t mb_shuf_m = {
+		 4,  5, -1, -1, /* pkt_len */
+		 4,  5,         /* data_len */
+		 6,  7,         /* vlan_tci */
+		 8,  9, 10, 11, /* hash.rss */
+		12, 13, 14, -1  /* hash.fdir.hi */
+	};
+	/* Mask to generate 16B owner vector. */
+	const uint8x8_t owner_shuf_m = {
+		63, -1,         /* 4th CQE */
+		47, -1,         /* 3rd CQE */
+		31, -1,         /* 2nd CQE */
+		15, -1          /* 1st CQE */
+	};
+	/* Mask to generate a vector having packet_type/ol_flags. */
+	const uint8x16_t ptype_shuf_m = {
+		48, 49, 50, -1, /* 4th CQE */
+		32, 33, 34, -1, /* 3rd CQE */
+		16, 17, 18, -1, /* 2nd CQE */
+		 0,  1,  2, -1  /* 1st CQE */
+	};
+	/* Mask to generate a vector having flow tags. */
+	const uint8x16_t ftag_shuf_m = {
+		60, 61, 62, -1, /* 4th CQE */
+		44, 45, 46, -1, /* 3rd CQE */
+		28, 29, 30, -1, /* 2nd CQE */
+		12, 13, 14, -1  /* 1st CQE */
+	};
+	const uint16x8_t crc_adj = {
+		0, 0, rxq->crc_present * ETHER_CRC_LEN, 0, 0, 0, 0, 0
+	};
+	const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) };
+
+	assert(rxq->sges_n == 0);
+	assert(rxq->cqe_n == rxq->elts_n);
+	cq = &(*rxq->cqes)[cq_idx];
+	rte_prefetch_non_temporal(cq);
+	rte_prefetch_non_temporal(cq + 1);
+	rte_prefetch_non_temporal(cq + 2);
+	rte_prefetch_non_temporal(cq + 3);
+	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+	/*
+	 * Order of indexes:
+	 *   rq_ci >= cq_ci >= rq_pi
+	 * Definition of indexes:
+	 *   rq_ci - cq_ci := # of buffers owned by HW (posted).
+	 *   cq_ci - rq_pi := # of buffers not returned to app (decompressed).
+	 *   N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
+	 */
+	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
+	if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH)
+		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
+	/* See if there're unreturned mbufs from compressed CQE. */
+	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
+	if (rcvd_pkt > 0) {
+		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
+		rxq->rq_pi += rcvd_pkt;
+		pkts += rcvd_pkt;
+	}
+	elts_idx = rxq->rq_pi & q_mask;
+	elts = &(*rxq->elts)[elts_idx];
+	/* Not to overflow pkts array. */
+	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
+	/* Not to cross queue end. */
+	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
+	if (!pkts_n)
+		return rcvd_pkt;
+	/* At this point, there shouldn't be any remained packets. */
+	assert(rxq->rq_pi == rxq->cq_ci);
+	/*
+	 * Note that vectors have reverse order - {v3, v2, v1, v0}, because
+	 * there's no instruction to count trailing zeros. __builtin_clzl() is
+	 * used instead.
+	 *
+	 * A. copy 4 mbuf pointers from elts ring to returing pkts.
+	 * B. load 64B CQE and extract necessary fields
+	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
+	 *    following structure:
+	 *        struct {
+	 *          uint16_t hdr_type_etc;
+	 *          uint8_t  pkt_info;
+	 *          uint8_t  rsvd;
+	 *          uint16_t byte_cnt;
+	 *          uint16_t vlan_info;
+	 *          uint32_t rx_has_res;
+	 *          uint8_t  flow_tag[3];
+	 *          uint8_t  op_own;
+	 *        } c;
+	 * C. fill in mbuf.
+	 * D. get valid CQEs.
+	 * E. find compressed CQE.
+	 */
+	for (pos = 0;
+	     pos < pkts_n;
+	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
+		uint16x4_t op_own;
+		uint16x4_t opcode, owner_mask, invalid_mask;
+		uint16x4_t comp_mask;
+		uint16x4_t mask;
+		uint16x4_t byte_cnt;
+		uint32x4_t ptype_info, flow_tag;
+		uint8_t *p0, *p1, *p2, *p3;
+		uint8_t *e0 = (void *)&elts[pos]->pkt_len;
+		uint8_t *e1 = (void *)&elts[pos + 1]->pkt_len;
+		uint8_t *e2 = (void *)&elts[pos + 2]->pkt_len;
+		uint8_t *e3 = (void *)&elts[pos + 3]->pkt_len;
+		void *elts_p = (void *)&elts[pos];
+		void *pkts_p = (void *)&pkts[pos];
+
+		/* A.0 do not cross the end of CQ. */
+		mask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ?
+				   -1UL >> ((pkts_n - pos) *
+					    sizeof(uint16_t) * 8) : 0);
+		p0 = (void *)&cq[pos].pkt_info;
+		p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
+		p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
+		p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
+		/* Prefetch next 4 CQEs. */
+		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+			unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
+			rte_prefetch_non_temporal(&cq[next]);
+			rte_prefetch_non_temporal(&cq[next + 1]);
+			rte_prefetch_non_temporal(&cq[next + 2]);
+			rte_prefetch_non_temporal(&cq[next + 3]);
+		}
+		__asm__ volatile (
+		/* B.1 (CQE 3) load a block having op_own. */
+		"ld1 {v19.16b}, [%[p3]] \n\t"
+		"sub %[p3], %[p3], #48 \n\t"
+		/* B.2 (CQE 3) load the rest blocks. */
+		"ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
+		/* B.3 (CQE 3) extract 16B fields. */
+		"tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+		/* B.4 (CQE 3) adjust CRC length. */
+		"sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
+		/* B.1 (CQE 2) load a block having op_own. */
+		"ld1 {v19.16b}, [%[p2]] \n\t"
+		"sub %[p2], %[p2], #48 \n\t"
+		/* C.1 (CQE 3) generate final structure for mbuf. */
+		"tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
+		/* B.2 (CQE 2) load the rest blocks. */
+		"ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
+		/* B.3 (CQE 2) extract 16B fields. */
+		"tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+		/* B.4 (CQE 2) adjust CRC length. */
+		"sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
+		/* B.1 (CQE 1) load a block having op_own. */
+		"ld1 {v19.16b}, [%[p1]] \n\t"
+		"sub %[p1], %[p1], #48 \n\t"
+		/* C.1 (CQE 2) generate final structure for mbuf. */
+		"tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
+		/* B.2 (CQE 1) load the rest blocks. */
+		"ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
+		/* B.3 (CQE 1) extract 16B fields. */
+		"tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+		/* B.4 (CQE 1) adjust CRC length. */
+		"sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
+		/* B.1 (CQE 0) load a block having op_own. */
+		"ld1 {v19.16b}, [%[p0]] \n\t"
+		"sub %[p0], %[p0], #48 \n\t"
+		/* C.1 (CQE 1) generate final structure for mbuf. */
+		"tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
+		/* B.2 (CQE 0) load the rest blocks. */
+		"ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
+		/* B.3 (CQE 0) extract 16B fields. */
+		"tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+		/* B.4 (CQE 0) adjust CRC length. */
+		"sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
+		/* A.1 load mbuf pointers. */
+		"ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
+		/* D.1 extract op_own byte. */
+		"tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
+		/* C.2 (CQE 3) adjust flow mark. */
+		"add v15.4s, v15.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. */
+		"st1 {v15.2d}, [%[e3]] \n\t"
+		/* C.2 (CQE 2) adjust flow mark. */
+		"add v14.4s, v14.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */
+		"st1 {v14.2d}, [%[e2]] \n\t"
+		/* C.1 (CQE 0) generate final structure for mbuf. */
+		"tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \n\t"
+		/* C.2 (CQE 1) adjust flow mark. */
+		"add v13.4s, v13.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */
+		"st1 {v13.2d}, [%[e1]] \n\t"
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Extract byte_cnt. */
+		"tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \n\t"
+#endif
+		/* Extract ptype_info. */
+		"tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \n\t"
+		/* Extract flow_tag. */
+		"tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \n\t"
+		/* A.2 copy mbuf pointers. */
+		"st1 {v24.2d - v25.2d}, [%[pkts_p]] \n\t"
+		/* C.2 (CQE 0) adjust flow mark. */
+		"add v12.4s, v12.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */
+		"st1 {v12.2d}, [%[e0]] \n\t"
+		:[op_own]"=&w"(op_own),
+		 [byte_cnt]"=&w"(byte_cnt),
+		 [ptype_info]"=&w"(ptype_info),
+		 [flow_tag]"=&w"(flow_tag)
+		:[p3]"r"(p3 + 48), [p2]"r"(p2 + 48),
+		 [p1]"r"(p1 + 48), [p0]"r"(p0 + 48),
+		 [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+		 [elts_p]"r"(elts_p),
+		 [pkts_p]"r"(pkts_p),
+		 [cqe_shuf_m]"w"(cqe_shuf_m),
+		 [mb_shuf_m]"w"(mb_shuf_m),
+		 [owner_shuf_m]"w"(owner_shuf_m),
+		 [len_shuf_m]"w"(len_shuf_m),
+		 [ptype_shuf_m]"w"(ptype_shuf_m),
+		 [ftag_shuf_m]"w"(ftag_shuf_m),
+		 [crc_adj]"w"(crc_adj),
+		 [flow_mark_adj]"w"(flow_mark_adj)
+		:"memory",
+		 "v12", "v13", "v14", "v15",
+		 "v16", "v17", "v18", "v19",
+		 "v20", "v21", "v22", "v23",
+		 "v24", "v25");
+		/* D.2 flip owner bit to mark CQEs from last round. */
+		owner_mask = vand_u16(op_own, owner_check);
+		owner_mask = vceq_u16(owner_mask, ownership);
+		/* D.3 get mask for invalidated CQEs. */
+		opcode = vand_u16(op_own, opcode_check);
+		invalid_mask = vceq_u16(opcode_check, opcode);
+		/* E.1 find compressed CQE format. */
+		comp_mask = vand_u16(op_own, format_check);
+		comp_mask = vceq_u16(comp_mask, format_check);
+		/* D.4 mask out beyond boundary. */
+		invalid_mask = vorr_u16(invalid_mask, mask);
+		/* D.5 merge invalid_mask with invalid owner. */
+		invalid_mask = vorr_u16(invalid_mask, owner_mask);
+		/* E.2 mask out invalid entries. */
+		comp_mask = vbic_u16(comp_mask, invalid_mask);
+		/* E.3 get the first compressed CQE. */
+		comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+					  comp_mask), 0)) /
+					  (sizeof(uint16_t) * 8);
+		/* D.6 mask out entries after the compressed CQE. */
+		mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ?
+				   -1UL >> (comp_idx * sizeof(uint16_t) * 8) :
+				   0);
+		invalid_mask = vorr_u16(invalid_mask, mask);
+		/* D.7 count non-compressed valid CQEs. */
+		n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+				   invalid_mask), 0)) / (sizeof(uint16_t) * 8);
+		nocmp_n += n;
+		/* D.2 get the final invalid mask. */
+		mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ?
+				   -1UL >> (n * sizeof(uint16_t) * 8) : 0);
+		invalid_mask = vorr_u16(invalid_mask, mask);
+		/* D.3 check error in opcode. */
+		opcode = vceq_u16(resp_err_check, opcode);
+		opcode = vbic_u16(opcode, invalid_mask);
+		/* D.4 mark if any error is set */
+		rxq->pending_err |=
+			!!vget_lane_u64(vreinterpret_u64_u16(opcode), 0);
+		/* C.4 fill in mbuf - rearm_data and packet_type. */
+		rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag,
+					 opcode, &elts[pos]);
+		if (rxq->hw_timestamp) {
+			elts[pos]->timestamp =
+				rte_be_to_cpu_64(
+					container_of(p0, struct mlx5_cqe,
+						     pkt_info)->timestamp);
+			elts[pos + 1]->timestamp =
+				rte_be_to_cpu_64(
+					container_of(p1, struct mlx5_cqe,
+						     pkt_info)->timestamp);
+			elts[pos + 2]->timestamp =
+				rte_be_to_cpu_64(
+					container_of(p2, struct mlx5_cqe,
+						     pkt_info)->timestamp);
+			elts[pos + 3]->timestamp =
+				rte_be_to_cpu_64(
+					container_of(p3, struct mlx5_cqe,
+						     pkt_info)->timestamp);
+		}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Add up received bytes count. */
+		byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+		rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+		/*
+		 * Break the loop unless more valid CQE is expected, or if
+		 * there's a compressed CQE.
+		 */
+		if (n != MLX5_VPMD_DESCS_PER_LOOP)
+			break;
+	}
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+		return rcvd_pkt;
+	/* Update the consumer indexes for non-compressed CQEs. */
+	assert(nocmp_n <= pkts_n);
+	rxq->cq_ci += nocmp_n;
+	rxq->rq_pi += nocmp_n;
+	rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	rxq->stats.ipackets += nocmp_n;
+	rxq->stats.ibytes += rcvd_byte;
+#endif
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+		assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->cq_ci - rxq->rq_pi;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
+			rxq->rq_pi += n;
+			rcvd_pkt += n;
+		}
+	}
+	rte_compiler_barrier();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	return rcvd_pkt;
+}
+
+#endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 8560f745..2b9f1601 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -31,38 +31,23 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_
+
 #include <assert.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdlib.h>
 #include <smmintrin.h>
 
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
-#include <infiniband/verbs.h>
-#include <infiniband/mlx5_hw.h>
-#include <infiniband/arch.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
-
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_mbuf.h>
 #include <rte_mempool.h>
 #include <rte_prefetch.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
 #include "mlx5_rxtx.h"
+#include "mlx5_rxtx_vec.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
 #include "mlx5_prm.h"
@@ -77,14 +62,14 @@
  * @param txq
  *   Pointer to TX queue structure.
  * @param dseg
- *   Pointer to buffer descriptor to be writen.
+ *   Pointer to buffer descriptor to be written.
  * @param pkts
  *   Pointer to array of packets to be sent.
  * @param n
  *   Number of packets to be filled.
  */
 static inline void
-txq_wr_dseg_v(struct txq *txq, __m128i *dseg,
+txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
 	      struct rte_mbuf **pkts, unsigned int n)
 {
 	unsigned int pos;
@@ -119,85 +104,6 @@ txq_wr_dseg_v(struct txq *txq, __m128i *dseg,
 }
 
 /**
- * Count the number of continuous single segment packets. The first packet must
- * be a single segment packet.
- *
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- *
- * @return
- *   Number of continuous single segment packets.
- */
-static inline unsigned int
-txq_check_multiseg(struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	unsigned int pos;
-
-	if (!pkts_n)
-		return 0;
-	assert(NB_SEGS(pkts[0]) == 1);
-	/* Count the number of continuous single segment packets. */
-	for (pos = 1; pos < pkts_n; ++pos)
-		if (NB_SEGS(pkts[pos]) > 1)
-			break;
-	return pos;
-}
-
-/**
- * Count the number of packets having same ol_flags and calculate cs_flags.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param pkts
- *   Pointer to array of packets.
- * @param pkts_n
- *   Number of packets.
- * @param cs_flags
- *   Pointer of flags to be returned.
- *
- * @return
- *   Number of packets having same ol_flags.
- */
-static inline unsigned int
-txq_calc_offload(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-		 uint8_t *cs_flags)
-{
-	unsigned int pos;
-	const uint64_t ol_mask =
-		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
-		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
-		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
-
-	if (!pkts_n)
-		return 0;
-	/* Count the number of packets having same ol_flags. */
-	for (pos = 1; pos < pkts_n; ++pos)
-		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
-			break;
-	/* Should open another MPW session for the rest. */
-	if (pkts[0]->ol_flags &
-	    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-		const uint64_t is_tunneled =
-			pkts[0]->ol_flags &
-			(PKT_TX_TUNNEL_GRE |
-			 PKT_TX_TUNNEL_VXLAN);
-
-		if (is_tunneled && txq->tunnel_en) {
-			*cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
-				    MLX5_ETH_WQE_L4_INNER_CSUM;
-			if (pkts[0]->ol_flags & PKT_TX_OUTER_IP_CKSUM)
-				*cs_flags |= MLX5_ETH_WQE_L3_CSUM;
-		} else {
-			*cs_flags = MLX5_ETH_WQE_L3_CSUM |
-				    MLX5_ETH_WQE_L4_CSUM;
-		}
-	}
-	return pos;
-}
-
-/**
  * Send multi-segmented packets until it encounters a single segment packet in
  * the pkts list.
  *
@@ -212,7 +118,8 @@ txq_calc_offload(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
  *   Number of packets successfully transmitted (<= pkts_n).
  */
 static uint16_t
-txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
+	      uint16_t pkts_n)
 {
 	uint16_t elts_head = txq->elts_head;
 	const uint16_t elts_n = 1 << txq->elts_n;
@@ -257,13 +164,17 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (segs_n == 1 ||
 		    max_elts < segs_n || max_wqe < 2)
 			break;
+		if (segs_n > MLX5_MPW_DSEG_MAX) {
+			txq->stats.oerrors++;
+			break;
+		}
 		wqe = &((volatile struct mlx5_wqe64 *)
 			 txq->wqes)[wqe_ci & wq_mask].hdr;
 		if (buf->ol_flags &
 		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-			const uint64_t is_tunneled = buf->ol_flags &
-						      (PKT_TX_TUNNEL_GRE |
-						       PKT_TX_TUNNEL_VXLAN);
+			const uint64_t is_tunneled =
+				buf->ol_flags & (PKT_TX_TUNNEL_GRE |
+						 PKT_TX_TUNNEL_VXLAN);
 
 			if (is_tunneled && txq->tunnel_en) {
 				cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
@@ -298,7 +209,7 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Fill ESEG in the header. */
 		_mm_store_si128(t_wqe + 1,
 				_mm_set_epi16(0, 0, 0, 0,
-					      htons(len), cs_flags,
+					      rte_cpu_to_be_16(len), cs_flags,
 					      0, 0));
 		txq->wqe_ci = wqe_ci;
 	}
@@ -307,7 +218,7 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
 	txq->elts_head = elts_head;
 	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		wqe->ctrl[2] = htonl(8);
+		wqe->ctrl[2] = rte_cpu_to_be_32(8);
 		wqe->ctrl[3] = txq->elts_head;
 		txq->elts_comp = 0;
 		++txq->cq_pi;
@@ -338,7 +249,7 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
  *   Number of packets successfully transmitted (<= pkts_n).
  */
 static inline uint16_t
-txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
+txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	    uint8_t cs_flags)
 {
 	struct rte_mbuf **elts;
@@ -374,6 +285,7 @@ txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	max_elts = (elts_n - (elts_head - txq->elts_tail));
 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
 	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
+	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
 	if (unlikely(!pkts_n))
 		return 0;
 	elts = &(*txq->elts)[elts_head & elts_m];
@@ -432,87 +344,11 @@ txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
 		       nb_dword_per_wqebb;
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
 	return pkts_n;
 }
 
 /**
- * DPDK callback for vectorized TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts,
-		      uint16_t pkts_n)
-{
-	struct txq *txq = (struct txq *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint16_t n;
-		uint16_t ret;
-
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
- * DPDK callback for vectorized TX with multi-seg packets and offload.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-uint16_t
-mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct txq *txq = (struct txq *)dpdk_txq;
-	uint16_t nb_tx = 0;
-
-	while (pkts_n > nb_tx) {
-		uint8_t cs_flags = 0;
-		uint16_t n;
-		uint16_t ret;
-
-		/* Transmit multi-seg packets in the head of pkts list. */
-		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) &&
-		    NB_SEGS(pkts[nb_tx]) > 1)
-			nb_tx += txq_scatter_v(txq,
-					       &pkts[nb_tx],
-					       pkts_n - nb_tx);
-		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS))
-			n = txq_check_multiseg(&pkts[nb_tx], n);
-		if (!(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
-			n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
-		nb_tx += ret;
-		if (!ret)
-			break;
-	}
-	return nb_tx;
-}
-
-/**
  * Store free buffers to RX SW ring.
  *
  * @param rxq
@@ -523,7 +359,7 @@ mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
  *   Number of packets to be stored.
  */
 static inline void
-rxq_copy_mbuf_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t n)
+rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
 {
 	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
 	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
@@ -541,41 +377,6 @@ rxq_copy_mbuf_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t n)
 }
 
 /**
- * Replenish buffers for RX in bulk.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param n
- *   Number of buffers to be replenished.
- */
-static inline void
-rxq_replenish_bulk_mbuf(struct rxq *rxq, uint16_t n)
-{
-	const uint16_t q_n = 1 << rxq->elts_n;
-	const uint16_t q_mask = q_n - 1;
-	const uint16_t elts_idx = rxq->rq_ci & q_mask;
-	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
-	volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx];
-	unsigned int i;
-
-	assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH);
-	assert(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
-	assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP);
-	/* Not to cross queue end. */
-	n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
-	if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
-		rxq->stats.rx_nombuf += n;
-		return;
-	}
-	for (i = 0; i < n; ++i)
-		wq[i].addr = htonll((uintptr_t)elts[i]->buf_addr +
-				    RTE_PKTMBUF_HEADROOM);
-	rxq->rq_ci += n;
-	rte_wmb();
-	*rxq->rq_db = htonl(rxq->rq_ci);
-}
-
-/**
  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
  * extracted from the title completion descriptor.
  *
@@ -588,8 +389,7 @@ rxq_replenish_bulk_mbuf(struct rxq *rxq, uint16_t n)
  *   the title completion descriptor to be copied to the rest of mbufs.
  */
 static inline void
-rxq_cq_decompress_v(struct rxq *rxq,
-		    volatile struct mlx5_cqe *cq,
+rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		    struct rte_mbuf **elts)
 {
 	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
@@ -636,13 +436,6 @@ rxq_cq_decompress_v(struct rxq *rxq,
 			     10, 11,  2,  3);
 #endif
 
-	/* Compile time sanity check for this function. */
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
-			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
-			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
-			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 	/*
 	 * A. load mCQEs into a 128bit register.
 	 * B. store rearm data to mbuf.
@@ -747,12 +540,13 @@ rxq_cq_decompress_v(struct rxq *rxq,
  *   Pointer to array of packets to be filled.
  */
 static inline void
-rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err,
-			 struct rte_mbuf **pkts)
+rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
+			 __m128i op_err, struct rte_mbuf **pkts)
 {
 	__m128i pinfo0, pinfo1;
 	__m128i pinfo, ptype;
-	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH);
+	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH |
+					  rxq->hw_timestamp * PKT_RX_TIMESTAMP);
 	__m128i cv_flags;
 	const __m128i zero = _mm_setzero_si128();
 	const __m128i ptype_mask =
@@ -769,17 +563,17 @@ rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err,
 			     (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
 			     0,
 			     (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
-			     (uint8_t)(PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED),
+			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
 			     0);
 	const __m128i cv_mask =
 		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
+			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
 			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
+			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
 			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
+			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
 			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
+			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
 	const __m128i mbuf_init =
 		_mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
 	__m128i rearm0, rearm1, rearm2, rearm3;
@@ -853,15 +647,11 @@ rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err,
 	/* Merge to ol_flags. */
 	ol_flags = _mm_or_si128(ol_flags, cv_flags);
 	/* Merge mbuf_init and ol_flags. */
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
-			 offsetof(struct rte_mbuf, rearm_data) + 8);
 	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
 	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
 	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
 	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
 	/* Write 8B rearm_data and 8B ol_flags. */
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
-			 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
 	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
 	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
 	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
@@ -869,51 +659,6 @@ rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err,
 }
 
 /**
- * Skip error packets.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-rxq_handle_pending_error(struct rxq *rxq, struct rte_mbuf **pkts,
-			 uint16_t pkts_n)
-{
-	uint16_t n = 0;
-	unsigned int i;
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	uint32_t err_bytes = 0;
-#endif
-
-	for (i = 0; i < pkts_n; ++i) {
-		struct rte_mbuf *pkt = pkts[i];
-
-		if (pkt->packet_type == RTE_PTYPE_ALL_MASK) {
-#ifdef MLX5_PMD_SOFT_COUNTERS
-			err_bytes += PKT_LEN(pkt);
-#endif
-			rte_pktmbuf_free_seg(pkt);
-		} else {
-			pkts[n++] = pkt;
-		}
-	}
-	rxq->stats.idropped += (pkts_n - n);
-#ifdef MLX5_PMD_SOFT_COUNTERS
-	/* Correct counters of errored completions. */
-	rxq->stats.ipackets -= (pkts_n - n);
-	rxq->stats.ibytes -= err_bytes;
-#endif
-	rxq->pending_err = 0;
-	return n;
-}
-
-/**
  * Receive burst of packets. An errored completion also consumes a mbuf, but the
  * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
  * before returning to application.
@@ -929,7 +674,7 @@ rxq_handle_pending_error(struct rxq *rxq, struct rte_mbuf **pkts,
  *   Number of packets received including errors (<= pkts_n).
  */
 static inline uint16_t
-rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	const uint16_t q_n = 1 << rxq->cqe_n;
 	const uint16_t q_mask = q_n - 1;
@@ -984,26 +729,6 @@ rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			      rxq->crc_present * ETHER_CRC_LEN);
 	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
 
-	/* Compile time sanity check for this function. */
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
-			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
-			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
-	RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, pkt_info) != 0);
-	RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rx_hash_res) !=
-			 offsetof(struct mlx5_cqe, pkt_info) + 12);
-	RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rsvd1) +
-			  sizeof(((struct mlx5_cqe *)0)->rsvd1) !=
-			 offsetof(struct mlx5_cqe, hdr_type_etc));
-	RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, vlan_info) !=
-			 offsetof(struct mlx5_cqe, hdr_type_etc) + 2);
-	RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rsvd2) +
-			  sizeof(((struct mlx5_cqe *)0)->rsvd2) !=
-			 offsetof(struct mlx5_cqe, byte_cnt));
-	RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, sop_drop_qpn) !=
-			 RTE_ALIGN(offsetof(struct mlx5_cqe, sop_drop_qpn), 8));
-	RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, op_own) !=
-			 offsetof(struct mlx5_cqe, sop_drop_qpn) + 7);
 	assert(rxq->sges_n == 0);
 	assert(rxq->cqe_n == rxq->elts_n);
 	cq = &(*rxq->cqes)[cq_idx];
@@ -1022,7 +747,7 @@ rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	 */
 	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
 	if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH)
-		rxq_replenish_bulk_mbuf(rxq, repl_n);
+		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
 	/* See if there're unreturned mbufs from compressed CQE. */
 	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
 	if (rcvd_pkt > 0) {
@@ -1214,6 +939,16 @@ rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		rxq->pending_err |= !!_mm_cvtsi128_si64(opcode);
 		/* D.5 fill in mbuf - rearm_data and packet_type. */
 		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+		if (rxq->hw_timestamp) {
+			pkts[pos]->timestamp =
+				rte_be_to_cpu_64(cq[pos].timestamp);
+			pkts[pos + 1]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p1].timestamp);
+			pkts[pos + 2]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p2].timestamp);
+			pkts[pos + 3]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p3].timestamp);
+		}
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Add up received bytes count. */
 		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
@@ -1254,164 +989,9 @@ rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			rcvd_pkt += n;
 		}
 	}
-	rte_wmb();
-	*rxq->cq_db = htonl(rxq->cq_ci);
+	rte_compiler_barrier();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
 	return rcvd_pkt;
 }
 
-/**
- * DPDK callback for vectorized RX.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-uint16_t
-mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct rxq *rxq = dpdk_rxq;
-	uint16_t nb_rx;
-
-	nb_rx = rxq_burst_v(rxq, pkts, pkts_n);
-	if (unlikely(rxq->pending_err))
-		nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
-	return nb_rx;
-}
-
-/**
- * Check Tx queue flags are set for raw vectorized Tx.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-priv_check_raw_vec_tx_support(struct priv *priv)
-{
-	uint16_t i;
-
-	/* All the configured queues should support. */
-	for (i = 0; i < priv->txqs_n; ++i) {
-		struct txq *txq = (*priv->txqs)[i];
-
-		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) ||
-		    !(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
-			break;
-	}
-	if (i != priv->txqs_n)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
- * Check a device can support vectorized TX.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-priv_check_vec_tx_support(struct priv *priv)
-{
-	if (!priv->tx_vec_en ||
-	    priv->txqs_n > MLX5_VPMD_MIN_TXQS ||
-	    priv->mps != MLX5_MPW_ENHANCED ||
-	    priv->tso)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
- * Check a RX queue can support vectorized RX.
- *
- * @param rxq
- *   Pointer to RX queue.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-rxq_check_vec_support(struct rxq *rxq)
-{
-	struct rxq_ctrl *ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-
-	if (!ctrl->priv->rx_vec_en || rxq->sges_n != 0)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
- * Check a device can support vectorized RX.
- *
- * @param priv
- *   Pointer to private structure.
- *
- * @return
- *   1 if supported, negative errno value if not.
- */
-int __attribute__((cold))
-priv_check_vec_rx_support(struct priv *priv)
-{
-	uint16_t i;
-
-	if (!priv->rx_vec_en)
-		return -ENOTSUP;
-	/* All the configured queues should support. */
-	for (i = 0; i < priv->rxqs_n; ++i) {
-		struct rxq *rxq = (*priv->rxqs)[i];
-
-		if (rxq_check_vec_support(rxq) < 0)
-			break;
-	}
-	if (i != priv->rxqs_n)
-		return -ENOTSUP;
-	return 1;
-}
-
-/**
- * Prepare for vectorized RX.
- *
- * @param priv
- *   Pointer to private structure.
- */
-void
-priv_prep_vec_rx_function(struct priv *priv)
-{
-	uint16_t i;
-
-	for (i = 0; i < priv->rxqs_n; ++i) {
-		struct rxq *rxq = (*priv->rxqs)[i];
-		struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
-		const uint16_t desc = 1 << rxq->elts_n;
-		int j;
-
-		assert(rxq->elts_n == rxq->cqe_n);
-		/* Initialize default rearm_data for vPMD. */
-		mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
-		rte_mbuf_refcnt_set(mbuf_init, 1);
-		mbuf_init->nb_segs = 1;
-		mbuf_init->port = rxq->port_id;
-		/*
-		 * prevent compiler reordering:
-		 * rearm_data covers previous fields.
-		 */
-		rte_compiler_barrier();
-		rxq->mbuf_initializer =
-			*(uint64_t *)&mbuf_init->rearm_data;
-		/* Padding with a fake mbuf for vectorized Rx. */
-		for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
-			(*rxq->elts)[desc + j] = &rxq->fake_mbuf;
-		/* Mark that it need to be cleaned up for rxq_alloc_elts(). */
-		rxq->trim_elts = 1;
-	}
-}
+#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */
diff --git a/drivers/net/mlx5/mlx5_socket.c b/drivers/net/mlx5/mlx5_socket.c
new file mode 100644
index 00000000..5cd1ab80
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_socket.c
@@ -0,0 +1,294 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2016 6WIND S.A.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#define _GNU_SOURCE
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "mlx5.h"
+#include "mlx5_utils.h"
+
+/**
+ * Initialise the socket to communicate with the secondary process
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+int
+priv_socket_init(struct priv *priv)
+{
+	struct sockaddr_un sun = {
+		.sun_family = AF_UNIX,
+	};
+	int ret;
+	int flags;
+	struct stat file_stat;
+
+	/*
+	 * Initialise the socket to communicate with the secondary
+	 * process.
+	 */
+	ret = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (ret < 0) {
+		WARN("secondary process not supported: %s", strerror(errno));
+		return ret;
+	}
+	priv->primary_socket = ret;
+	flags = fcntl(priv->primary_socket, F_GETFL, 0);
+	if (flags == -1)
+		goto out;
+	ret = fcntl(priv->primary_socket, F_SETFL, flags | O_NONBLOCK);
+	if (ret < 0)
+		goto out;
+	snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d",
+		 MLX5_DRIVER_NAME, priv->primary_socket);
+	ret = stat(sun.sun_path, &file_stat);
+	if (!ret)
+		claim_zero(remove(sun.sun_path));
+	ret = bind(priv->primary_socket, (const struct sockaddr *)&sun,
+		   sizeof(sun));
+	if (ret < 0) {
+		WARN("cannot bind socket, secondary process not supported: %s",
+		     strerror(errno));
+		goto close;
+	}
+	ret = listen(priv->primary_socket, 0);
+	if (ret < 0) {
+		WARN("Secondary process not supported: %s", strerror(errno));
+		goto close;
+	}
+	return ret;
+close:
+	remove(sun.sun_path);
+out:
+	claim_zero(close(priv->primary_socket));
+	priv->primary_socket = 0;
+	return -(ret);
+}
+
+/**
+ * Un-Initialise the socket to communicate with the secondary process
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+int
+priv_socket_uninit(struct priv *priv)
+{
+	MKSTR(path, "/var/tmp/%s_%d", MLX5_DRIVER_NAME, priv->primary_socket);
+	claim_zero(close(priv->primary_socket));
+	priv->primary_socket = 0;
+	claim_zero(remove(path));
+	return 0;
+}
+
+/**
+ * Handle socket interrupts.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+void
+priv_socket_handle(struct priv *priv)
+{
+	int conn_sock;
+	int ret = 0;
+	struct cmsghdr *cmsg = NULL;
+	struct ucred *cred = NULL;
+	char buf[CMSG_SPACE(sizeof(struct ucred))] = { 0 };
+	char vbuf[1024] = { 0 };
+	struct iovec io = {
+		.iov_base = vbuf,
+		.iov_len = sizeof(*vbuf),
+	};
+	struct msghdr msg = {
+		.msg_iov = &io,
+		.msg_iovlen = 1,
+		.msg_control = buf,
+		.msg_controllen = sizeof(buf),
+	};
+	int *fd;
+
+	/* Accept the connection from the client. */
+	conn_sock = accept(priv->primary_socket, NULL, NULL);
+	if (conn_sock < 0) {
+		WARN("connection failed: %s", strerror(errno));
+		return;
+	}
+	ret = setsockopt(conn_sock, SOL_SOCKET, SO_PASSCRED, &(int){1},
+					 sizeof(int));
+	if (ret < 0) {
+		WARN("cannot change socket options");
+		goto out;
+	}
+	ret = recvmsg(conn_sock, &msg, MSG_WAITALL);
+	if (ret < 0) {
+		WARN("received an empty message: %s", strerror(errno));
+		goto out;
+	}
+	/* Expect to receive credentials only. */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (cmsg == NULL) {
+		WARN("no message");
+		goto out;
+	}
+	if ((cmsg->cmsg_type == SCM_CREDENTIALS) &&
+		(cmsg->cmsg_len >= sizeof(*cred))) {
+		cred = (struct ucred *)CMSG_DATA(cmsg);
+		assert(cred != NULL);
+	}
+	cmsg = CMSG_NXTHDR(&msg, cmsg);
+	if (cmsg != NULL) {
+		WARN("Message wrongly formatted");
+		goto out;
+	}
+	/* Make sure all the ancillary data was received and valid. */
+	if ((cred == NULL) || (cred->uid != getuid()) ||
+	    (cred->gid != getgid())) {
+		WARN("wrong credentials");
+		goto out;
+	}
+	/* Set-up the ancillary data. */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	assert(cmsg != NULL);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(priv->ctx->cmd_fd));
+	fd = (int *)CMSG_DATA(cmsg);
+	*fd = priv->ctx->cmd_fd;
+	ret = sendmsg(conn_sock, &msg, 0);
+	if (ret < 0)
+		WARN("cannot send response");
+out:
+	close(conn_sock);
+}
+
+/**
+ * Connect to the primary process.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   fd on success, negative errno value on failure.
+ */
+int
+priv_socket_connect(struct priv *priv)
+{
+	struct sockaddr_un sun = {
+		.sun_family = AF_UNIX,
+	};
+	int socket_fd;
+	int *fd = NULL;
+	int ret;
+	struct ucred *cred;
+	char buf[CMSG_SPACE(sizeof(*cred))] = { 0 };
+	char vbuf[1024] = { 0 };
+	struct iovec io = {
+		.iov_base = vbuf,
+		.iov_len = sizeof(*vbuf),
+	};
+	struct msghdr msg = {
+		.msg_control = buf,
+		.msg_controllen = sizeof(buf),
+		.msg_iov = &io,
+		.msg_iovlen = 1,
+	};
+	struct cmsghdr *cmsg;
+
+	ret = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (ret < 0) {
+		WARN("cannot connect to primary");
+		return ret;
+	}
+	socket_fd = ret;
+	snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d",
+		 MLX5_DRIVER_NAME, priv->primary_socket);
+	ret = connect(socket_fd, (const struct sockaddr *)&sun, sizeof(sun));
+	if (ret < 0) {
+		WARN("cannot connect to primary");
+		goto out;
+	}
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (cmsg == NULL) {
+		DEBUG("cannot get first message");
+		goto out;
+	}
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_CREDENTIALS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(*cred));
+	cred = (struct ucred *)CMSG_DATA(cmsg);
+	if (cred == NULL) {
+		DEBUG("no credentials received");
+		goto out;
+	}
+	cred->pid = getpid();
+	cred->uid = getuid();
+	cred->gid = getgid();
+	ret = sendmsg(socket_fd, &msg, MSG_DONTWAIT);
+	if (ret < 0) {
+		WARN("cannot send credentials to primary: %s",
+		     strerror(errno));
+		goto out;
+	}
+	ret = recvmsg(socket_fd, &msg, MSG_WAITALL);
+	if (ret <= 0) {
+		WARN("no message from primary: %s", strerror(errno));
+		goto out;
+	}
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (cmsg == NULL) {
+		WARN("No file descriptor received");
+		goto out;
+	}
+	fd = (int *)CMSG_DATA(cmsg);
+	if (*fd <= 0) {
+		WARN("no file descriptor received: %s", strerror(errno));
+		ret = *fd;
+		goto out;
+	}
+	ret = *fd;
+out:
+	close(socket_fd);
+	return ret;
+}
diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c
index 703f48c3..5e225d37 100644
--- a/drivers/net/mlx5/mlx5_stats.c
+++ b/drivers/net/mlx5/mlx5_stats.c
@@ -34,16 +34,9 @@
 #include <linux/sockios.h>
 #include <linux/ethtool.h>
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_ethdev.h>
 #include <rte_common.h>
 #include <rte_malloc.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
@@ -325,7 +318,7 @@ priv_xstats_reset(struct priv *priv)
  * @param[out] stats
  *   Stats structure output buffer.
  */
-void
+int
 mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 {
 	struct priv *priv = mlx5_get_priv(dev);
@@ -336,7 +329,7 @@ mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 	priv_lock(priv);
 	/* Add software counters. */
 	for (i = 0; (i != priv->rxqs_n); ++i) {
-		struct rxq *rxq = (*priv->rxqs)[i];
+		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
 
 		if (rxq == NULL)
 			continue;
@@ -357,7 +350,7 @@ mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 		tmp.rx_nombuf += rxq->stats.rx_nombuf;
 	}
 	for (i = 0; (i != priv->txqs_n); ++i) {
-		struct txq *txq = (*priv->txqs)[i];
+		struct mlx5_txq_data *txq = (*priv->txqs)[i];
 
 		if (txq == NULL)
 			continue;
@@ -367,19 +360,20 @@ mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 			tmp.q_opackets[idx] += txq->stats.opackets;
 			tmp.q_obytes[idx] += txq->stats.obytes;
 #endif
-			tmp.q_errors[idx] += txq->stats.odropped;
+			tmp.q_errors[idx] += txq->stats.oerrors;
 		}
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		tmp.opackets += txq->stats.opackets;
 		tmp.obytes += txq->stats.obytes;
 #endif
-		tmp.oerrors += txq->stats.odropped;
+		tmp.oerrors += txq->stats.oerrors;
 	}
 #ifndef MLX5_PMD_SOFT_COUNTERS
 	/* FIXME: retrieve and add hardware counters. */
 #endif
 	*stats = tmp;
 	priv_unlock(priv);
+	return 0;
 }
 
 /**
@@ -442,8 +436,10 @@ mlx5_xstats_get(struct rte_eth_dev *dev,
 
 		priv_lock(priv);
 		stats_n = priv_ethtool_get_stats_n(priv);
-		if (stats_n < 0)
+		if (stats_n < 0) {
+			priv_unlock(priv);
 			return -1;
+		}
 		if (xstats_ctrl->stats_n != stats_n)
 			priv_xstats_init(priv);
 		ret = priv_xstats_get(priv, stats);
@@ -468,10 +464,11 @@ mlx5_xstats_reset(struct rte_eth_dev *dev)
 	priv_lock(priv);
 	stats_n = priv_ethtool_get_stats_n(priv);
 	if (stats_n < 0)
-		return;
+		goto unlock;
 	if (xstats_ctrl->stats_n != stats_n)
 		priv_xstats_init(priv);
 	priv_xstats_reset(priv);
+unlock:
 	priv_unlock(priv);
 }
 
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 595a9e06..5de2d026 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -30,23 +30,90 @@
  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+#include <unistd.h>
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_ether.h>
 #include <rte_ethdev.h>
 #include <rte_interrupts.h>
 #include <rte_alarm.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
 #include "mlx5_utils.h"
 
+static void
+priv_txq_stop(struct priv *priv)
+{
+	unsigned int i;
+
+	for (i = 0; i != priv->txqs_n; ++i)
+		mlx5_priv_txq_release(priv, i);
+}
+
+static int
+priv_txq_start(struct priv *priv)
+{
+	unsigned int i;
+	int ret = 0;
+
+	/* Add memory regions to Tx queues. */
+	for (i = 0; i != priv->txqs_n; ++i) {
+		unsigned int idx = 0;
+		struct mlx5_mr *mr;
+		struct mlx5_txq_ctrl *txq_ctrl = mlx5_priv_txq_get(priv, i);
+
+		if (!txq_ctrl)
+			continue;
+		LIST_FOREACH(mr, &priv->mr, next)
+			priv_txq_mp2mr_reg(priv, &txq_ctrl->txq, mr->mp, idx++);
+		txq_alloc_elts(txq_ctrl);
+		txq_ctrl->ibv = mlx5_priv_txq_ibv_new(priv, i);
+		if (!txq_ctrl->ibv) {
+			ret = ENOMEM;
+			goto error;
+		}
+	}
+	return -ret;
+error:
+	priv_txq_stop(priv);
+	return -ret;
+}
+
+static void
+priv_rxq_stop(struct priv *priv)
+{
+	unsigned int i;
+
+	for (i = 0; i != priv->rxqs_n; ++i)
+		mlx5_priv_rxq_release(priv, i);
+}
+
+static int
+priv_rxq_start(struct priv *priv)
+{
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i != priv->rxqs_n; ++i) {
+		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_priv_rxq_get(priv, i);
+
+		if (!rxq_ctrl)
+			continue;
+		ret = rxq_alloc_elts(rxq_ctrl);
+		if (ret)
+			goto error;
+		rxq_ctrl->ibv = mlx5_priv_rxq_ibv_new(priv, i);
+		if (!rxq_ctrl->ibv) {
+			ret = ENOMEM;
+			goto error;
+		}
+	}
+	return -ret;
+error:
+	priv_rxq_stop(priv);
+	return -ret;
+}
+
 /**
  * DPDK callback to start the device.
  *
@@ -62,36 +129,47 @@ int
 mlx5_dev_start(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
+	struct mlx5_mr *mr = NULL;
 	int err;
 
 	if (mlx5_is_secondary())
 		return -E_RTE_SECONDARY;
 
+	dev->data->dev_started = 1;
 	priv_lock(priv);
-	if (priv->started) {
-		priv_unlock(priv);
-		return 0;
+	err = priv_flow_create_drop_queue(priv);
+	if (err) {
+		ERROR("%p: Drop queue allocation failed: %s",
+		      (void *)dev, strerror(err));
+		goto error;
 	}
-	/* Update Rx/Tx callback. */
-	priv_select_tx_function(priv);
-	priv_select_rx_function(priv);
 	DEBUG("%p: allocating and configuring hash RX queues", (void *)dev);
-	err = priv_create_hash_rxqs(priv);
-	if (!err)
-		err = priv_rehash_flows(priv);
-	if (!err)
-		priv->started = 1;
-	else {
-		ERROR("%p: an error occurred while configuring hash RX queues:"
+	rte_mempool_walk(mlx5_mp2mr_iter, priv);
+	err = priv_txq_start(priv);
+	if (err) {
+		ERROR("%p: TXQ allocation failed: %s",
+		      (void *)dev, strerror(err));
+		goto error;
+	}
+	/* Update send callback. */
+	priv_dev_select_tx_function(priv, dev);
+	err = priv_rxq_start(priv);
+	if (err) {
+		ERROR("%p: RXQ allocation failed: %s",
+		      (void *)dev, strerror(err));
+		goto error;
+	}
+	/* Update receive callback. */
+	priv_dev_select_rx_function(priv, dev);
+	err = priv_dev_traffic_enable(priv, dev);
+	if (err) {
+		ERROR("%p: an error occurred while configuring control flows:"
 		      " %s",
 		      (void *)priv, strerror(err));
 		goto error;
 	}
-	if (dev->data->dev_conf.fdir_conf.mode != RTE_FDIR_MODE_NONE)
-		priv_fdir_enable(priv);
-	err = priv_flow_start(priv);
+	err = priv_flow_start(priv, &priv->flows);
 	if (err) {
-		priv->started = 0;
 		ERROR("%p: an error occurred while configuring flows:"
 		      " %s",
 		      (void *)priv, strerror(err));
@@ -109,10 +187,14 @@ mlx5_dev_start(struct rte_eth_dev *dev)
 	return 0;
 error:
 	/* Rollback. */
-	priv_special_flow_disable_all(priv);
-	priv_mac_addrs_disable(priv);
-	priv_destroy_hash_rxqs(priv);
-	priv_flow_stop(priv);
+	dev->data->dev_started = 0;
+	for (mr = LIST_FIRST(&priv->mr); mr; mr = LIST_FIRST(&priv->mr))
+		priv_mr_release(priv, mr);
+	priv_flow_stop(priv, &priv->flows);
+	priv_dev_traffic_disable(priv, dev);
+	priv_txq_stop(priv);
+	priv_rxq_stop(priv);
+	priv_flow_delete_drop_queue(priv);
 	priv_unlock(priv);
 	return -err;
 }
@@ -129,23 +211,215 @@ void
 mlx5_dev_stop(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
+	struct mlx5_mr *mr;
 
 	if (mlx5_is_secondary())
 		return;
 
 	priv_lock(priv);
-	if (!priv->started) {
-		priv_unlock(priv);
-		return;
-	}
+	dev->data->dev_started = 0;
+	/* Prevent crashes when queues are still in use. */
+	dev->rx_pkt_burst = removed_rx_burst;
+	dev->tx_pkt_burst = removed_tx_burst;
+	rte_wmb();
+	usleep(1000 * priv->rxqs_n);
 	DEBUG("%p: cleaning up and destroying hash RX queues", (void *)dev);
-	priv_special_flow_disable_all(priv);
-	priv_mac_addrs_disable(priv);
-	priv_destroy_hash_rxqs(priv);
-	priv_fdir_disable(priv);
-	priv_flow_stop(priv);
+	priv_flow_stop(priv, &priv->flows);
+	priv_dev_traffic_disable(priv, dev);
 	priv_rx_intr_vec_disable(priv);
 	priv_dev_interrupt_handler_uninstall(priv, dev);
-	priv->started = 0;
+	priv_txq_stop(priv);
+	priv_rxq_stop(priv);
+	for (mr = LIST_FIRST(&priv->mr); mr; mr = LIST_FIRST(&priv->mr))
+		priv_mr_release(priv, mr);
+	priv_flow_delete_drop_queue(priv);
+	priv_unlock(priv);
+}
+
+/**
+ * Enable traffic flows configured by control plane
+ *
+ * @param priv
+ *   Pointer to Ethernet device private data.
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success.
+ */
+int
+priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev)
+{
+	struct rte_flow_item_eth bcast = {
+		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+	};
+	struct rte_flow_item_eth ipv6_multi_spec = {
+		.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
+	};
+	struct rte_flow_item_eth ipv6_multi_mask = {
+		.dst.addr_bytes = "\xff\xff\x00\x00\x00\x00",
+	};
+	struct rte_flow_item_eth unicast = {
+		.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+	};
+	struct rte_flow_item_eth unicast_mask = {
+		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+	};
+	const unsigned int vlan_filter_n = priv->vlan_filter_n;
+	const struct ether_addr cmp = {
+		.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+	};
+	unsigned int i;
+	unsigned int j;
+	int ret;
+
+	if (priv->isolated)
+		return 0;
+	if (dev->data->promiscuous) {
+		struct rte_flow_item_eth promisc = {
+			.dst.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+			.type = 0,
+		};
+
+		claim_zero(mlx5_ctrl_flow(dev, &promisc, &promisc));
+		return 0;
+	}
+	if (dev->data->all_multicast) {
+		struct rte_flow_item_eth multicast = {
+			.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
+			.src.addr_bytes = "\x00\x00\x00\x00\x00\x00",
+			.type = 0,
+		};
+
+		claim_zero(mlx5_ctrl_flow(dev, &multicast, &multicast));
+	} else {
+		/* Add broadcast/multicast flows. */
+		for (i = 0; i != vlan_filter_n; ++i) {
+			uint16_t vlan = priv->vlan_filter[i];
+
+			struct rte_flow_item_vlan vlan_spec = {
+				.tci = rte_cpu_to_be_16(vlan),
+			};
+			struct rte_flow_item_vlan vlan_mask = {
+				.tci = 0xffff,
+			};
+
+			ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast,
+						  &vlan_spec, &vlan_mask);
+			if (ret)
+				goto error;
+			ret = mlx5_ctrl_flow_vlan(dev, &ipv6_multi_spec,
+						  &ipv6_multi_mask,
+						  &vlan_spec, &vlan_mask);
+			if (ret)
+				goto error;
+		}
+		if (!vlan_filter_n) {
+			ret = mlx5_ctrl_flow(dev, &bcast, &bcast);
+			if (ret)
+				goto error;
+			ret = mlx5_ctrl_flow(dev, &ipv6_multi_spec,
+					     &ipv6_multi_mask);
+			if (ret)
+				goto error;
+		}
+	}
+	/* Add MAC address flows. */
+	for (i = 0; i != MLX5_MAX_MAC_ADDRESSES; ++i) {
+		struct ether_addr *mac = &dev->data->mac_addrs[i];
+
+		if (!memcmp(mac, &cmp, sizeof(*mac)))
+			continue;
+		memcpy(&unicast.dst.addr_bytes,
+		       mac->addr_bytes,
+		       ETHER_ADDR_LEN);
+		for (j = 0; j != vlan_filter_n; ++j) {
+			uint16_t vlan = priv->vlan_filter[j];
+
+			struct rte_flow_item_vlan vlan_spec = {
+				.tci = rte_cpu_to_be_16(vlan),
+			};
+			struct rte_flow_item_vlan vlan_mask = {
+				.tci = 0xffff,
+			};
+
+			ret = mlx5_ctrl_flow_vlan(dev, &unicast,
+						  &unicast_mask,
+						  &vlan_spec,
+						  &vlan_mask);
+			if (ret)
+				goto error;
+		}
+		if (!vlan_filter_n) {
+			ret = mlx5_ctrl_flow(dev, &unicast,
+					     &unicast_mask);
+			if (ret)
+				goto error;
+		}
+	}
+	return 0;
+error:
+	return rte_errno;
+}
+
+
+/**
+ * Disable traffic flows configured by control plane
+ *
+ * @param priv
+ *   Pointer to Ethernet device private data.
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success.
+ */
+int
+priv_dev_traffic_disable(struct priv *priv, struct rte_eth_dev *dev)
+{
+	(void)dev;
+	priv_flow_flush(priv, &priv->ctrl_flows);
+	return 0;
+}
+
+/**
+ * Restart traffic flows configured by control plane
+ *
+ * @param priv
+ *   Pointer to Ethernet device private data.
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success.
+ */
+int
+priv_dev_traffic_restart(struct priv *priv, struct rte_eth_dev *dev)
+{
+	if (dev->data->dev_started) {
+		priv_dev_traffic_disable(priv, dev);
+		priv_dev_traffic_enable(priv, dev);
+	}
+	return 0;
+}
+
+/**
+ * Restart traffic flows configured by control plane
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success.
+ */
+int
+mlx5_traffic_restart(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+
+	priv_lock(priv);
+	priv_dev_traffic_restart(priv, dev);
 	priv_unlock(priv);
+	return 0;
 }
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 98aaa7ca..9c5860ff 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -36,6 +36,8 @@
 #include <errno.h>
 #include <string.h>
 #include <stdint.h>
+#include <unistd.h>
+#include <sys/mman.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -47,17 +49,10 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_mbuf.h>
 #include <rte_malloc.h>
 #include <rte_ethdev.h>
 #include <rte_common.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5_utils.h"
 #include "mlx5_defs.h"
@@ -70,23 +65,15 @@
  *
  * @param txq_ctrl
  *   Pointer to TX queue structure.
- * @param elts_n
- *   Number of elements to allocate.
  */
-static void
-txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
+void
+txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl)
 {
+	const unsigned int elts_n = 1 << txq_ctrl->txq.elts_n;
 	unsigned int i;
 
 	for (i = 0; (i != elts_n); ++i)
 		(*txq_ctrl->txq.elts)[i] = NULL;
-	for (i = 0; (i != (1u << txq_ctrl->txq.wqe_n)); ++i) {
-		volatile struct mlx5_wqe64 *wqe =
-			(volatile struct mlx5_wqe64 *)
-			txq_ctrl->txq.wqes + i;
-
-		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
-	}
 	DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
 	txq_ctrl->txq.elts_head = 0;
 	txq_ctrl->txq.elts_tail = 0;
@@ -100,7 +87,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
  *   Pointer to TX queue structure.
  */
 static void
-txq_free_elts(struct txq_ctrl *txq_ctrl)
+txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl)
 {
 	const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n;
 	const uint16_t elts_m = elts_n - 1;
@@ -129,155 +116,231 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
 }
 
 /**
- * Clean up a TX queue.
+ * DPDK callback to configure a TX queue.
  *
- * Destroy objects, free allocated memory and reset the structure for reuse.
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   TX queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue.
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Thresholds parameters.
  *
- * @param txq_ctrl
- *   Pointer to TX queue structure.
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+		    unsigned int socket, const struct rte_eth_txconf *conf)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
+	struct mlx5_txq_ctrl *txq_ctrl =
+		container_of(txq, struct mlx5_txq_ctrl, txq);
+	int ret = 0;
+
+	if (mlx5_is_secondary())
+		return -E_RTE_SECONDARY;
+
+	priv_lock(priv);
+	if (desc <= MLX5_TX_COMP_THRESH) {
+		WARN("%p: number of descriptors requested for TX queue %u"
+		     " must be higher than MLX5_TX_COMP_THRESH, using"
+		     " %u instead of %u",
+		     (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc);
+		desc = MLX5_TX_COMP_THRESH + 1;
+	}
+	if (!rte_is_power_of_2(desc)) {
+		desc = 1 << log2above(desc);
+		WARN("%p: increased number of descriptors in TX queue %u"
+		     " to the next power of two (%d)",
+		     (void *)dev, idx, desc);
+	}
+	DEBUG("%p: configuring queue %u for %u descriptors",
+	      (void *)dev, idx, desc);
+	if (idx >= priv->txqs_n) {
+		ERROR("%p: queue index out of range (%u >= %u)",
+		      (void *)dev, idx, priv->txqs_n);
+		priv_unlock(priv);
+		return -EOVERFLOW;
+	}
+	if (!mlx5_priv_txq_releasable(priv, idx)) {
+		ret = EBUSY;
+		ERROR("%p: unable to release queue index %u",
+		      (void *)dev, idx);
+		goto out;
+	}
+	mlx5_priv_txq_release(priv, idx);
+	txq_ctrl = mlx5_priv_txq_new(priv, idx, desc, socket, conf);
+	if (!txq_ctrl) {
+		ERROR("%p: unable to allocate queue index %u",
+		      (void *)dev, idx);
+		ret = ENOMEM;
+		goto out;
+	}
+	DEBUG("%p: adding TX queue %p to list",
+	      (void *)dev, (void *)txq_ctrl);
+	(*priv->txqs)[idx] = &txq_ctrl->txq;
+out:
+	priv_unlock(priv);
+	return -ret;
+}
+
+/**
+ * DPDK callback to release a TX queue.
+ *
+ * @param dpdk_txq
+ *   Generic TX queue pointer.
  */
 void
-txq_cleanup(struct txq_ctrl *txq_ctrl)
+mlx5_tx_queue_release(void *dpdk_txq)
 {
-	size_t i;
-
-	DEBUG("cleaning up %p", (void *)txq_ctrl);
-	txq_free_elts(txq_ctrl);
-	if (txq_ctrl->qp != NULL)
-		claim_zero(ibv_destroy_qp(txq_ctrl->qp));
-	if (txq_ctrl->cq != NULL)
-		claim_zero(ibv_destroy_cq(txq_ctrl->cq));
-	for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
-		if (txq_ctrl->txq.mp2mr[i].mr == NULL)
+	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
+	struct mlx5_txq_ctrl *txq_ctrl;
+	struct priv *priv;
+	unsigned int i;
+
+	if (mlx5_is_secondary())
+		return;
+
+	if (txq == NULL)
+		return;
+	txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
+	priv = txq_ctrl->priv;
+	priv_lock(priv);
+	for (i = 0; (i != priv->txqs_n); ++i)
+		if ((*priv->txqs)[i] == txq) {
+			DEBUG("%p: removing TX queue %p from list",
+			      (void *)priv->dev, (void *)txq_ctrl);
+			mlx5_priv_txq_release(priv, i);
 			break;
-		claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[i].mr));
-	}
-	memset(txq_ctrl, 0, sizeof(*txq_ctrl));
+		}
+	priv_unlock(priv);
 }
 
+
 /**
- * Initialize TX queue.
+ * Map locally UAR used in Tx queues for BlueFlame doorbell.
  *
- * @param tmpl
- *   Pointer to TX queue control template.
- * @param txq_ctrl
- *   Pointer to TX queue control.
+ * @param[in] priv
+ *   Pointer to private structure.
+ * @param fd
+ *   Verbs file descriptor to map UAR pages.
  *
  * @return
  *   0 on success, errno value on failure.
  */
-static inline int
-txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
+int
+priv_tx_uar_remap(struct priv *priv, int fd)
 {
-	struct mlx5_qp *qp = to_mqp(tmpl->qp);
-	struct ibv_cq *ibcq = tmpl->cq;
-	struct ibv_mlx5_cq_info cq_info;
+	unsigned int i, j;
+	uintptr_t pages[priv->txqs_n];
+	unsigned int pages_n = 0;
+	uintptr_t uar_va;
+	void *addr;
+	struct mlx5_txq_data *txq;
+	struct mlx5_txq_ctrl *txq_ctrl;
+	int already_mapped;
+	size_t page_size = sysconf(_SC_PAGESIZE);
 
-	if (ibv_mlx5_exp_get_cq_info(ibcq, &cq_info)) {
-		ERROR("Unable to query CQ info. check your OFED.");
-		return ENOTSUP;
-	}
-	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
-		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
-		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
-		return EINVAL;
+	memset(pages, 0, priv->txqs_n * sizeof(uintptr_t));
+	/*
+	 * As rdma-core, UARs are mapped in size of OS page size.
+	 * Use aligned address to avoid duplicate mmap.
+	 * Ref to libmlx5 function: mlx5_init_context()
+	 */
+	for (i = 0; i != priv->txqs_n; ++i) {
+		txq = (*priv->txqs)[i];
+		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
+		uar_va = (uintptr_t)txq_ctrl->txq.bf_reg;
+		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size);
+		already_mapped = 0;
+		for (j = 0; j != pages_n; ++j) {
+			if (pages[j] == uar_va) {
+				already_mapped = 1;
+				break;
+			}
+		}
+		if (already_mapped)
+			continue;
+		pages[pages_n++] = uar_va;
+		addr = mmap((void *)uar_va, page_size,
+			    PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
+			    txq_ctrl->uar_mmap_offset);
+		if (addr != (void *)uar_va) {
+			ERROR("call to mmap failed on UAR for txq %d\n", i);
+			return -1;
+		}
 	}
-	tmpl->txq.cqe_n = log2above(cq_info.cqe_cnt);
-	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
-	tmpl->txq.wqes = qp->gen_data.sqstart;
-	tmpl->txq.wqe_n = log2above(qp->sq.wqe_cnt);
-	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
-	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
-	tmpl->txq.cq_db = cq_info.dbrec;
-	tmpl->txq.cqes =
-		(volatile struct mlx5_cqe (*)[])
-		(uintptr_t)cq_info.buf;
-	tmpl->txq.elts =
-		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])
-		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
 	return 0;
 }
 
 /**
- * Configure a TX queue.
+ * Create the Tx queue Verbs object.
  *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param txq_ctrl
- *   Pointer to TX queue structure.
- * @param desc
- *   Number of descriptors to configure in queue.
- * @param socket
- *   NUMA socket on which memory must be allocated.
- * @param[in] conf
- *   Thresholds parameters.
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   Queue index in DPDK Rx queue array
  *
  * @return
- *   0 on success, errno value on failure.
+ *   The Verbs object initialised if it can be created.
  */
-int
-txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
-	       uint16_t desc, unsigned int socket,
-	       const struct rte_eth_txconf *conf)
+struct mlx5_txq_ibv*
+mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx)
 {
-	struct priv *priv = mlx5_get_priv(dev);
-	struct txq_ctrl tmpl = {
-		.priv = priv,
-		.socket = socket,
-	};
+	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+	struct mlx5_txq_ctrl *txq_ctrl =
+		container_of(txq_data, struct mlx5_txq_ctrl, txq);
+	struct mlx5_txq_ibv tmpl;
+	struct mlx5_txq_ibv *txq_ibv;
 	union {
-		struct ibv_exp_qp_init_attr init;
-		struct ibv_exp_cq_init_attr cq;
-		struct ibv_exp_qp_attr mod;
-		struct ibv_exp_cq_attr cq_attr;
+		struct ibv_qp_init_attr_ex init;
+		struct ibv_cq_init_attr_ex cq;
+		struct ibv_qp_attr mod;
+		struct ibv_cq_ex cq_attr;
 	} attr;
 	unsigned int cqe_n;
-	const unsigned int max_tso_inline = ((MLX5_MAX_TSO_HEADER +
-					     (RTE_CACHE_LINE_SIZE - 1)) /
-					      RTE_CACHE_LINE_SIZE);
+	struct mlx5dv_qp qp = { .comp_mask = MLX5DV_QP_MASK_UAR_MMAP_OFFSET };
+	struct mlx5dv_cq cq_info;
+	struct mlx5dv_obj obj;
+	const int desc = 1 << txq_data->elts_n;
 	int ret = 0;
 
+	assert(txq_data);
 	if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
-		ret = ENOTSUP;
 		ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
 		goto error;
 	}
-	tmpl.txq.flags = conf->txq_flags;
-	assert(desc > MLX5_TX_COMP_THRESH);
-	tmpl.txq.elts_n = log2above(desc);
-	if (priv->mps == MLX5_MPW_ENHANCED)
-		tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
+	memset(&tmpl, 0, sizeof(struct mlx5_txq_ibv));
 	/* MRs will be registered in mp2mr[] later. */
-	attr.cq = (struct ibv_exp_cq_init_attr){
+	attr.cq = (struct ibv_cq_init_attr_ex){
 		.comp_mask = 0,
 	};
 	cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ?
 		((desc / MLX5_TX_COMP_THRESH) - 1) : 1;
 	if (priv->mps == MLX5_MPW_ENHANCED)
 		cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
-	tmpl.cq = ibv_exp_create_cq(priv->ctx,
-				    cqe_n,
-				    NULL, NULL, 0, &attr.cq);
+	tmpl.cq = ibv_create_cq(priv->ctx, cqe_n, NULL, NULL, 0);
 	if (tmpl.cq == NULL) {
-		ret = ENOMEM;
-		ERROR("%p: CQ creation failure: %s",
-		      (void *)dev, strerror(ret));
+		ERROR("%p: CQ creation failure", (void *)txq_ctrl);
 		goto error;
 	}
-	DEBUG("priv->device_attr.max_qp_wr is %d",
-	      priv->device_attr.max_qp_wr);
-	DEBUG("priv->device_attr.max_sge is %d",
-	      priv->device_attr.max_sge);
-	attr.init = (struct ibv_exp_qp_init_attr){
+	attr.init = (struct ibv_qp_init_attr_ex){
 		/* CQ to be associated with the send queue. */
 		.send_cq = tmpl.cq,
 		/* CQ to be associated with the receive queue. */
 		.recv_cq = tmpl.cq,
 		.cap = {
 			/* Max number of outstanding WRs. */
-			.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
-					priv->device_attr.max_qp_wr :
-					desc),
+			.max_send_wr =
+				((priv->device_attr.orig_attr.max_qp_wr <
+				  desc) ?
+				 priv->device_attr.orig_attr.max_qp_wr :
+				 desc),
 			/*
 			 * Max number of scatter/gather elements in a WR,
 			 * must be 1 to prevent libmlx5 from trying to affect
@@ -288,124 +351,204 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 			.max_send_sge = 1,
 		},
 		.qp_type = IBV_QPT_RAW_PACKET,
-		/* Do *NOT* enable this, completions events are managed per
-		 * TX burst. */
+		/*
+		 * Do *NOT* enable this, completions events are managed per
+		 * Tx burst.
+		 */
 		.sq_sig_all = 0,
 		.pd = priv->pd,
-		.comp_mask = IBV_EXP_QP_INIT_ATTR_PD,
+		.comp_mask = IBV_QP_INIT_ATTR_PD,
 	};
-	if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
-		tmpl.txq.max_inline =
-			((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
-			 RTE_CACHE_LINE_SIZE);
-		tmpl.txq.inline_en = 1;
-		/* TSO and MPS can't be enabled concurrently. */
-		assert(!priv->tso || !priv->mps);
-		if (priv->mps == MLX5_MPW_ENHANCED) {
-			tmpl.txq.inline_max_packet_sz =
-				priv->inline_max_packet_sz;
-			/* To minimize the size of data set, avoid requesting
-			 * too large WQ.
-			 */
-			attr.init.cap.max_inline_data =
-				((RTE_MIN(priv->txq_inline,
-					  priv->inline_max_packet_sz) +
-				  (RTE_CACHE_LINE_SIZE - 1)) /
-				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
-		} else if (priv->tso) {
-			int inline_diff = tmpl.txq.max_inline - max_tso_inline;
-
-			/*
-			 * Adjust inline value as Verbs aggregates
-			 * tso_inline and txq_inline fields.
-			 */
-			attr.init.cap.max_inline_data = inline_diff > 0 ?
-							inline_diff *
-							RTE_CACHE_LINE_SIZE :
-							0;
-		} else {
-			attr.init.cap.max_inline_data =
-				tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
-		}
+	if (txq_data->inline_en)
+		attr.init.cap.max_inline_data = txq_ctrl->max_inline_data;
+	if (txq_data->tso_en) {
+		attr.init.max_tso_header = txq_ctrl->max_tso_header;
+		attr.init.comp_mask |= IBV_QP_INIT_ATTR_MAX_TSO_HEADER;
 	}
-	if (priv->tso) {
-		attr.init.max_tso_header =
-			max_tso_inline * RTE_CACHE_LINE_SIZE;
-		attr.init.comp_mask |= IBV_EXP_QP_INIT_ATTR_MAX_TSO_HEADER;
-		tmpl.txq.max_inline = RTE_MAX(tmpl.txq.max_inline,
-					      max_tso_inline);
-		tmpl.txq.tso_en = 1;
-	}
-	if (priv->tunnel_en)
-		tmpl.txq.tunnel_en = 1;
-	tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+	tmpl.qp = ibv_create_qp_ex(priv->ctx, &attr.init);
 	if (tmpl.qp == NULL) {
-		ret = (errno ? errno : EINVAL);
-		ERROR("%p: QP creation failure: %s",
-		      (void *)dev, strerror(ret));
+		ERROR("%p: QP creation failure", (void *)txq_ctrl);
 		goto error;
 	}
-	DEBUG("TX queue capabilities: max_send_wr=%u, max_send_sge=%u,"
-	      " max_inline_data=%u",
-	      attr.init.cap.max_send_wr,
-	      attr.init.cap.max_send_sge,
-	      attr.init.cap.max_inline_data);
-	attr.mod = (struct ibv_exp_qp_attr){
+	attr.mod = (struct ibv_qp_attr){
 		/* Move the QP to this state. */
 		.qp_state = IBV_QPS_INIT,
 		/* Primary port number. */
 		.port_num = priv->port
 	};
-	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
-				(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
+	ret = ibv_modify_qp(tmpl.qp, &attr.mod, (IBV_QP_STATE | IBV_QP_PORT));
 	if (ret) {
-		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
-		      (void *)dev, strerror(ret));
+		ERROR("%p: QP state to IBV_QPS_INIT failed", (void *)txq_ctrl);
 		goto error;
 	}
-	ret = txq_setup(&tmpl, txq_ctrl);
-	if (ret) {
-		ERROR("%p: cannot initialize TX queue structure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	txq_alloc_elts(&tmpl, desc);
-	attr.mod = (struct ibv_exp_qp_attr){
+	attr.mod = (struct ibv_qp_attr){
 		.qp_state = IBV_QPS_RTR
 	};
-	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+	ret = ibv_modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE);
 	if (ret) {
-		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
-		      (void *)dev, strerror(ret));
+		ERROR("%p: QP state to IBV_QPS_RTR failed", (void *)txq_ctrl);
 		goto error;
 	}
 	attr.mod.qp_state = IBV_QPS_RTS;
-	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+	ret = ibv_modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE);
 	if (ret) {
-		ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
-		      (void *)dev, strerror(ret));
+		ERROR("%p: QP state to IBV_QPS_RTS failed", (void *)txq_ctrl);
 		goto error;
 	}
-	/* Clean up txq in case we're reinitializing it. */
-	DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
-	txq_cleanup(txq_ctrl);
-	*txq_ctrl = tmpl;
-	DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
-	/* Pre-register known mempools. */
-	rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
-	assert(ret == 0);
-	return 0;
+	txq_ibv = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_txq_ibv), 0,
+				    txq_ctrl->socket);
+	if (!txq_ibv) {
+		ERROR("%p: cannot allocate memory", (void *)txq_ctrl);
+		goto error;
+	}
+	obj.cq.in = tmpl.cq;
+	obj.cq.out = &cq_info;
+	obj.qp.in = tmpl.qp;
+	obj.qp.out = &qp;
+	ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP);
+	if (ret != 0)
+		goto error;
+	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
+		goto error;
+	}
+	txq_data->cqe_n = log2above(cq_info.cqe_cnt);
+	txq_data->qp_num_8s = tmpl.qp->qp_num << 8;
+	txq_data->wqes = qp.sq.buf;
+	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
+	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
+	txq_data->bf_reg = qp.bf.reg;
+	txq_data->cq_db = cq_info.dbrec;
+	txq_data->cqes =
+		(volatile struct mlx5_cqe (*)[])
+		(uintptr_t)cq_info.buf;
+	txq_data->cq_ci = 0;
+	txq_data->cq_pi = 0;
+	txq_data->wqe_ci = 0;
+	txq_data->wqe_pi = 0;
+	txq_ibv->qp = tmpl.qp;
+	txq_ibv->cq = tmpl.cq;
+	rte_atomic32_inc(&txq_ibv->refcnt);
+	if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
+		txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
+	} else {
+		ERROR("Failed to retrieve UAR info, invalid libmlx5.so version");
+		goto error;
+	}
+	DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv,
+	      (void *)txq_ibv, rte_atomic32_read(&txq_ibv->refcnt));
+	LIST_INSERT_HEAD(&priv->txqsibv, txq_ibv, next);
+	return txq_ibv;
 error:
-	txq_cleanup(&tmpl);
-	assert(ret > 0);
+	if (tmpl.cq)
+		claim_zero(ibv_destroy_cq(tmpl.cq));
+	if (tmpl.qp)
+		claim_zero(ibv_destroy_qp(tmpl.qp));
+	return NULL;
+}
+
+/**
+ * Get an Tx queue Verbs object.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   Queue index in DPDK Rx queue array
+ *
+ * @return
+ *   The Verbs object if it exists.
+ */
+struct mlx5_txq_ibv*
+mlx5_priv_txq_ibv_get(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_txq_ctrl *txq_ctrl;
+
+	if (idx >= priv->txqs_n)
+		return NULL;
+	if (!(*priv->txqs)[idx])
+		return NULL;
+	txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
+	if (txq_ctrl->ibv) {
+		rte_atomic32_inc(&txq_ctrl->ibv->refcnt);
+		DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv,
+		      (void *)txq_ctrl->ibv,
+		      rte_atomic32_read(&txq_ctrl->ibv->refcnt));
+	}
+	return txq_ctrl->ibv;
+}
+
+/**
+ * Release an Tx verbs queue object.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param txq_ibv
+ *   Verbs Tx queue object.
+ *
+ * @return
+ *   0 on success, errno on failure.
+ */
+int
+mlx5_priv_txq_ibv_release(struct priv *priv, struct mlx5_txq_ibv *txq_ibv)
+{
+	(void)priv;
+	assert(txq_ibv);
+	DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv,
+	      (void *)txq_ibv, rte_atomic32_read(&txq_ibv->refcnt));
+	if (rte_atomic32_dec_and_test(&txq_ibv->refcnt)) {
+		claim_zero(ibv_destroy_qp(txq_ibv->qp));
+		claim_zero(ibv_destroy_cq(txq_ibv->cq));
+		LIST_REMOVE(txq_ibv, next);
+		rte_free(txq_ibv);
+		return 0;
+	}
+	return EBUSY;
+}
+
+/**
+ * Return true if a single reference exists on the object.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param txq_ibv
+ *   Verbs Tx queue object.
+ */
+int
+mlx5_priv_txq_ibv_releasable(struct priv *priv, struct mlx5_txq_ibv *txq_ibv)
+{
+	(void)priv;
+	assert(txq_ibv);
+	return (rte_atomic32_read(&txq_ibv->refcnt) == 1);
+}
+
+/**
+ * Verify the Verbs Tx queue list is empty
+ *
+ * @param priv
+ *  Pointer to private structure.
+ *
+ * @return the number of object not released.
+ */
+int
+mlx5_priv_txq_ibv_verify(struct priv *priv)
+{
+	int ret = 0;
+	struct mlx5_txq_ibv *txq_ibv;
+
+	LIST_FOREACH(txq_ibv, &priv->txqsibv, next) {
+		DEBUG("%p: Verbs Tx queue %p still referenced", (void *)priv,
+		      (void *)txq_ibv);
+		++ret;
+	}
 	return ret;
 }
 
 /**
- * DPDK callback to configure a TX queue.
+ * Create a DPDK Tx queue.
  *
- * @param dev
- *   Pointer to Ethernet device structure.
+ * @param priv
+ *   Pointer to private structure.
  * @param idx
  *   TX queue index.
  * @param desc
@@ -413,164 +556,236 @@ error:
  * @param socket
  *   NUMA socket on which memory must be allocated.
  * @param[in] conf
- *   Thresholds parameters.
+ *  Thresholds parameters.
  *
  * @return
- *   0 on success, negative errno value on failure.
+ *   A DPDK queue object on success.
  */
-int
-mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
-		    unsigned int socket, const struct rte_eth_txconf *conf)
+struct mlx5_txq_ctrl*
+mlx5_priv_txq_new(struct priv *priv, uint16_t idx, uint16_t desc,
+		  unsigned int socket,
+		  const struct rte_eth_txconf *conf)
 {
-	struct priv *priv = dev->data->dev_private;
-	struct txq *txq = (*priv->txqs)[idx];
-	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
-	int ret;
+	const unsigned int max_tso_inline =
+		((MLX5_MAX_TSO_HEADER + (RTE_CACHE_LINE_SIZE - 1)) /
+		 RTE_CACHE_LINE_SIZE);
+	struct mlx5_txq_ctrl *tmpl;
 
-	if (mlx5_is_secondary())
-		return -E_RTE_SECONDARY;
+	tmpl = rte_calloc_socket("TXQ", 1,
+				 sizeof(*tmpl) +
+				 desc * sizeof(struct rte_mbuf *),
+				 0, socket);
+	if (!tmpl)
+		return NULL;
+	assert(desc > MLX5_TX_COMP_THRESH);
+	tmpl->txq.flags = conf->txq_flags;
+	tmpl->priv = priv;
+	tmpl->socket = socket;
+	tmpl->txq.elts_n = log2above(desc);
+	if (priv->mps == MLX5_MPW_ENHANCED)
+		tmpl->txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
+	/* MRs will be registered in mp2mr[] later. */
+	DEBUG("priv->device_attr.max_qp_wr is %d",
+	      priv->device_attr.orig_attr.max_qp_wr);
+	DEBUG("priv->device_attr.max_sge is %d",
+	      priv->device_attr.orig_attr.max_sge);
+	if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+		unsigned int ds_cnt;
 
-	priv_lock(priv);
-	if (desc <= MLX5_TX_COMP_THRESH) {
-		WARN("%p: number of descriptors requested for TX queue %u"
-		     " must be higher than MLX5_TX_COMP_THRESH, using"
-		     " %u instead of %u",
-		     (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc);
-		desc = MLX5_TX_COMP_THRESH + 1;
-	}
-	if (!rte_is_power_of_2(desc)) {
-		desc = 1 << log2above(desc);
-		WARN("%p: increased number of descriptors in TX queue %u"
-		     " to the next power of two (%d)",
-		     (void *)dev, idx, desc);
-	}
-	DEBUG("%p: configuring queue %u for %u descriptors",
-	      (void *)dev, idx, desc);
-	if (idx >= priv->txqs_n) {
-		ERROR("%p: queue index out of range (%u >= %u)",
-		      (void *)dev, idx, priv->txqs_n);
-		priv_unlock(priv);
-		return -EOVERFLOW;
-	}
-	if (txq != NULL) {
-		DEBUG("%p: reusing already allocated queue index %u (%p)",
-		      (void *)dev, idx, (void *)txq);
-		if (priv->started) {
-			priv_unlock(priv);
-			return -EEXIST;
-		}
-		(*priv->txqs)[idx] = NULL;
-		txq_cleanup(txq_ctrl);
-		/* Resize if txq size is changed. */
-		if (txq_ctrl->txq.elts_n != log2above(desc)) {
-			txq_ctrl = rte_realloc(txq_ctrl,
-					       sizeof(*txq_ctrl) +
-					       desc * sizeof(struct rte_mbuf *),
-					       RTE_CACHE_LINE_SIZE);
-			if (!txq_ctrl) {
-				ERROR("%p: unable to reallocate queue index %u",
-					(void *)dev, idx);
-				priv_unlock(priv);
-				return -ENOMEM;
-			}
+		tmpl->txq.max_inline =
+			((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
+			 RTE_CACHE_LINE_SIZE);
+		tmpl->txq.inline_en = 1;
+		/* TSO and MPS can't be enabled concurrently. */
+		assert(!priv->tso || !priv->mps);
+		if (priv->mps == MLX5_MPW_ENHANCED) {
+			tmpl->txq.inline_max_packet_sz =
+				priv->inline_max_packet_sz;
+			/* To minimize the size of data set, avoid requesting
+			 * too large WQ.
+			 */
+			tmpl->max_inline_data =
+				((RTE_MIN(priv->txq_inline,
+					  priv->inline_max_packet_sz) +
+				  (RTE_CACHE_LINE_SIZE - 1)) /
+				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
+		} else if (priv->tso) {
+			int inline_diff = tmpl->txq.max_inline - max_tso_inline;
+
+			/*
+			 * Adjust inline value as Verbs aggregates
+			 * tso_inline and txq_inline fields.
+			 */
+			tmpl->max_inline_data = inline_diff > 0 ?
+					       inline_diff *
+					       RTE_CACHE_LINE_SIZE :
+					       0;
+		} else {
+			tmpl->max_inline_data =
+				tmpl->txq.max_inline * RTE_CACHE_LINE_SIZE;
 		}
-	} else {
-		txq_ctrl =
-			rte_calloc_socket("TXQ", 1,
-					  sizeof(*txq_ctrl) +
-					  desc * sizeof(struct rte_mbuf *),
-					  0, socket);
-		if (txq_ctrl == NULL) {
-			ERROR("%p: unable to allocate queue index %u",
-			      (void *)dev, idx);
-			priv_unlock(priv);
-			return -ENOMEM;
+		/*
+		 * Check if the inline size is too large in a way which
+		 * can make the WQE DS to overflow.
+		 * Considering in calculation:
+		 *      WQE CTRL (1 DS)
+		 *      WQE ETH  (1 DS)
+		 *      Inline part (N DS)
+		 */
+		ds_cnt = 2 + (tmpl->txq.max_inline / MLX5_WQE_DWORD_SIZE);
+		if (ds_cnt > MLX5_DSEG_MAX) {
+			unsigned int max_inline = (MLX5_DSEG_MAX - 2) *
+						  MLX5_WQE_DWORD_SIZE;
+
+			max_inline = max_inline - (max_inline %
+						   RTE_CACHE_LINE_SIZE);
+			WARN("txq inline is too large (%d) setting it to "
+			     "the maximum possible: %d\n",
+			     priv->txq_inline, max_inline);
+			tmpl->txq.max_inline = max_inline / RTE_CACHE_LINE_SIZE;
 		}
 	}
-	ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf);
-	if (ret)
-		rte_free(txq_ctrl);
-	else {
-		txq_ctrl->txq.stats.idx = idx;
-		DEBUG("%p: adding TX queue %p to list",
-		      (void *)dev, (void *)txq_ctrl);
-		(*priv->txqs)[idx] = &txq_ctrl->txq;
+	if (priv->tso) {
+		tmpl->max_tso_header = max_tso_inline * RTE_CACHE_LINE_SIZE;
+		tmpl->txq.max_inline = RTE_MAX(tmpl->txq.max_inline,
+					       max_tso_inline);
+		tmpl->txq.tso_en = 1;
 	}
-	priv_unlock(priv);
-	return -ret;
+	if (priv->tunnel_en)
+		tmpl->txq.tunnel_en = 1;
+	tmpl->txq.elts =
+		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])(tmpl + 1);
+	tmpl->txq.stats.idx = idx;
+	rte_atomic32_inc(&tmpl->refcnt);
+	DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv,
+	      (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
+	LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next);
+	return tmpl;
 }
 
 /**
- * DPDK callback to release a TX queue.
+ * Get a Tx queue.
  *
- * @param dpdk_txq
- *   Generic TX queue pointer.
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   TX queue index.
+ *
+ * @return
+ *   A pointer to the queue if it exists.
  */
-void
-mlx5_tx_queue_release(void *dpdk_txq)
+struct mlx5_txq_ctrl*
+mlx5_priv_txq_get(struct priv *priv, uint16_t idx)
 {
-	struct txq *txq = (struct txq *)dpdk_txq;
-	struct txq_ctrl *txq_ctrl;
-	struct priv *priv;
-	unsigned int i;
+	struct mlx5_txq_ctrl *ctrl = NULL;
 
-	if (mlx5_is_secondary())
-		return;
+	if ((*priv->txqs)[idx]) {
+		ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl,
+				    txq);
+		unsigned int i;
 
-	if (txq == NULL)
-		return;
-	txq_ctrl = container_of(txq, struct txq_ctrl, txq);
-	priv = txq_ctrl->priv;
-	priv_lock(priv);
-	for (i = 0; (i != priv->txqs_n); ++i)
-		if ((*priv->txqs)[i] == txq) {
-			DEBUG("%p: removing TX queue %p from list",
-			      (void *)priv->dev, (void *)txq_ctrl);
-			(*priv->txqs)[i] = NULL;
-			break;
+		mlx5_priv_txq_ibv_get(priv, idx);
+		for (i = 0; i != MLX5_PMD_TX_MP_CACHE; ++i) {
+			struct mlx5_mr *mr = NULL;
+
+			(void)mr;
+			if (ctrl->txq.mp2mr[i]) {
+				mr = priv_mr_get(priv, ctrl->txq.mp2mr[i]->mp);
+				assert(mr);
+			}
 		}
-	txq_cleanup(txq_ctrl);
-	rte_free(txq_ctrl);
-	priv_unlock(priv);
+		rte_atomic32_inc(&ctrl->refcnt);
+		DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv,
+		      (void *)ctrl, rte_atomic32_read(&ctrl->refcnt));
+	}
+	return ctrl;
 }
 
 /**
- * DPDK callback for TX in secondary processes.
- *
- * This function configures all queues from primary process information
- * if necessary before reverting to the normal TX burst callback.
+ * Release a Tx queue.
  *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   TX queue index.
  *
  * @return
- *   Number of packets successfully transmitted (<= pkts_n).
+ *   0 on success, errno on failure.
  */
-uint16_t
-mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
-			      uint16_t pkts_n)
+int
+mlx5_priv_txq_release(struct priv *priv, uint16_t idx)
 {
-	struct txq *txq = dpdk_txq;
-	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
-	struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
-	struct priv *primary_priv;
-	unsigned int index;
+	unsigned int i;
+	struct mlx5_txq_ctrl *txq;
 
-	if (priv == NULL)
+	if (!(*priv->txqs)[idx])
 		return 0;
-	primary_priv =
-		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
-	/* Look for queue index in both private structures. */
-	for (index = 0; index != priv->txqs_n; ++index)
-		if (((*primary_priv->txqs)[index] == txq) ||
-		    ((*priv->txqs)[index] == txq))
-			break;
-	if (index == priv->txqs_n)
+	txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
+	DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv,
+	      (void *)txq, rte_atomic32_read(&txq->refcnt));
+	if (txq->ibv) {
+		int ret;
+
+		ret = mlx5_priv_txq_ibv_release(priv, txq->ibv);
+		if (!ret)
+			txq->ibv = NULL;
+	}
+	for (i = 0; i != MLX5_PMD_TX_MP_CACHE; ++i) {
+		if (txq->txq.mp2mr[i]) {
+			priv_mr_release(priv, txq->txq.mp2mr[i]);
+			txq->txq.mp2mr[i] = NULL;
+		}
+	}
+	if (rte_atomic32_dec_and_test(&txq->refcnt)) {
+		txq_free_elts(txq);
+		LIST_REMOVE(txq, next);
+		rte_free(txq);
+		(*priv->txqs)[idx] = NULL;
 		return 0;
-	txq = (*priv->txqs)[index];
-	return priv->dev->tx_pkt_burst(txq, pkts, pkts_n);
+	}
+	return EBUSY;
+}
+
+/**
+ * Verify if the queue can be released.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   TX queue index.
+ *
+ * @return
+ *   1 if the queue can be released.
+ */
+int
+mlx5_priv_txq_releasable(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_txq_ctrl *txq;
+
+	if (!(*priv->txqs)[idx])
+		return -1;
+	txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
+	return (rte_atomic32_read(&txq->refcnt) == 1);
+}
+
+/**
+ * Verify the Tx Queue list is empty
+ *
+ * @param priv
+ *  Pointer to private structure.
+ *
+ * @return the number of object not released.
+ */
+int
+mlx5_priv_txq_verify(struct priv *priv)
+{
+	struct mlx5_txq_ctrl *txq;
+	int ret = 0;
+
+	LIST_FOREACH(txq, &priv->txqsctrl, next) {
+		DEBUG("%p: Tx Queue %p still referenced", (void *)priv,
+		      (void *)txq);
+		++ret;
+	}
+	return ret;
 }
diff --git a/drivers/net/mlx5/mlx5_utils.h b/drivers/net/mlx5/mlx5_utils.h
index a824787f..218ae831 100644
--- a/drivers/net/mlx5/mlx5_utils.h
+++ b/drivers/net/mlx5/mlx5_utils.h
@@ -128,11 +128,13 @@ pmd_drv_log_basename(const char *s)
 
 #define DEBUG(...) PMD_DRV_LOG(DEBUG, __VA_ARGS__)
 #define claim_zero(...) assert((__VA_ARGS__) == 0)
+#define claim_nonzero(...) assert((__VA_ARGS__) != 0)
 
 #else /* NDEBUG */
 
 #define DEBUG(...) (void)0
 #define claim_zero(...) (__VA_ARGS__)
+#define claim_nonzero(...) (__VA_ARGS__)
 
 #endif /* NDEBUG */
 
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index 1b0fa40a..6fc315ef 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -36,22 +36,15 @@
 #include <assert.h>
 #include <stdint.h>
 
-/* DPDK headers don't like -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
 #include <rte_ethdev.h>
 #include <rte_common.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include "mlx5_utils.h"
 #include "mlx5.h"
 #include "mlx5_autoconf.h"
 
 /**
- * Configure a VLAN filter.
+ * DPDK callback to configure a VLAN filter.
  *
  * @param dev
  *   Pointer to Ethernet device structure.
@@ -61,14 +54,16 @@
  *   Toggle filter.
  *
  * @return
- *   0 on success, errno value on failure.
+ *   0 on success, negative errno value on failure.
  */
-static int
-vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
+int
+mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
 {
 	struct priv *priv = dev->data->dev_private;
 	unsigned int i;
+	int ret = 0;
 
+	priv_lock(priv);
 	DEBUG("%p: %s VLAN filter ID %" PRIu16,
 	      (void *)dev, (on ? "enable" : "disable"), vlan_id);
 	assert(priv->vlan_filter_n <= RTE_DIM(priv->vlan_filter));
@@ -76,13 +71,15 @@ vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
 		if (priv->vlan_filter[i] == vlan_id)
 			break;
 	/* Check if there's room for another VLAN filter. */
-	if (i == RTE_DIM(priv->vlan_filter))
-		return ENOMEM;
+	if (i == RTE_DIM(priv->vlan_filter)) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	if (i < priv->vlan_filter_n) {
 		assert(priv->vlan_filter_n != 0);
 		/* Enabling an existing VLAN filter has no effect. */
 		if (on)
-			return 0;
+			goto out;
 		/* Remove VLAN filter from list. */
 		--priv->vlan_filter_n;
 		memmove(&priv->vlan_filter[i],
@@ -94,41 +91,16 @@ vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
 		assert(i == priv->vlan_filter_n);
 		/* Disabling an unknown VLAN filter has no effect. */
 		if (!on)
-			return 0;
+			goto out;
 		/* Add new VLAN filter. */
 		priv->vlan_filter[priv->vlan_filter_n] = vlan_id;
 		++priv->vlan_filter_n;
 	}
-	/* Rehash flows in all hash RX queues. */
-	priv_mac_addrs_disable(priv);
-	priv_special_flow_disable_all(priv);
-	return priv_rehash_flows(priv);
-}
-
-/**
- * DPDK callback to configure a VLAN filter.
- *
- * @param dev
- *   Pointer to Ethernet device structure.
- * @param vlan_id
- *   VLAN ID to filter.
- * @param on
- *   Toggle filter.
- *
- * @return
- *   0 on success, negative errno value on failure.
- */
-int
-mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
-{
-	struct priv *priv = dev->data->dev_private;
-	int ret;
-
-	priv_lock(priv);
-	ret = vlan_filter_set(dev, vlan_id, on);
+	if (dev->data->dev_started)
+		priv_dev_traffic_restart(priv, dev);
+out:
 	priv_unlock(priv);
-	assert(ret >= 0);
-	return -ret;
+	return ret;
 }
 
 /**
@@ -144,22 +116,24 @@ mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
 static void
 priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
 {
-	struct rxq *rxq = (*priv->rxqs)[idx];
-	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-	struct ibv_exp_wq_attr mod;
+	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	struct ibv_wq_attr mod;
 	uint16_t vlan_offloads =
-		(on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) |
+		(on ? IBV_WQ_FLAGS_CVLAN_STRIPPING : 0) |
 		0;
 	int err;
 
 	DEBUG("set VLAN offloads 0x%x for port %d queue %d",
 	      vlan_offloads, rxq->port_id, idx);
-	mod = (struct ibv_exp_wq_attr){
-		.attr_mask = IBV_EXP_WQ_ATTR_VLAN_OFFLOADS,
-		.vlan_offloads = vlan_offloads,
+	mod = (struct ibv_wq_attr){
+		.attr_mask = IBV_WQ_ATTR_FLAGS,
+		.flags_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING,
+		.flags = vlan_offloads,
 	};
 
-	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
+	err = ibv_modify_wq(rxq_ctrl->ibv->wq, &mod);
 	if (err) {
 		ERROR("%p: failed to modified stripping mode: %s",
 		      (void *)priv, strerror(err));
@@ -210,7 +184,7 @@ mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on)
  * @param mask
  *   VLAN offload bit mask.
  */
-void
+int
 mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask)
 {
 	struct priv *priv = dev->data->dev_private;
@@ -221,7 +195,7 @@ mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask)
 
 		if (!priv->hw_vlan_strip) {
 			ERROR("VLAN stripping is not supported");
-			return;
+			return 0;
 		}
 
 		/* Run on every RX queue and set/reset VLAN stripping. */
@@ -230,4 +204,6 @@ mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask)
 			priv_vlan_strip_queue_set(priv, i, hw_vlan_strip);
 		priv_unlock(priv);
 	}
+
+	return 0;
 }
author	Luca Boccassi <luca.boccassi@gmail.com>	2017-11-08 14:15:11 +0000
committer	Luca Boccassi <luca.boccassi@gmail.com>	2017-11-08 14:45:54 +0000
commit	055c52583a2794da8ba1e85a48cce3832372b12f (patch)
tree	8ceb1cb78fbb46a0f341f8ee24feb3c6b5540013 /drivers/net/mlx5
parent	f239aed5e674965691846e8ce3f187dd47523689 (diff)