Diffstat (limited to 'drivers/net/mlx4')
-rw-r--r--  drivers/net/mlx4/Makefile       |   50
-rw-r--r--  drivers/net/mlx4/mlx4.c         |  255
-rw-r--r--  drivers/net/mlx4/mlx4.h         |   57
-rw-r--r--  drivers/net/mlx4/mlx4_ethdev.c  |  208
-rw-r--r--  drivers/net/mlx4/mlx4_flow.c    |  240
-rw-r--r--  drivers/net/mlx4/mlx4_flow.h    |    6
-rw-r--r--  drivers/net/mlx4/mlx4_glue.c    |    2
-rw-r--r--  drivers/net/mlx4/mlx4_glue.h    |    2
-rw-r--r--  drivers/net/mlx4/mlx4_intr.c    |    2
-rw-r--r--  drivers/net/mlx4/mlx4_mr.c      | 1247
-rw-r--r--  drivers/net/mlx4/mlx4_mr.h      |  122
-rw-r--r--  drivers/net/mlx4/mlx4_prm.h     |   18
-rw-r--r--  drivers/net/mlx4/mlx4_rxq.c     |  109
-rw-r--r--  drivers/net/mlx4/mlx4_rxtx.c    |  566
-rw-r--r--  drivers/net/mlx4/mlx4_rxtx.h    |   94
-rw-r--r--  drivers/net/mlx4/mlx4_txq.c     |  130
-rw-r--r--  drivers/net/mlx4/mlx4_utils.c   |    2
-rw-r--r--  drivers/net/mlx4/mlx4_utils.h   |    2
18 files changed, 2267 insertions(+), 845 deletions(-)
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index cc800493..92e93225 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -1,33 +1,6 @@
-# BSD LICENSE
-#
+# SPDX-License-Identifier: BSD-3-Clause
# Copyright 2012 6WIND S.A.
-# Copyright 2012 Mellanox
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of 6WIND S.A. nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Copyright 2012 Mellanox Technologies, Ltd
include $(RTE_SDK)/mk/rte.vars.mk
@@ -64,6 +37,7 @@ CFLAGS += -D_BSD_SOURCE
CFLAGS += -D_DEFAULT_SOURCE
CFLAGS += -D_XOPEN_SOURCE=600
CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y)
CFLAGS += -DMLX4_GLUE='"$(LIB_GLUE)"'
CFLAGS += -DMLX4_GLUE_VERSION='"$(LIB_GLUE_VERSION)"'
@@ -95,10 +69,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif
-ifdef CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE
-CFLAGS += -DMLX4_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE)
-endif
-
include $(RTE_SDK)/mk/rte.lib.mk
# Generate and clean-up mlx4_autoconf.h.
@@ -115,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
$Q $(RM) -f -- '$@'
$Q : > '$@'
+ $Q sh -- '$<' '$@' \
+ HAVE_IBV_MLX4_WQE_LSO_SEG \
+ infiniband/mlx4dv.h \
+ type 'struct mlx4_wqe_lso_seg' \
+ $(AUTOCONF_OUTPUT)
# Create mlx4_autoconf.h or update it in case it differs from the new one.
@@ -132,10 +107,15 @@ ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y)
$(LIB): $(LIB_GLUE)
+ifeq ($(LINK_USING_CC),1)
+GLUE_LDFLAGS := $(call linkerprefix,$(LDFLAGS))
+else
+GLUE_LDFLAGS := $(LDFLAGS)
+endif
$(LIB_GLUE): mlx4_glue.o
- $Q $(LD) $(LDFLAGS) $(EXTRA_LDFLAGS) \
+ $Q $(LD) $(GLUE_LDFLAGS) $(EXTRA_LDFLAGS) \
-Wl,-h,$(LIB_GLUE) \
- -s -shared -o $@ $< -libverbs -lmlx4
+ -shared -o $@ $< -libverbs -lmlx4
mlx4_glue.o: mlx4_autoconf.h
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index ee93dafe..defc0d4b 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2012 6WIND S.A.
- * Copyright 2012 Mellanox
+ * Copyright 2012 Mellanox Technologies, Ltd
*/
/**
@@ -44,9 +44,15 @@
#include "mlx4.h"
#include "mlx4_glue.h"
#include "mlx4_flow.h"
+#include "mlx4_mr.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
+struct mlx4_dev_list mlx4_mem_event_cb_list =
+ LIST_HEAD_INITIALIZER(mlx4_mem_event_cb_list);
+
+rte_rwlock_t mlx4_mem_event_rwlock = RTE_RWLOCK_INITIALIZER;
+
/** Configuration structure for device arguments. */
struct mlx4_conf {
struct {
@@ -61,6 +67,8 @@ const char *pmd_mlx4_init_params[] = {
NULL,
};
+static void mlx4_dev_stop(struct rte_eth_dev *dev);
+
/**
* DPDK callback for Ethernet device configuration.
*
@@ -123,6 +131,9 @@ mlx4_dev_start(struct rte_eth_dev *dev)
(void *)dev, strerror(-ret));
goto err;
}
+#ifndef NDEBUG
+ mlx4_mr_dump_dev(dev);
+#endif
ret = mlx4_rxq_intr_enable(priv);
if (ret) {
ERROR("%p: interrupt handler installation failed",
@@ -143,8 +154,7 @@ mlx4_dev_start(struct rte_eth_dev *dev)
dev->rx_pkt_burst = mlx4_rx_burst;
return 0;
err:
- /* Rollback. */
- priv->started = 0;
+ mlx4_dev_stop(dev);
return ret;
}
@@ -194,10 +204,12 @@ mlx4_dev_close(struct rte_eth_dev *dev)
dev->tx_pkt_burst = mlx4_tx_burst_removed;
rte_wmb();
mlx4_flow_clean(priv);
+ mlx4_rss_deinit(priv);
for (i = 0; i != dev->data->nb_rx_queues; ++i)
mlx4_rx_queue_release(dev->data->rx_queues[i]);
for (i = 0; i != dev->data->nb_tx_queues; ++i)
mlx4_tx_queue_release(dev->data->tx_queues[i]);
+ mlx4_mr_release(dev);
if (priv->pd != NULL) {
assert(priv->ctx != NULL);
claim_zero(mlx4_glue->dealloc_pd(priv->pd));
@@ -385,6 +397,99 @@ free_kvlist:
return ret;
}
+/**
+ * Interpret RSS capabilities reported by device.
+ *
+ * This function returns the set of usable Verbs RSS hash fields, kernel
+ * quirks taken into account.
+ *
+ * @param ctx
+ * Verbs context.
+ * @param pd
+ * Verbs protection domain.
+ * @param device_attr_ex
+ * Extended device attributes to interpret.
+ *
+ * @return
+ * Usable RSS hash fields mask in Verbs format.
+ */
+static uint64_t
+mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
+ struct ibv_device_attr_ex *device_attr_ex)
+{
+ uint64_t hw_rss_sup = device_attr_ex->rss_caps.rx_hash_fields_mask;
+ struct ibv_cq *cq = NULL;
+ struct ibv_wq *wq = NULL;
+ struct ibv_rwq_ind_table *ind = NULL;
+ struct ibv_qp *qp = NULL;
+
+ if (!hw_rss_sup) {
+ WARN("no RSS capabilities reported; disabling support for UDP"
+ " RSS and inner VXLAN RSS");
+ return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
+ IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 |
+ IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP;
+ }
+ if (!(hw_rss_sup & IBV_RX_HASH_INNER))
+ return hw_rss_sup;
+ /*
+ * Although reported as supported, missing code in some Linux
+ * versions (v4.15, v4.16) prevents the creation of hash QPs with
+ * inner capability.
+ *
+ * There is no choice but to attempt to instantiate a temporary RSS
+ * context in order to confirm its support.
+ */
+ cq = mlx4_glue->create_cq(ctx, 1, NULL, NULL, 0);
+ wq = cq ? mlx4_glue->create_wq
+ (ctx,
+ &(struct ibv_wq_init_attr){
+ .wq_type = IBV_WQT_RQ,
+ .max_wr = 1,
+ .max_sge = 1,
+ .pd = pd,
+ .cq = cq,
+ }) : NULL;
+ ind = wq ? mlx4_glue->create_rwq_ind_table
+ (ctx,
+ &(struct ibv_rwq_ind_table_init_attr){
+ .log_ind_tbl_size = 0,
+ .ind_tbl = &wq,
+ .comp_mask = 0,
+ }) : NULL;
+ qp = ind ? mlx4_glue->create_qp_ex
+ (ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .comp_mask =
+ (IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_RX_HASH |
+ IBV_QP_INIT_ATTR_IND_TABLE),
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .pd = pd,
+ .rwq_ind_tbl = ind,
+ .rx_hash_conf = {
+ .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE,
+ .rx_hash_key = mlx4_rss_hash_key_default,
+ .rx_hash_fields_mask = hw_rss_sup,
+ },
+ }) : NULL;
+ if (!qp) {
+ WARN("disabling unusable inner RSS capability due to kernel"
+ " quirk");
+ hw_rss_sup &= ~IBV_RX_HASH_INNER;
+ } else {
+ claim_zero(mlx4_glue->destroy_qp(qp));
+ }
+ if (ind)
+ claim_zero(mlx4_glue->destroy_rwq_ind_table(ind));
+ if (wq)
+ claim_zero(mlx4_glue->destroy_wq(wq));
+ if (cq)
+ claim_zero(mlx4_glue->destroy_cq(cq));
+ return hw_rss_sup;
+}
+
static struct rte_pci_driver mlx4_driver;
/**
@@ -470,14 +575,14 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
ibv_dev = list[i];
DEBUG("device opened");
if (mlx4_glue->query_device(attr_ctx, &device_attr)) {
- rte_errno = ENODEV;
+ err = ENODEV;
goto error;
}
INFO("%u port(s) detected", device_attr.phys_port_cnt);
conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1;
if (mlx4_args(pci_dev->device.devargs, &conf)) {
ERROR("failed to process device arguments");
- rte_errno = EINVAL;
+ err = EINVAL;
goto error;
}
/* Use all ports when none are defined */
@@ -485,7 +590,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
conf.ports.enabled = conf.ports.present;
/* Retrieve extended device attributes. */
if (mlx4_glue->query_device_ex(attr_ctx, NULL, &device_attr_ex)) {
- rte_errno = ENODEV;
+ err = ENODEV;
goto error;
}
assert(device_attr.max_sge >= MLX4_MAX_SGE);
@@ -504,18 +609,18 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
DEBUG("using port %u", port);
ctx = mlx4_glue->open_device(ibv_dev);
if (ctx == NULL) {
- rte_errno = ENODEV;
+ err = ENODEV;
goto port_error;
}
/* Check port status. */
err = mlx4_glue->query_port(ctx, port, &port_attr);
if (err) {
- rte_errno = err;
- ERROR("port query failed: %s", strerror(rte_errno));
+ err = ENODEV;
+ ERROR("port query failed: %s", strerror(err));
goto port_error;
}
if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
- rte_errno = ENOTSUP;
+ err = ENOTSUP;
ERROR("port %d is not configured in Ethernet mode",
port);
goto port_error;
@@ -525,15 +630,16 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
port, mlx4_glue->port_state_str(port_attr.state),
port_attr.state);
/* Make asynchronous FD non-blocking to handle interrupts. */
- if (mlx4_fd_set_non_blocking(ctx->async_fd) < 0) {
+ err = mlx4_fd_set_non_blocking(ctx->async_fd);
+ if (err) {
ERROR("cannot make asynchronous FD non-blocking: %s",
- strerror(rte_errno));
+ strerror(err));
goto port_error;
}
/* Allocate protection domain. */
pd = mlx4_glue->alloc_pd(ctx);
if (pd == NULL) {
- rte_errno = ENOMEM;
+ err = ENOMEM;
ERROR("PD allocation failure");
goto port_error;
}
@@ -542,7 +648,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
sizeof(*priv),
RTE_CACHE_LINE_SIZE);
if (priv == NULL) {
- rte_errno = ENOMEM;
+ err = ENOMEM;
ERROR("priv allocation failure");
goto port_error;
}
@@ -562,26 +668,32 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
(device_attr.vendor_part_id ==
PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO);
DEBUG("L2 tunnel checksum offloads are %ssupported",
- (priv->hw_csum_l2tun ? "" : "not "));
- priv->hw_rss_sup = device_attr_ex.rss_caps.rx_hash_fields_mask;
- if (!priv->hw_rss_sup) {
- WARN("no RSS capabilities reported; disabling support"
- " for UDP RSS and inner VXLAN RSS");
- /* Fake support for all possible RSS hash fields. */
- priv->hw_rss_sup = ~UINT64_C(0);
- priv->hw_rss_sup = mlx4_conv_rss_hf(priv, -1);
- /* Filter out known unsupported fields. */
- priv->hw_rss_sup &=
- ~(uint64_t)(IBV_RX_HASH_SRC_PORT_UDP |
- IBV_RX_HASH_DST_PORT_UDP |
- IBV_RX_HASH_INNER);
- }
+ priv->hw_csum_l2tun ? "" : "not ");
+ priv->hw_rss_sup = mlx4_hw_rss_sup(priv->ctx, priv->pd,
+ &device_attr_ex);
DEBUG("supported RSS hash fields mask: %016" PRIx64,
priv->hw_rss_sup);
+ priv->hw_rss_max_qps =
+ device_attr_ex.rss_caps.max_rwq_indirection_table_size;
+ DEBUG("MAX RSS queues %d", priv->hw_rss_max_qps);
+ priv->hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
+ IBV_RAW_PACKET_CAP_SCATTER_FCS);
+ DEBUG("FCS stripping toggling is %ssupported",
+ priv->hw_fcs_strip ? "" : "not ");
+ priv->tso =
+ ((device_attr_ex.tso_caps.max_tso > 0) &&
+ (device_attr_ex.tso_caps.supported_qpts &
+ (1 << IBV_QPT_RAW_PACKET)));
+ if (priv->tso)
+ priv->tso_max_payload_sz =
+ device_attr_ex.tso_caps.max_tso;
+ DEBUG("TSO is %ssupported",
+ priv->tso ? "" : "not ");
/* Configure the first MAC address by default. */
- if (mlx4_get_mac(priv, &mac.addr_bytes)) {
+ err = mlx4_get_mac(priv, &mac.addr_bytes);
+ if (err) {
ERROR("cannot get MAC address, is mlx4_en loaded?"
- " (rte_errno: %s)", strerror(rte_errno));
+ " (error: %s)", strerror(err));
goto port_error;
}
INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
@@ -614,8 +726,8 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
eth_dev = rte_eth_dev_allocate(name);
}
if (eth_dev == NULL) {
+ err = ENOMEM;
ERROR("can not allocate rte ethdev");
- rte_errno = ENOMEM;
goto port_error;
}
eth_dev->data->dev_private = priv;
@@ -649,6 +761,24 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
/* Update link status once if waiting for LSC. */
if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
mlx4_link_update(eth_dev, 0);
+ /*
+	 * Once the device is added to the memory event callback
+	 * list, its global MR cache table cannot be expanded on the
+	 * fly because of a possible deadlock. If it overflows, lookups
+	 * must fall back to searching the MR list linearly, which is slow.
+ */
+ err = mlx4_mr_btree_init(&priv->mr.cache,
+ MLX4_MR_BTREE_CACHE_N * 2,
+ eth_dev->device->numa_node);
+ if (err) {
+ /* rte_errno is already set. */
+ goto port_error;
+ }
+ /* Add device to memory callback list. */
+ rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+ LIST_INSERT_HEAD(&mlx4_mem_event_cb_list, priv, mem_event_cb);
+ rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+ rte_eth_dev_probing_finish(eth_dev);
continue;
port_error:
rte_free(priv);
@@ -660,8 +790,6 @@ port_error:
rte_eth_dev_release_port(eth_dev);
break;
}
- if (i == device_attr.phys_port_cnt)
- return 0;
/*
* XXX if something went wrong in the loop above, there is a resource
* leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
@@ -673,8 +801,9 @@ error:
claim_zero(mlx4_glue->close_device(attr_ctx));
if (list)
mlx4_glue->free_device_list(list);
- assert(rte_errno >= 0);
- return -rte_errno;
+ if (err)
+ rte_errno = err;
+ return -err;
}
static const struct rte_pci_id mlx4_pci_id_map[] = {
@@ -708,11 +837,53 @@ static struct rte_pci_driver mlx4_driver = {
#ifdef RTE_LIBRTE_MLX4_DLOPEN_DEPS
/**
+ * Suffix RTE_EAL_PMD_PATH with "-glue".
+ *
+ * This function performs a sanity check on RTE_EAL_PMD_PATH before
+ * suffixing its last component.
+ *
+ * @param[out] buf
+ * Output buffer; it should be large enough, otherwise NULL is returned.
+ * @param size
+ * Size of @p buf.
+ *
+ * @return
+ * Pointer to @p buf, or NULL in case the suffix cannot be appended.
+ */
+static char *
+mlx4_glue_path(char *buf, size_t size)
+{
+ static const char *const bad[] = { "/", ".", "..", NULL };
+ const char *path = RTE_EAL_PMD_PATH;
+ size_t len = strlen(path);
+ size_t off;
+ int i;
+
+ while (len && path[len - 1] == '/')
+ --len;
+ for (off = len; off && path[off - 1] != '/'; --off)
+ ;
+ for (i = 0; bad[i]; ++i)
+ if (!strncmp(path + off, bad[i], (int)(len - off)))
+ goto error;
+ i = snprintf(buf, size, "%.*s-glue", (int)len, path);
+ if (i == -1 || (size_t)i >= size)
+ goto error;
+ return buf;
+error:
+ ERROR("unable to append \"-glue\" to last component of"
+ " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
+ " please re-configure DPDK");
+ return NULL;
+}
+
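
For illustration only (not part of the patch): a minimal standalone sketch of the "-glue" suffixing performed by mlx4_glue_path() above. It trims trailing slashes before appending the suffix and omits the sanity check on the last path component; the input path is just an example.

#include <stdio.h>
#include <string.h>

static char *
glue_path(char *buf, size_t size, const char *path)
{
	size_t len = strlen(path);
	int n;

	/* Trim trailing slashes before appending the suffix. */
	while (len && path[len - 1] == '/')
		--len;
	n = snprintf(buf, size, "%.*s-glue", (int)len, path);
	if (n < 0 || (size_t)n >= size)
		return NULL; /* Output buffer too small. */
	return buf;
}

int
main(void)
{
	char buf[64];

	/* Prints "/usr/lib/dpdk-pmds-glue" (example path, not a real default). */
	puts(glue_path(buf, sizeof(buf), "/usr/lib/dpdk-pmds/"));
	return 0;
}
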
+/**
* Initialization routine for run-time dependency on rdma-core.
*/
static int
mlx4_glue_init(void)
{
+ char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
const char *path[] = {
/*
* A basic security check is necessary before trusting
@@ -720,7 +891,13 @@ mlx4_glue_init(void)
*/
(geteuid() == getuid() && getegid() == getgid() ?
getenv("MLX4_GLUE_PATH") : NULL),
- RTE_EAL_PMD_PATH,
+ /*
+ * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
+ * variant, otherwise let dlopen() look up libraries on its
+ * own.
+ */
+ (*RTE_EAL_PMD_PATH ?
+ mlx4_glue_path(glue_path, sizeof(glue_path)) : ""),
};
unsigned int i = 0;
void *handle = NULL;
@@ -790,9 +967,7 @@ glue_error:
/**
* Driver initialization routine.
*/
-RTE_INIT(rte_mlx4_pmd_init);
-static void
-rte_mlx4_pmd_init(void)
+RTE_INIT(rte_mlx4_pmd_init)
{
/*
* MLX4_DEVICE_FATAL_CLEANUP tells ibv_destroy functions we
@@ -828,6 +1003,8 @@ rte_mlx4_pmd_init(void)
}
mlx4_glue->fork_init();
rte_pci_register(&mlx4_driver);
+ rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
+ mlx4_mr_mem_event_cb, NULL);
}
RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 19c8a223..e6fb934f 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2012 6WIND S.A.
- * Copyright 2012 Mellanox
+ * Copyright 2012 Mellanox Technologies, Ltd
*/
#ifndef RTE_PMD_MLX4_H_
@@ -23,7 +23,9 @@
#include <rte_ether.h>
#include <rte_interrupts.h>
#include <rte_mempool.h>
-#include <rte_spinlock.h>
+#include <rte_rwlock.h>
+
+#include "mlx4_mr.h"
#ifndef IBV_RX_HASH_INNER
/** This is not necessarily defined by supported RDMA core versions. */
@@ -42,20 +44,12 @@
/** Fixed RSS hash key size in bytes. Cannot be modified. */
#define MLX4_RSS_HASH_KEY_SIZE 40
-/**
- * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
- * from which buffers are to be transmitted will have to be mapped by this
- * driver to their own Memory Region (MR). This is a slow operation.
- *
- * This value is always 1 for RX queues.
- */
-#ifndef MLX4_PMD_TX_MP_CACHE
-#define MLX4_PMD_TX_MP_CACHE 8
-#endif
-
/** Interrupt alarm timeout value in microseconds. */
#define MLX4_INTR_ALARM_TIMEOUT 100000
+/* Maximum packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192
+
/** Port parameter. */
#define MLX4_PMD_PORT_KVARG "port"
@@ -78,20 +72,12 @@ struct rxq;
struct txq;
struct rte_flow;
-/** Memory region descriptor. */
-struct mlx4_mr {
- LIST_ENTRY(mlx4_mr) next; /**< Next entry in list. */
- uintptr_t start; /**< Base address for memory region. */
- uintptr_t end; /**< End address for memory region. */
- uint32_t lkey; /**< L_Key extracted from @p mr. */
- uint32_t refcnt; /**< Reference count for this object. */
- struct priv *priv; /**< Back pointer to private data. */
- struct ibv_mr *mr; /**< Memory region associated with @p mp. */
- struct rte_mempool *mp; /**< Target memory pool (mempool). */
-};
+LIST_HEAD(mlx4_dev_list, priv);
+LIST_HEAD(mlx4_mr_list, mlx4_mr);
/** Private data structure. */
struct priv {
+ LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
struct rte_eth_dev *dev; /**< Ethernet device. */
struct ibv_context *ctx; /**< Verbs context. */
struct ibv_device_attr device_attr; /**< Device properties. */
@@ -103,15 +89,25 @@ struct priv {
uint32_t vf:1; /**< This is a VF device. */
uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */
uint32_t isolated:1; /**< Toggle isolated mode. */
+ uint32_t rss_init:1; /**< Common RSS context is initialized. */
uint32_t hw_csum:1; /**< Checksum offload is supported. */
uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
+ uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+ uint32_t tso:1; /**< Transmit segmentation offload is supported. */
+ uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
+ uint32_t hw_rss_max_qps; /**< Max Rx Queues supported by RSS. */
uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
+ struct {
+ uint32_t dev_gen; /* Generation number to flush local caches. */
+ rte_rwlock_t rwlock; /* MR Lock. */
+ struct mlx4_mr_btree cache; /* Global MR cache table. */
+ struct mlx4_mr_list mr_list; /* Registered MR list. */
+ struct mlx4_mr_list mr_free_list; /* Freed MR list. */
+ } mr;
LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */
LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */
- LIST_HEAD(, mlx4_mr) mr; /**< Registered memory regions. */
- rte_spinlock_t mr_lock; /**< Lock for @p mr access. */
struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
/**< Configured MAC addresses. Unused entries are zeroed. */
};
@@ -131,7 +127,7 @@ void mlx4_allmulticast_disable(struct rte_eth_dev *dev);
void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
uint32_t index, uint32_t vmdq);
-void mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr);
+int mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr);
int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
void mlx4_stats_reset(struct rte_eth_dev *dev);
@@ -154,11 +150,4 @@ void mlx4_rxq_intr_disable(struct priv *priv);
int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
-/* mlx4_mr.c */
-
-struct mlx4_mr *mlx4_mr_get(struct priv *priv, struct rte_mempool *mp);
-void mlx4_mr_put(struct mlx4_mr *mr);
-uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
- uint32_t i);
-
#endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c
index 3bc69273..30deb3ef 100644
--- a/drivers/net/mlx4/mlx4_ethdev.c
+++ b/drivers/net/mlx4/mlx4_ethdev.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -39,6 +39,7 @@
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_pci.h>
+#include <rte_string_fns.h>
#include "mlx4.h"
#include "mlx4_flow.h"
@@ -120,7 +121,7 @@ try_dev_id:
goto try_dev_id;
dev_port_prev = dev_port;
if (dev_port == (priv->port - 1u))
- snprintf(match, sizeof(match), "%s", name);
+ strlcpy(match, name, sizeof(match));
}
closedir(dir);
if (match[0] == '\0') {
@@ -132,167 +133,6 @@ try_dev_id:
}
/**
- * Read from sysfs entry.
- *
- * @param[in] priv
- * Pointer to private structure.
- * @param[in] entry
- * Entry name relative to sysfs path.
- * @param[out] buf
- * Data output buffer.
- * @param size
- * Buffer size.
- *
- * @return
- * Number of bytes read on success, negative errno value otherwise and
- * rte_errno is set.
- */
-static int
-mlx4_sysfs_read(const struct priv *priv, const char *entry,
- char *buf, size_t size)
-{
- char ifname[IF_NAMESIZE];
- FILE *file;
- int ret;
-
- ret = mlx4_get_ifname(priv, &ifname);
- if (ret)
- return ret;
-
- MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
- ifname, entry);
-
- file = fopen(path, "rb");
- if (file == NULL) {
- rte_errno = errno;
- return -rte_errno;
- }
- ret = fread(buf, 1, size, file);
- if ((size_t)ret < size && ferror(file)) {
- rte_errno = EIO;
- ret = -rte_errno;
- } else {
- ret = size;
- }
- fclose(file);
- return ret;
-}
-
-/**
- * Write to sysfs entry.
- *
- * @param[in] priv
- * Pointer to private structure.
- * @param[in] entry
- * Entry name relative to sysfs path.
- * @param[in] buf
- * Data buffer.
- * @param size
- * Buffer size.
- *
- * @return
- * Number of bytes written on success, negative errno value otherwise and
- * rte_errno is set.
- */
-static int
-mlx4_sysfs_write(const struct priv *priv, const char *entry,
- char *buf, size_t size)
-{
- char ifname[IF_NAMESIZE];
- FILE *file;
- int ret;
-
- ret = mlx4_get_ifname(priv, &ifname);
- if (ret)
- return ret;
-
- MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
- ifname, entry);
-
- file = fopen(path, "wb");
- if (file == NULL) {
- rte_errno = errno;
- return -rte_errno;
- }
- ret = fwrite(buf, 1, size, file);
- if ((size_t)ret < size || ferror(file)) {
- rte_errno = EIO;
- ret = -rte_errno;
- } else {
- ret = size;
- }
- fclose(file);
- return ret;
-}
-
-/**
- * Get unsigned long sysfs property.
- *
- * @param priv
- * Pointer to private structure.
- * @param[in] name
- * Entry name relative to sysfs path.
- * @param[out] value
- * Value output buffer.
- *
- * @return
- * 0 on success, negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx4_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
-{
- int ret;
- unsigned long value_ret;
- char value_str[32];
-
- ret = mlx4_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
- if (ret < 0) {
- DEBUG("cannot read %s value from sysfs: %s",
- name, strerror(rte_errno));
- return ret;
- }
- value_str[ret] = '\0';
- errno = 0;
- value_ret = strtoul(value_str, NULL, 0);
- if (errno) {
- rte_errno = errno;
- DEBUG("invalid %s value `%s': %s", name, value_str,
- strerror(rte_errno));
- return -rte_errno;
- }
- *value = value_ret;
- return 0;
-}
-
-/**
- * Set unsigned long sysfs property.
- *
- * @param priv
- * Pointer to private structure.
- * @param[in] name
- * Entry name relative to sysfs path.
- * @param value
- * Value to set.
- *
- * @return
- * 0 on success, negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx4_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
-{
- int ret;
- MKSTR(value_str, "%lu", value);
-
- ret = mlx4_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
- if (ret < 0) {
- DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
- name, value_str, value, strerror(rte_errno));
- return ret;
- }
- return 0;
-}
-
-/**
* Perform ifreq ioctl() on associated Ethernet device.
*
* @param[in] priv
@@ -361,12 +201,12 @@ mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
int
mlx4_mtu_get(struct priv *priv, uint16_t *mtu)
{
- unsigned long ulong_mtu = 0;
- int ret = mlx4_get_sysfs_ulong(priv, "mtu", &ulong_mtu);
+ struct ifreq request;
+ int ret = mlx4_ifreq(priv, SIOCGIFMTU, &request);
if (ret)
return ret;
- *mtu = ulong_mtu;
+ *mtu = request.ifr_mtu;
return 0;
}
@@ -385,20 +225,13 @@ int
mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
struct priv *priv = dev->data->dev_private;
- uint16_t new_mtu;
- int ret = mlx4_set_sysfs_ulong(priv, "mtu", mtu);
+ struct ifreq request = { .ifr_mtu = mtu, };
+ int ret = mlx4_ifreq(priv, SIOCSIFMTU, &request);
if (ret)
return ret;
- ret = mlx4_mtu_get(priv, &new_mtu);
- if (ret)
- return ret;
- if (new_mtu == mtu) {
- priv->mtu = mtu;
- return 0;
- }
- rte_errno = EINVAL;
- return -rte_errno;
+ priv->mtu = mtu;
+ return 0;
}
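
For illustration only (not part of the patch): a minimal standalone sketch of the SIOCGIFMTU ioctl() that replaces the sysfs read above. The interface name "eth0" is an example and error handling is kept to a minimum; the driver itself fills the interface name from the Verbs device.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

int
main(void)
{
	struct ifreq req;
	/* Any datagram socket works as an ioctl() target. */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&req, 0, sizeof(req));
	strncpy(req.ifr_name, "eth0", IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFMTU, &req) == 0)
		printf("MTU: %d\n", req.ifr_mtu);
	else
		perror("SIOCGIFMTU");
	close(fd);
	return 0;
}
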
/**
@@ -417,14 +250,14 @@ mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
static int
mlx4_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
{
- unsigned long tmp = 0;
- int ret = mlx4_get_sysfs_ulong(priv, "flags", &tmp);
+ struct ifreq request;
+ int ret = mlx4_ifreq(priv, SIOCGIFFLAGS, &request);
if (ret)
return ret;
- tmp &= keep;
- tmp |= (flags & (~keep));
- return mlx4_set_sysfs_ulong(priv, "flags", tmp);
+ request.ifr_flags &= keep;
+ request.ifr_flags |= flags & ~keep;
+ return mlx4_ifreq(priv, SIOCSIFFLAGS, &request);
}
/**
@@ -701,11 +534,14 @@ mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
* Pointer to Ethernet device structure.
* @param mac_addr
* MAC address to register.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
-void
+int
mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
{
- mlx4_mac_addr_add(dev, mac_addr, 0, 0);
+ return mlx4_mac_addr_add(dev, mac_addr, 0, 0);
}
/**
@@ -723,7 +559,6 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
unsigned int max;
char ifname[IF_NAMESIZE];
- info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
/* FIXME: we should ask the device for these values. */
info->min_rx_bufsize = 32;
info->max_rx_pktlen = 65536;
@@ -752,6 +587,7 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
ETH_LINK_SPEED_20G |
ETH_LINK_SPEED_40G |
ETH_LINK_SPEED_56G;
+ info->flow_type_rss_offloads = mlx4_conv_rss_types(priv, 0, 1);
}
/**
@@ -878,7 +714,7 @@ mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete)
}
link_speed = ethtool_cmd_speed(&edata);
if (link_speed == -1)
- dev_link.link_speed = 0;
+ dev_link.link_speed = ETH_SPEED_NUM_NONE;
else
dev_link.link_speed = link_speed;
dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 2d55bfe0..b40e7e5c 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -76,69 +76,94 @@ struct mlx4_drop {
};
/**
- * Convert DPDK RSS hash fields to their Verbs equivalent.
+ * Convert supported RSS hash field types between DPDK and Verbs formats.
*
- * This function returns the supported (default) set when @p rss_hf has
- * special value (uint64_t)-1.
+ * This function returns the supported (default) set when @p types has
+ * special value 0.
*
* @param priv
* Pointer to private structure.
- * @param rss_hf
- * Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ * @param types
+ * Depending on @p verbs_to_dpdk, hash types in either DPDK (see struct
+ * rte_eth_rss_conf) or Verbs format.
+ * @param verbs_to_dpdk
+ * A zero value converts @p types from DPDK to Verbs, a nonzero value
+ * performs the reverse operation.
*
* @return
- * A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1
- * otherwise and rte_errno is set.
+ * Converted RSS hash fields on success, (uint64_t)-1 otherwise and
+ * rte_errno is set.
*/
uint64_t
-mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
+mlx4_conv_rss_types(struct priv *priv, uint64_t types, int verbs_to_dpdk)
{
- enum { IPV4, IPV6, TCP, UDP, };
- const uint64_t in[] = {
- [IPV4] = (ETH_RSS_IPV4 |
- ETH_RSS_FRAG_IPV4 |
- ETH_RSS_NONFRAG_IPV4_TCP |
- ETH_RSS_NONFRAG_IPV4_UDP |
- ETH_RSS_NONFRAG_IPV4_OTHER),
- [IPV6] = (ETH_RSS_IPV6 |
- ETH_RSS_FRAG_IPV6 |
- ETH_RSS_NONFRAG_IPV6_TCP |
- ETH_RSS_NONFRAG_IPV6_UDP |
- ETH_RSS_NONFRAG_IPV6_OTHER |
- ETH_RSS_IPV6_EX |
- ETH_RSS_IPV6_TCP_EX |
- ETH_RSS_IPV6_UDP_EX),
- [TCP] = (ETH_RSS_NONFRAG_IPV4_TCP |
- ETH_RSS_NONFRAG_IPV6_TCP |
- ETH_RSS_IPV6_TCP_EX),
- [UDP] = (ETH_RSS_NONFRAG_IPV4_UDP |
- ETH_RSS_NONFRAG_IPV6_UDP |
- ETH_RSS_IPV6_UDP_EX),
+ enum {
+ INNER,
+ IPV4, IPV4_1, IPV4_2, IPV6, IPV6_1, IPV6_2, IPV6_3,
+ TCP, UDP,
+ IPV4_TCP, IPV4_UDP, IPV6_TCP, IPV6_TCP_1, IPV6_UDP, IPV6_UDP_1,
};
- const uint64_t out[RTE_DIM(in)] = {
- [IPV4] = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
- [IPV6] = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6,
- [TCP] = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP,
- [UDP] = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP,
+ enum {
+ VERBS_IPV4 = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
+ VERBS_IPV6 = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6,
+ VERBS_TCP = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP,
+ VERBS_UDP = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP,
};
+ static const uint64_t dpdk[] = {
+ [INNER] = 0,
+ [IPV4] = ETH_RSS_IPV4,
+ [IPV4_1] = ETH_RSS_FRAG_IPV4,
+ [IPV4_2] = ETH_RSS_NONFRAG_IPV4_OTHER,
+ [IPV6] = ETH_RSS_IPV6,
+ [IPV6_1] = ETH_RSS_FRAG_IPV6,
+ [IPV6_2] = ETH_RSS_NONFRAG_IPV6_OTHER,
+ [IPV6_3] = ETH_RSS_IPV6_EX,
+ [TCP] = 0,
+ [UDP] = 0,
+ [IPV4_TCP] = ETH_RSS_NONFRAG_IPV4_TCP,
+ [IPV4_UDP] = ETH_RSS_NONFRAG_IPV4_UDP,
+ [IPV6_TCP] = ETH_RSS_NONFRAG_IPV6_TCP,
+ [IPV6_TCP_1] = ETH_RSS_IPV6_TCP_EX,
+ [IPV6_UDP] = ETH_RSS_NONFRAG_IPV6_UDP,
+ [IPV6_UDP_1] = ETH_RSS_IPV6_UDP_EX,
+ };
+ static const uint64_t verbs[RTE_DIM(dpdk)] = {
+ [INNER] = IBV_RX_HASH_INNER,
+ [IPV4] = VERBS_IPV4,
+ [IPV4_1] = VERBS_IPV4,
+ [IPV4_2] = VERBS_IPV4,
+ [IPV6] = VERBS_IPV6,
+ [IPV6_1] = VERBS_IPV6,
+ [IPV6_2] = VERBS_IPV6,
+ [IPV6_3] = VERBS_IPV6,
+ [TCP] = VERBS_TCP,
+ [UDP] = VERBS_UDP,
+ [IPV4_TCP] = VERBS_IPV4 | VERBS_TCP,
+ [IPV4_UDP] = VERBS_IPV4 | VERBS_UDP,
+ [IPV6_TCP] = VERBS_IPV6 | VERBS_TCP,
+ [IPV6_TCP_1] = VERBS_IPV6 | VERBS_TCP,
+ [IPV6_UDP] = VERBS_IPV6 | VERBS_UDP,
+ [IPV6_UDP_1] = VERBS_IPV6 | VERBS_UDP,
+ };
+ const uint64_t *in = verbs_to_dpdk ? verbs : dpdk;
+ const uint64_t *out = verbs_to_dpdk ? dpdk : verbs;
uint64_t seen = 0;
uint64_t conv = 0;
unsigned int i;
- for (i = 0; i != RTE_DIM(in); ++i)
- if (rss_hf & in[i]) {
- seen |= rss_hf & in[i];
+ if (!types) {
+ if (!verbs_to_dpdk)
+ return priv->hw_rss_sup;
+ types = priv->hw_rss_sup;
+ }
+ for (i = 0; i != RTE_DIM(dpdk); ++i)
+ if (in[i] && (types & in[i]) == in[i]) {
+ seen |= types & in[i];
conv |= out[i];
}
- if ((conv & priv->hw_rss_sup) == conv) {
- if (rss_hf == (uint64_t)-1) {
- /* Include inner RSS by default if supported. */
- conv |= priv->hw_rss_sup & IBV_RX_HASH_INNER;
- return conv;
- }
- if (!(rss_hf & ~seen))
- return conv;
- }
+ if ((verbs_to_dpdk || (conv & priv->hw_rss_sup) == conv) &&
+ !(types & ~seen))
+ return conv;
rte_errno = ENOTSUP;
return (uint64_t)-1;
}
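
For illustration only (not part of the patch): a minimal standalone sketch of the parallel-array conversion idea used by mlx4_conv_rss_types() above, with made-up flag values. Swapping which array is input and which is output reverses the direction, and several input flags may map to the same output flag.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DIM(a) (sizeof(a) / sizeof((a)[0]))

/* Made-up flag values standing in for the DPDK and Verbs hash types. */
static const uint64_t fmt_a[] = { 0x1, 0x2, 0x4 };
static const uint64_t fmt_b[] = { 0x10, 0x20, 0x20 };

static uint64_t
conv(uint64_t types, int b_to_a)
{
	const uint64_t *in = b_to_a ? fmt_b : fmt_a;
	const uint64_t *out = b_to_a ? fmt_a : fmt_b;
	uint64_t res = 0;
	unsigned int i;

	for (i = 0; i != DIM(fmt_a); ++i)
		if (in[i] && (types & in[i]) == in[i])
			res |= out[i];
	return res;
}

int
main(void)
{
	printf("0x%" PRIx64 "\n", conv(0x3, 0));  /* 0x30 */
	printf("0x%" PRIx64 "\n", conv(0x30, 1)); /* 0x7: two entries share 0x20 */
	return 0;
}
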
@@ -362,6 +387,9 @@ error:
* Additional mlx4-specific constraints on supported fields:
*
* - No support for partial masks.
+ * - Due to HW/FW limitation, flow rule priority is not taken into account
+ * when matching UDP destination ports, doing so is therefore only supported
+ * at the highest priority level (0).
*
* @param[in, out] flow
* Flow rule handle to update.
@@ -393,6 +421,11 @@ mlx4_flow_merge_udp(struct rte_flow *flow,
msg = "mlx4 does not support matching partial UDP fields";
goto error;
}
+ if (mask && mask->hdr.dst_port && flow->priority) {
+ msg = "combining UDP destination port matching with a nonzero"
+ " priority level is not supported";
+ goto error;
+ }
if (!flow->ibv_attr)
return 0;
++flow->ibv_attr->num_of_specs;
@@ -637,6 +670,7 @@ mlx4_flow_prepare(struct priv *priv,
struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
struct rte_flow *flow = &temp;
const char *msg = NULL;
+ int overlap;
if (attr->group)
return rte_flow_error_set
@@ -651,12 +685,18 @@ mlx4_flow_prepare(struct priv *priv,
return rte_flow_error_set
(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
NULL, "egress is not supported");
+ if (attr->transfer)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+ NULL, "transfer is not supported");
if (!attr->ingress)
return rte_flow_error_set
(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
NULL, "only ingress is supported");
fill:
+ overlap = 0;
proc = mlx4_flow_proc_item_list;
+ flow->priority = attr->priority;
/* Go over pattern. */
for (item = pattern; item->type; ++item) {
const struct mlx4_flow_proc_item *next = NULL;
@@ -702,14 +742,24 @@ fill:
}
/* Go over actions list. */
for (action = actions; action->type; ++action) {
+ /* This one may appear anywhere multiple times. */
+ if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+ continue;
+ /* Fate-deciding actions may appear exactly once. */
+ if (overlap) {
+ msg = "cannot combine several fate-deciding actions,"
+ " choose between DROP, QUEUE or RSS";
+ goto exit_action_not_supported;
+ }
+ overlap = 1;
switch (action->type) {
const struct rte_flow_action_queue *queue;
const struct rte_flow_action_rss *rss;
- const struct rte_eth_rss_conf *rss_conf;
+ const uint8_t *rss_key;
+ uint32_t rss_key_len;
+ uint64_t fields;
unsigned int i;
- case RTE_FLOW_ACTION_TYPE_VOID:
- continue;
case RTE_FLOW_ACTION_TYPE_DROP:
flow->drop = 1;
break;
@@ -736,54 +786,68 @@ fill:
break;
rss = action->conf;
/* Default RSS configuration if none is provided. */
- rss_conf =
- rss->rss_conf ?
- rss->rss_conf :
- &(struct rte_eth_rss_conf){
- .rss_key = mlx4_rss_hash_key_default,
- .rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
- .rss_hf = -1,
- };
+ if (rss->key_len) {
+ rss_key = rss->key;
+ rss_key_len = rss->key_len;
+ } else {
+ rss_key = mlx4_rss_hash_key_default;
+ rss_key_len = MLX4_RSS_HASH_KEY_SIZE;
+ }
/* Sanity checks. */
- for (i = 0; i < rss->num; ++i)
+ for (i = 0; i < rss->queue_num; ++i)
if (rss->queue[i] >=
priv->dev->data->nb_rx_queues)
break;
- if (i != rss->num) {
+ if (i != rss->queue_num) {
msg = "queue index target beyond number of"
" configured Rx queues";
goto exit_action_not_supported;
}
- if (!rte_is_power_of_2(rss->num)) {
+ if (!rte_is_power_of_2(rss->queue_num)) {
msg = "for RSS, mlx4 requires the number of"
" queues to be a power of two";
goto exit_action_not_supported;
}
- if (rss_conf->rss_key_len !=
- sizeof(flow->rss->key)) {
+ if (rss_key_len != sizeof(flow->rss->key)) {
msg = "mlx4 supports exactly one RSS hash key"
" length: "
MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
goto exit_action_not_supported;
}
- for (i = 1; i < rss->num; ++i)
+ for (i = 1; i < rss->queue_num; ++i)
if (rss->queue[i] - rss->queue[i - 1] != 1)
break;
- if (i != rss->num) {
+ if (i != rss->queue_num) {
msg = "mlx4 requires RSS contexts to use"
" consecutive queue indices only";
goto exit_action_not_supported;
}
- if (rss->queue[0] % rss->num) {
+ if (rss->queue[0] % rss->queue_num) {
msg = "mlx4 requires the first queue of a RSS"
" context to be aligned on a multiple"
" of the context size";
goto exit_action_not_supported;
}
+ if (rss->func &&
+ rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+ msg = "the only supported RSS hash function"
+ " is Toeplitz";
+ goto exit_action_not_supported;
+ }
+ if (rss->level) {
+ msg = "a nonzero RSS encapsulation level is"
+ " not supported";
+ goto exit_action_not_supported;
+ }
+ rte_errno = 0;
+ fields = mlx4_conv_rss_types(priv, rss->types, 0);
+ if (fields == (uint64_t)-1 && rte_errno) {
+ msg = "unsupported RSS hash type requested";
+ goto exit_action_not_supported;
+ }
flow->rss = mlx4_rss_get
- (priv,
- mlx4_conv_rss_hf(priv, rss_conf->rss_hf),
- rss_conf->rss_key, rss->num, rss->queue);
+ (priv, fields, rss_key, rss->queue_num,
+ rss->queue);
if (!flow->rss) {
msg = "either invalid parameters or not enough"
" resources for additional multi-queue"
@@ -795,10 +859,9 @@ fill:
goto exit_action_not_supported;
}
}
- if (!flow->rss && !flow->drop)
- return rte_flow_error_set
- (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
- NULL, "no valid action");
+ /* When fate is unknown, drop traffic. */
+ if (!overlap)
+ flow->drop = 1;
/* Validation ends here. */
if (!addr) {
if (flow->rss)
@@ -820,11 +883,14 @@ fill:
},
};
- if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec)))
+ if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec))) {
+ if (temp.rss)
+ mlx4_rss_put(temp.rss);
return rte_flow_error_set
(error, -rte_errno,
RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
"flow rule handle allocation failure");
+ }
/* Most fields will be updated by second pass. */
*flow = (struct rte_flow){
.ibv_attr = temp.ibv_attr,
@@ -1264,14 +1330,20 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
*/
uint32_t queues =
rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
- alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
- [offsetof(struct rte_flow_action_rss, queue) +
- sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
- struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+ uint16_t queue[queues];
+ struct rte_flow_action_rss action_rss = {
+ .func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+ .level = 0,
+ .types = 0,
+ .key_len = MLX4_RSS_HASH_KEY_SIZE,
+ .queue_num = queues,
+ .key = mlx4_rss_hash_key_default,
+ .queue = queue,
+ };
struct rte_flow_action actions[] = {
{
.type = RTE_FLOW_ACTION_TYPE_RSS,
- .conf = rss_conf,
+ .conf = &action_rss,
},
{
.type = RTE_FLOW_ACTION_TYPE_END,
@@ -1293,12 +1365,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
if (!queues)
goto error;
/* Prepare default RSS configuration. */
- *rss_conf = (struct rte_flow_action_rss){
- .rss_conf = NULL, /* Rely on default fallback settings. */
- .num = queues,
- };
for (i = 0; i != queues; ++i)
- rss_conf->queue[i] = i;
+ queue[i] = i;
/*
* Set up VLAN item if filtering is enabled and at least one VLAN
* filter is configured.
@@ -1357,7 +1425,7 @@ next_vlan:
if (j != sizeof(mac->addr_bytes))
continue;
if (flow->rss->queues != queues ||
- memcmp(flow->rss->queue_id, rss_conf->queue,
+ memcmp(flow->rss->queue_id, action_rss.queue,
queues * sizeof(flow->rss->queue_id[0])))
continue;
break;
@@ -1397,7 +1465,7 @@ next_vlan:
if (flow && flow->internal) {
assert(flow->rss);
if (flow->rss->queues != queues ||
- memcmp(flow->rss->queue_id, rss_conf->queue,
+ memcmp(flow->rss->queue_id, action_rss.queue,
queues * sizeof(flow->rss->queue_id[0])))
flow = NULL;
}
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index 00188a65..2917ebe9 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
#ifndef RTE_PMD_MLX4_FLOW_H_
@@ -42,12 +42,14 @@ struct rte_flow {
uint32_t promisc:1; /**< This rule matches everything. */
uint32_t allmulti:1; /**< This rule matches all multicast traffic. */
uint32_t drop:1; /**< This rule drops packets. */
+ uint32_t priority; /**< Flow rule priority. */
struct mlx4_rss *rss; /**< Rx target. */
};
/* mlx4_flow.c */
-uint64_t mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf);
+uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t types,
+ int verbs_to_dpdk);
int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
void mlx4_flow_clean(struct priv *priv);
int mlx4_filter_ctrl(struct rte_eth_dev *dev,
diff --git a/drivers/net/mlx4/mlx4_glue.c b/drivers/net/mlx4/mlx4_glue.c
index 3b79d320..67b3bfac 100644
--- a/drivers/net/mlx4/mlx4_glue.c
+++ b/drivers/net/mlx4/mlx4_glue.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2018 6WIND S.A.
- * Copyright 2018 Mellanox
+ * Copyright 2018 Mellanox Technologies, Ltd
*/
#include <stddef.h>
diff --git a/drivers/net/mlx4/mlx4_glue.h b/drivers/net/mlx4/mlx4_glue.h
index 368f906b..668ca867 100644
--- a/drivers/net/mlx4/mlx4_glue.h
+++ b/drivers/net/mlx4/mlx4_glue.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2018 6WIND S.A.
- * Copyright 2018 Mellanox
+ * Copyright 2018 Mellanox Technologies, Ltd
*/
#ifndef MLX4_GLUE_H_
diff --git a/drivers/net/mlx4/mlx4_intr.c b/drivers/net/mlx4/mlx4_intr.c
index 2141992e..eeb982a0 100644
--- a/drivers/net/mlx4/mlx4_intr.c
+++ b/drivers/net/mlx4/mlx4_intr.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 9a1e4de3..d23d3c61 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -30,237 +30,1152 @@
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_mempool.h>
-#include <rte_spinlock.h>
+#include <rte_rwlock.h>
#include "mlx4_glue.h"
+#include "mlx4_mr.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
-struct mlx4_check_mempool_data {
+struct mr_find_contig_memsegs_data {
+ uintptr_t addr;
+ uintptr_t start;
+ uintptr_t end;
+ const struct rte_memseg_list *msl;
+};
+
+struct mr_update_mp_data {
+ struct rte_eth_dev *dev;
+ struct mlx4_mr_ctrl *mr_ctrl;
int ret;
- char *start;
- char *end;
};
/**
- * Called by mlx4_check_mempool() when iterating the memory chunks.
- *
- * @param[in] mp
- * Pointer to memory pool (unused).
- * @param[in, out] data
- * Pointer to shared buffer with mlx4_check_mempool().
- * @param[in] memhdr
- * Pointer to mempool chunk header.
- * @param mem_idx
- * Mempool element index (unused).
+ * Expand B-tree table to a given size. Can't be called while holding
+ * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ * @param n
+ * Number of entries for expansion.
+ *
+ * @return
+ * 0 on success, -1 on failure.
*/
-static void
-mlx4_check_mempool_cb(struct rte_mempool *mp, void *opaque,
- struct rte_mempool_memhdr *memhdr,
- unsigned int mem_idx)
+static int
+mr_btree_expand(struct mlx4_mr_btree *bt, int n)
+{
+ void *mem;
+ int ret = 0;
+
+ if (n <= bt->size)
+ return ret;
+ /*
+	 * The downside of directly using rte_realloc() is that SOCKET_ID_ANY
+	 * is used internally when there is no room to expand. Because this is
+	 * a rare case on a very slow path, it is acceptable.
+	 * Initially cache_bh[] is given practically enough space and, once it
+	 * is expanded, expansion shouldn't ever be needed again.
+ */
+ mem = rte_realloc(bt->table, n * sizeof(struct mlx4_mr_cache), 0);
+ if (mem == NULL) {
+ /* Not an error, B-tree search will be skipped. */
+ WARN("failed to expand MR B-tree (%p) table", (void *)bt);
+ ret = -1;
+ } else {
+ DEBUG("expanded MR B-tree table (size=%u)", n);
+ bt->table = mem;
+ bt->size = n;
+ }
+ return ret;
+}
+
+/**
+ * Look up LKey from given B-tree lookup table, store the last index and return
+ * searched LKey.
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ * @param[out] idx
+ * Pointer to index. Even on search failure, returns index where it stops
+ * searching so that index can be used when inserting a new entry.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mr_btree_lookup(struct mlx4_mr_btree *bt, uint16_t *idx, uintptr_t addr)
+{
+ struct mlx4_mr_cache *lkp_tbl;
+ uint16_t n;
+ uint16_t base = 0;
+
+ assert(bt != NULL);
+ lkp_tbl = *bt->table;
+ n = bt->len;
+ /* First entry must be NULL for comparison. */
+ assert(bt->len > 0 || (lkp_tbl[0].start == 0 &&
+ lkp_tbl[0].lkey == UINT32_MAX));
+ /* Binary search. */
+ do {
+ register uint16_t delta = n >> 1;
+
+ if (addr < lkp_tbl[base + delta].start) {
+ n = delta;
+ } else {
+ base += delta;
+ n -= delta;
+ }
+ } while (n > 1);
+ assert(addr >= lkp_tbl[base].start);
+ *idx = base;
+ if (addr < lkp_tbl[base].end)
+ return lkp_tbl[base].lkey;
+ /* Not found. */
+ return UINT32_MAX;
+}
+
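
For illustration only (not part of the patch): a minimal standalone sketch of the lookup scheme implemented by mr_btree_lookup() above. Entries are sorted by start address, entry 0 is a sentinel, and the binary search finds the last entry whose start is not above the address before checking its [start, end) range. Addresses and keys are made up.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct cache_entry {
	uintptr_t start; /* Chunk start address. */
	uintptr_t end;   /* Chunk end address (exclusive). */
	uint32_t lkey;   /* UINT32_MAX means "no match". */
};

static uint32_t
cache_lookup(const struct cache_entry *tbl, uint16_t len, uintptr_t addr)
{
	uint16_t n = len;
	uint16_t base = 0;

	/* Binary search for the last entry whose start <= addr. */
	do {
		uint16_t delta = n >> 1;

		if (addr < tbl[base + delta].start) {
			n = delta;
		} else {
			base += delta;
			n -= delta;
		}
	} while (n > 1);
	return addr < tbl[base].end ? tbl[base].lkey : UINT32_MAX;
}

int
main(void)
{
	/* Sentinel entry first, then two registered chunks. */
	const struct cache_entry tbl[] = {
		{ 0, 0, UINT32_MAX },
		{ 0x1000, 0x2000, 0x11 },
		{ 0x4000, 0x8000, 0x22 },
	};

	printf("0x%" PRIx32 "\n", cache_lookup(tbl, 3, 0x1800)); /* 0x11 */
	printf("0x%" PRIx32 "\n", cache_lookup(tbl, 3, 0x3000)); /* ffffffff */
	return 0;
}
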
+/**
+ * Insert an entry to B-tree lookup table.
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ * @param entry
+ * Pointer to new entry to insert.
+ *
+ * @return
+ * 0 on success, -1 on failure.
+ */
+static int
+mr_btree_insert(struct mlx4_mr_btree *bt, struct mlx4_mr_cache *entry)
{
- struct mlx4_check_mempool_data *data = opaque;
+ struct mlx4_mr_cache *lkp_tbl;
+ uint16_t idx = 0;
+ size_t shift;
- (void)mp;
- (void)mem_idx;
- /* It already failed, skip the next chunks. */
- if (data->ret != 0)
+ assert(bt != NULL);
+ assert(bt->len <= bt->size);
+ assert(bt->len > 0);
+ lkp_tbl = *bt->table;
+ /* Find out the slot for insertion. */
+ if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
+		DEBUG("abort insertion to B-tree(%p): already exists at"
+ " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+ (void *)bt, idx, entry->start, entry->end, entry->lkey);
+ /* Already exist, return. */
+ return 0;
+ }
+ /* If table is full, return error. */
+ if (unlikely(bt->len == bt->size)) {
+ bt->overflow = 1;
+ return -1;
+ }
+ /* Insert entry. */
+ ++idx;
+ shift = (bt->len - idx) * sizeof(struct mlx4_mr_cache);
+ if (shift)
+ memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
+ lkp_tbl[idx] = *entry;
+ bt->len++;
+ DEBUG("inserted B-tree(%p)[%u],"
+ " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+ (void *)bt, idx, entry->start, entry->end, entry->lkey);
+ return 0;
+}
+
+/**
+ * Initialize B-tree and allocate memory for lookup table.
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ * @param n
+ * Number of entries to allocate.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket)
+{
+ if (bt == NULL) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ memset(bt, 0, sizeof(*bt));
+ bt->table = rte_calloc_socket("B-tree table",
+ n, sizeof(struct mlx4_mr_cache),
+ 0, socket);
+ if (bt->table == NULL) {
+ rte_errno = ENOMEM;
+ ERROR("failed to allocate memory for btree cache on socket %d",
+ socket);
+ return -rte_errno;
+ }
+ bt->size = n;
+ /* First entry must be NULL for binary search. */
+ (*bt->table)[bt->len++] = (struct mlx4_mr_cache) {
+ .lkey = UINT32_MAX,
+ };
+ DEBUG("initialized B-tree %p with table %p",
+ (void *)bt, (void *)bt->table);
+ return 0;
+}
+
+/**
+ * Free B-tree resources.
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ */
+void
+mlx4_mr_btree_free(struct mlx4_mr_btree *bt)
+{
+ if (bt == NULL)
return;
- /* It is the first chunk. */
- if (data->start == NULL && data->end == NULL) {
- data->start = memhdr->addr;
- data->end = data->start + memhdr->len;
+ DEBUG("freeing B-tree %p with table %p", (void *)bt, (void *)bt->table);
+ rte_free(bt->table);
+ memset(bt, 0, sizeof(*bt));
+}
+
+#ifndef NDEBUG
+/**
+ * Dump all the entries in a B-tree
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ */
+void
+mlx4_mr_btree_dump(struct mlx4_mr_btree *bt)
+{
+ int idx;
+ struct mlx4_mr_cache *lkp_tbl;
+
+ if (bt == NULL)
return;
+ lkp_tbl = *bt->table;
+ for (idx = 0; idx < bt->len; ++idx) {
+ struct mlx4_mr_cache *entry = &lkp_tbl[idx];
+
+ DEBUG("B-tree(%p)[%u],"
+ " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+ (void *)bt, idx, entry->start, entry->end, entry->lkey);
}
- if (data->end == memhdr->addr) {
- data->end += memhdr->len;
- return;
+}
+#endif
+
+/**
+ * Find virtually contiguous memory chunk in a given MR.
+ *
+ * @param mr
+ * Pointer to MR structure.
+ * @param[out] entry
+ * Pointer to returning MR cache entry. If not found, this will not be
+ * updated.
+ * @param base_idx
+ * Start index of the memseg bitmap.
+ *
+ * @return
+ * Next index to go on lookup.
+ */
+static int
+mr_find_next_chunk(struct mlx4_mr *mr, struct mlx4_mr_cache *entry,
+ int base_idx)
+{
+ uintptr_t start = 0;
+ uintptr_t end = 0;
+ uint32_t idx = 0;
+
+ for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
+ if (rte_bitmap_get(mr->ms_bmp, idx)) {
+ const struct rte_memseg_list *msl;
+ const struct rte_memseg *ms;
+
+ msl = mr->msl;
+ ms = rte_fbarray_get(&msl->memseg_arr,
+ mr->ms_base_idx + idx);
+ assert(msl->page_sz == ms->hugepage_sz);
+ if (!start)
+ start = ms->addr_64;
+ end = ms->addr_64 + ms->hugepage_sz;
+ } else if (start) {
+ /* Passed the end of a fragment. */
+ break;
+ }
}
- if (data->start == (char *)memhdr->addr + memhdr->len) {
- data->start -= memhdr->len;
- return;
+ if (start) {
+ /* Found one chunk. */
+ entry->start = start;
+ entry->end = end;
+ entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
}
- /* Error, mempool is not virtually contiguous. */
- data->ret = -1;
+ return idx;
}
/**
- * Check if a mempool can be used: it must be virtually contiguous.
+ * Insert an MR into the global B-tree cache. It may fail when memory is low,
+ * in which case the entry will have to be found by mr_lookup_dev_list() in
+ * mlx4_mr_create() on a cache miss.
*
- * @param[in] mp
- * Pointer to memory pool.
- * @param[out] start
- * Pointer to the start address of the mempool virtual memory area.
- * @param[out] end
- * Pointer to the end address of the mempool virtual memory area.
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr
+ * Pointer to MR to insert.
*
* @return
- * 0 on success (mempool is virtually contiguous), -1 on error.
+ * 0 on success, -1 on failure.
*/
static int
-mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, uintptr_t *end)
+mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx4_mr *mr)
{
- struct mlx4_check_mempool_data data;
+ struct priv *priv = dev->data->dev_private;
+ unsigned int n;
- memset(&data, 0, sizeof(data));
- rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &data);
- *start = (uintptr_t)data.start;
- *end = (uintptr_t)data.end;
- return data.ret;
+ DEBUG("port %u inserting MR(%p) to global cache",
+ dev->data->port_id, (void *)mr);
+ for (n = 0; n < mr->ms_bmp_n; ) {
+ struct mlx4_mr_cache entry = { 0, };
+
+ /* Find a contiguous chunk and advance the index. */
+ n = mr_find_next_chunk(mr, &entry, n);
+ if (!entry.end)
+ break;
+ if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
+ /*
+ * Overflowed, but the global table cannot be expanded
+ * because of deadlock.
+ */
+ return -1;
+ }
+ }
+ return 0;
}
/**
- * Obtain a memory region from a memory pool.
+ * Look up address in the original global MR list.
*
- * If a matching memory region already exists, it is returned with its
- * reference count incremented, otherwise a new one is registered.
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] entry
+ * Pointer to returning MR cache entry. If no match, this will not be updated.
+ * @param addr
+ * Search key.
*
- * @param priv
- * Pointer to private structure.
- * @param mp
- * Pointer to memory pool.
+ * @return
+ * Found MR on match, NULL otherwise.
+ */
+static struct mlx4_mr *
+mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+ uintptr_t addr)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr *mr;
+
+ /* Iterate all the existing MRs. */
+ LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+ unsigned int n;
+
+ if (mr->ms_n == 0)
+ continue;
+ for (n = 0; n < mr->ms_bmp_n; ) {
+ struct mlx4_mr_cache ret = { 0, };
+
+ n = mr_find_next_chunk(mr, &ret, n);
+ if (addr >= ret.start && addr < ret.end) {
+ /* Found. */
+ *entry = ret;
+ return mr;
+ }
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Look up address on device.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] entry
+ * Pointer to returning MR cache entry. If no match, this will not be updated.
+ * @param addr
+ * Search key.
*
* @return
- * Memory region pointer, NULL in case of error and rte_errno is set.
+ * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
*/
-struct mlx4_mr *
-mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
+static uint32_t
+mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+ uintptr_t addr)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- uintptr_t start;
- uintptr_t end;
- unsigned int i;
+ struct priv *priv = dev->data->dev_private;
+ uint16_t idx;
+ uint32_t lkey = UINT32_MAX;
struct mlx4_mr *mr;
- if (mlx4_check_mempool(mp, &start, &end) != 0) {
- rte_errno = EINVAL;
- ERROR("mempool %p: not virtually contiguous",
- (void *)mp);
- return NULL;
+ /*
+ * If the global cache has overflowed since it failed to expand the
+ * B-tree table, it can't have all the existing MRs. Then, the address
+	 * has to be searched for by traversing the original MR list instead,
+	 * which is a very slow path. Otherwise, the global cache is all-inclusive.
+ */
+ if (!unlikely(priv->mr.cache.overflow)) {
+ lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+ if (lkey != UINT32_MAX)
+ *entry = (*priv->mr.cache.table)[idx];
+ } else {
+ /* Falling back to the slowest path. */
+ mr = mr_lookup_dev_list(dev, entry, addr);
+ if (mr != NULL)
+ lkey = entry->lkey;
}
- DEBUG("mempool %p area start=%p end=%p size=%zu",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- /* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
-
- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
+ assert(lkey == UINT32_MAX || (addr >= entry->start &&
+ addr < entry->end));
+ return lkey;
+}
+
+/**
+ * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free()
+ * can raise memory free event and the callback function will spin on the lock.
+ *
+ * @param mr
+ * Pointer to MR to free.
+ */
+static void
+mr_free(struct mlx4_mr *mr)
+{
+ if (mr == NULL)
+ return;
+ DEBUG("freeing MR(%p):", (void *)mr);
+ if (mr->ibv_mr != NULL)
+ claim_zero(mlx4_glue->dereg_mr(mr->ibv_mr));
+ if (mr->ms_bmp != NULL)
+ rte_bitmap_free(mr->ms_bmp);
+ rte_free(mr);
+}
+
+/**
+ * Release resources of detached MRs having no online entry.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr *mr_next;
+ struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
+
+ /*
+ * An MR can't be freed while holding the lock because rte_free() could
+ * invoke the memory free callback function, which would deadlock.
+ */
+ rte_rwlock_write_lock(&priv->mr.rwlock);
+ /* Detach the whole free list and release it after unlocking. */
+ free_list = priv->mr.mr_free_list;
+ LIST_INIT(&priv->mr.mr_free_list);
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+ /* Release resources. */
+ mr_next = LIST_FIRST(&free_list);
+ while (mr_next != NULL) {
+ struct mlx4_mr *mr = mr_next;
+
+ mr_next = LIST_NEXT(mr, mr);
+ mr_free(mr);
}
- DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- rte_spinlock_lock(&priv->mr_lock);
- LIST_FOREACH(mr, &priv->mr, next)
- if (mp == mr->mp && start >= mr->start && end <= mr->end)
- break;
- if (mr) {
- ++mr->refcnt;
- goto release;
+}
+
+/* Called during rte_memseg_contig_walk() by mlx4_mr_create(). */
+static int
+mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg)
+{
+ struct mr_find_contig_memsegs_data *data = arg;
+
+ if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
+ return 0;
+ /* Found, save it and stop walking. */
+ data->start = ms->addr_64;
+ data->end = ms->addr_64 + len;
+ data->msl = msl;
+ return 1;
+}
+
+/**
+ * Create a new global Memory Region (MR) for a missing virtual address.
+ * Register entire virtually contiguous memory chunk around the address.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] entry
+ * Pointer to returning MR cache entry, found in the global cache or newly
+ * created. If failed to create one, this will not be updated.
+ * @param addr
+ * Target virtual address to register.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
+ */
+static uint32_t
+mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+ uintptr_t addr)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ const struct rte_memseg_list *msl;
+ const struct rte_memseg *ms;
+ struct mlx4_mr *mr = NULL;
+ size_t len;
+ uint32_t ms_n;
+ uint32_t bmp_size;
+ void *bmp_mem;
+ int ms_idx_shift = -1;
+ unsigned int n;
+ struct mr_find_contig_memsegs_data data = {
+ .addr = addr,
+ };
+ struct mr_find_contig_memsegs_data data_re;
+
+ DEBUG("port %u creating a MR using address (%p)",
+ dev->data->port_id, (void *)addr);
+ /*
+ * Release detached MRs if any. This can't be called while holding
+ * either memory_hotplug_lock or priv->mr.rwlock. MRs on the free list
+ * have been detached by the memory free event but couldn't be released
+ * inside the callback because of the deadlock risk. As a result,
+ * releasing resources is quite opportunistic.
+ */
+ mlx4_mr_garbage_collect(dev);
+ /*
+ * Find out the contiguous virtual address chunk in use to which the
+ * given address belongs, in order to register the maximum range. In
+ * the best case, where mempools are not dynamically recreated and
+ * '--socket-mem' is specified as an EAL option, it is very likely to
+ * end up with only one MR (LKey) per socket and per hugepage size even
+ * though the system memory is highly fragmented.
+ */
+ if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
+ WARN("port %u unable to find virtually contiguous"
+ " chunk for address (%p)."
+ " rte_memseg_contig_walk() failed.",
+ dev->data->port_id, (void *)addr);
+ rte_errno = ENXIO;
+ goto err_nolock;
}
- mr = rte_malloc(__func__, sizeof(*mr), 0);
- if (!mr) {
+alloc_resources:
+ /* Addresses must be page-aligned. */
+ assert(rte_is_aligned((void *)data.start, data.msl->page_sz));
+ assert(rte_is_aligned((void *)data.end, data.msl->page_sz));
+ msl = data.msl;
+ ms = rte_mem_virt2memseg((void *)data.start, msl);
+ len = data.end - data.start;
+ assert(msl->page_sz == ms->hugepage_sz);
+ /* Number of memsegs in the range. */
+ ms_n = len / msl->page_sz;
+ DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
+ " page_sz=0x%" PRIx64 ", ms_n=%u",
+ dev->data->port_id, (void *)addr,
+ data.start, data.end, msl->page_sz, ms_n);
+ /* Size of memory for bitmap. */
+ bmp_size = rte_bitmap_get_memory_footprint(ms_n);
+ mr = rte_zmalloc_socket(NULL,
+ RTE_ALIGN_CEIL(sizeof(*mr),
+ RTE_CACHE_LINE_SIZE) +
+ bmp_size,
+ RTE_CACHE_LINE_SIZE, msl->socket_id);
+ if (mr == NULL) {
+ WARN("port %u unable to allocate memory for a new MR of"
+ " address (%p).",
+ dev->data->port_id, (void *)addr);
rte_errno = ENOMEM;
- goto release;
+ goto err_nolock;
}
- *mr = (struct mlx4_mr){
- .start = start,
- .end = end,
- .refcnt = 1,
- .priv = priv,
- .mr = mlx4_glue->reg_mr(priv->pd, (void *)start, end - start,
- IBV_ACCESS_LOCAL_WRITE),
- .mp = mp,
- };
- if (mr->mr) {
- mr->lkey = mr->mr->lkey;
- LIST_INSERT_HEAD(&priv->mr, mr, next);
- } else {
- rte_free(mr);
- mr = NULL;
- rte_errno = errno ? errno : EINVAL;
+ mr->msl = msl;
+ /*
+ * Save the index of the first memseg and initialize memseg bitmap. To
+ * see if a memseg of ms_idx in the memseg-list is still valid, check:
+ * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx)
+ */
+ mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
+ mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
+ if (mr->ms_bmp == NULL) {
+ WARN("port %u unable to initialize bitamp for a new MR of"
+ " address (%p).",
+ dev->data->port_id, (void *)addr);
+ rte_errno = EINVAL;
+ goto err_nolock;
+ }
+ /*
+ * Recheck whether the extended contiguous chunk is still valid.
+ * Because memory_hotplug_lock can't be held when there are
+ * memory-related calls in a critical path, the resource allocation
+ * above couldn't be done under the lock. If the memory has changed at
+ * this point, try again with just a single page. If not, go on with
+ * the big chunk atomically from here.
+ */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ data_re = data;
+ if (len > msl->page_sz &&
+ !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
+ WARN("port %u unable to find virtually contiguous"
+ " chunk for address (%p)."
+ " rte_memseg_contig_walk() failed.",
+ dev->data->port_id, (void *)addr);
+ rte_errno = ENXIO;
+ goto err_memlock;
+ }
+ if (data.start != data_re.start || data.end != data_re.end) {
+ /*
+ * The extended contiguous chunk has changed. Try again
+ * with a single memseg instead.
+ */
+ data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
+ data.end = data.start + msl->page_sz;
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ mr_free(mr);
+ goto alloc_resources;
}
-release:
- rte_spinlock_unlock(&priv->mr_lock);
- return mr;
+ assert(data.msl == data_re.msl);
+ rte_rwlock_write_lock(&priv->mr.rwlock);
+ /*
+ * Check whether the address is really missing. If another thread
+ * already created one, or the earlier miss was only due to the cache
+ * overflow, abort and return.
+ */
+ if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) {
+ /*
+ * Insert it into the global cache table. It may fail when
+ * memory is low; in that case this entry will simply have to
+ * be searched here again.
+ */
+ mr_btree_insert(&priv->mr.cache, entry);
+ DEBUG("port %u found MR for %p on final lookup, abort",
+ dev->data->port_id, (void *)addr);
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ /*
+ * Must be unlocked before calling rte_free() because
+ * mlx4_mr_mem_event_free_cb() can be called inside.
+ */
+ mr_free(mr);
+ return entry->lkey;
+ }
+ /*
+ * Trim start and end addresses for the verbs MR. Set bits for the
+ * memsegs to register but exclude already registered ones. The bitmap
+ * can be fragmented.
+ */
+ for (n = 0; n < ms_n; ++n) {
+ uintptr_t start;
+ struct mlx4_mr_cache ret = { 0, };
+
+ start = data_re.start + n * msl->page_sz;
+ /* Exclude memsegs already registered by other MRs. */
+ if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) {
+ /*
+ * Start from the first unregistered memseg in the
+ * extended range.
+ */
+ if (ms_idx_shift == -1) {
+ mr->ms_base_idx += n;
+ data.start = start;
+ ms_idx_shift = n;
+ }
+ data.end = start + msl->page_sz;
+ rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
+ ++mr->ms_n;
+ }
+ }
+ len = data.end - data.start;
+ mr->ms_bmp_n = len / msl->page_sz;
+ assert(ms_idx_shift + mr->ms_bmp_n <= ms_n);
+ /*
+ * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be
+ * called while holding the memory lock because it doesn't use
+ * mlx4_alloc_buf_extern(), which eventually calls rte_malloc_socket()
+ * through mlx4_alloc_verbs_buf().
+ */
+ mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)data.start, len,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (mr->ibv_mr == NULL) {
+ WARN("port %u fail to create a verbs MR for address (%p)",
+ dev->data->port_id, (void *)addr);
+ rte_errno = EINVAL;
+ goto err_mrlock;
+ }
+ assert((uintptr_t)mr->ibv_mr->addr == data.start);
+ assert(mr->ibv_mr->length == len);
+ LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+ DEBUG("port %u MR CREATED (%p) for %p:\n"
+ " [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
+ " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
+ dev->data->port_id, (void *)mr, (void *)addr,
+ data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey),
+ mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
+ /* Insert to the global cache table. */
+ mr_insert_dev_cache(dev, mr);
+ /* Fill in output data. */
+ mr_lookup_dev(dev, entry, addr);
+ /* Lookup can't fail. */
+ assert(entry->lkey != UINT32_MAX);
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return entry->lkey;
+err_mrlock:
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+err_memlock:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+err_nolock:
+ /*
+ * In case of error, as this can be called from a datapath, a warning
+ * message per error is preferable instead. The lock must be released
+ * before calling rte_free() because mlx4_mr_mem_event_free_cb() can be
+ * called inside.
+ */
+ mr_free(mr);
+ return UINT32_MAX;
}
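As a worked illustration with assumed numbers: suppose the contiguous walk returns a chunk of six 2 MiB pages starting at memseg index 40 and the first two pages are already covered by another MR. The trimming loop above skips n = 0 and 1, bumps ms_base_idx to 42 and moves data.start up by two pages at n = 2, then sets bitmap bits 0..3 for n = 2..5, ending with ms_n = 4, ms_bmp_n = 4 and a verbs MR that registers only the remaining 8 MiB.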
/**
- * Release a memory region.
+ * Rebuild the global B-tree cache of device from the original MR list.
*
- * This function decrements its reference count and destroys it after
- * reaching 0.
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mr_rebuild_dev_cache(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr *mr;
+
+ DEBUG("port %u rebuild dev cache[]", dev->data->port_id);
+ /* Flush cache to rebuild. */
+ priv->mr.cache.len = 1;
+ priv->mr.cache.overflow = 0;
+ /* Iterate all the existing MRs. */
+ LIST_FOREACH(mr, &priv->mr.mr_list, mr)
+ if (mr_insert_dev_cache(dev, mr) < 0)
+ return;
+}
+
+/**
+ * Callback for memory free events. Iterate the freed memsegs and check whether
+ * each belongs to an existing MR. If so, clear the corresponding bit in the
+ * MR's bitmap. As a result, the MR may become fragmented. If it becomes empty,
+ * the MR will be freed later by mlx4_mr_garbage_collect().
*
- * Note to avoid race conditions given this function may be used from the
- * data plane, it's extremely important that each user holds its own
- * reference.
+ * The global cache must be rebuilt if there's any change and this event has to
+ * be propagated to dataplane threads to flush the local caches.
*
- * @param mr
- * Memory region to release.
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param addr
+ * Address of freed memory.
+ * @param len
+ * Size of freed memory.
+ */
+static void
+mlx4_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
+{
+ struct priv *priv = dev->data->dev_private;
+ const struct rte_memseg_list *msl;
+ struct mlx4_mr *mr;
+ int ms_n;
+ int i;
+ int rebuild = 0;
+
+ DEBUG("port %u free callback: addr=%p, len=%zu",
+ dev->data->port_id, addr, len);
+ msl = rte_mem_virt2memseg_list(addr);
+ /* addr and len must be page-aligned. */
+ assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
+ assert(len == RTE_ALIGN(len, msl->page_sz));
+ ms_n = len / msl->page_sz;
+ rte_rwlock_write_lock(&priv->mr.rwlock);
+ /* Clear bits of freed memsegs from MR. */
+ for (i = 0; i < ms_n; ++i) {
+ const struct rte_memseg *ms;
+ struct mlx4_mr_cache entry;
+ uintptr_t start;
+ int ms_idx;
+ uint32_t pos;
+
+ /* Find MR having this memseg. */
+ start = (uintptr_t)addr + i * msl->page_sz;
+ mr = mr_lookup_dev_list(dev, &entry, start);
+ if (mr == NULL)
+ continue;
+ ms = rte_mem_virt2memseg((void *)start, msl);
+ assert(ms != NULL);
+ assert(msl->page_sz == ms->hugepage_sz);
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ pos = ms_idx - mr->ms_base_idx;
+ assert(rte_bitmap_get(mr->ms_bmp, pos));
+ assert(pos < mr->ms_bmp_n);
+ DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p",
+ dev->data->port_id, (void *)mr, pos, (void *)start);
+ rte_bitmap_clear(mr->ms_bmp, pos);
+ if (--mr->ms_n == 0) {
+ LIST_REMOVE(mr, mr);
+ LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+ DEBUG("port %u remove MR(%p) from list",
+ dev->data->port_id, (void *)mr);
+ }
+ /*
+ * The MR is fragmented or will be freed. The global cache must
+ * be rebuilt.
+ */
+ rebuild = 1;
+ }
+ if (rebuild) {
+ mr_rebuild_dev_cache(dev);
+ /*
+ * Flush local caches by propagating invalidation across cores.
+ * rte_smp_wmb() is enough to synchronize this event. If one of
+ * the freed memsegs is seen by another core, it means the
+ * memseg has been reallocated by the allocator, which can only
+ * happen after this free call. Therefore, this store
+ * (incrementing the generation below) is guaranteed to be seen
+ * by the other core before it sees the newly allocated memory.
+ */
+ ++priv->mr.dev_gen;
+ DEBUG("broadcasting local cache flush, gen=%d",
+ priv->mr.dev_gen);
+ rte_smp_wmb();
+ }
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+#ifndef NDEBUG
+ if (rebuild)
+ mlx4_mr_dump_dev(dev);
+#endif
+}
+
+/**
+ * Callback for memory event.
+ *
+ * @param event_type
+ * Memory event type.
+ * @param addr
+ * Address of memory.
+ * @param len
+ * Size of memory.
*/
void
-mlx4_mr_put(struct mlx4_mr *mr)
+mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
+ size_t len, void *arg __rte_unused)
{
- struct priv *priv = mr->priv;
-
- rte_spinlock_lock(&priv->mr_lock);
- assert(mr->refcnt);
- if (--mr->refcnt)
- goto release;
- LIST_REMOVE(mr, next);
- claim_zero(mlx4_glue->dereg_mr(mr->mr));
- rte_free(mr);
-release:
- rte_spinlock_unlock(&priv->mr_lock);
+ struct priv *priv;
+
+ switch (event_type) {
+ case RTE_MEM_EVENT_FREE:
+ rte_rwlock_read_lock(&mlx4_mem_event_rwlock);
+ /* Iterate all the existing mlx4 devices. */
+ LIST_FOREACH(priv, &mlx4_mem_event_cb_list, mem_event_cb)
+ mlx4_mr_mem_event_free_cb(priv->dev, addr, len);
+ rte_rwlock_read_unlock(&mlx4_mem_event_rwlock);
+ break;
+ case RTE_MEM_EVENT_ALLOC:
+ default:
+ break;
+ }
}
/**
- * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
- * If mp2mr[] is full, remove an entry first.
+ * Look up address in the global MR cache table. If not found, create a new MR.
+ * Insert the found/created entry into the local bottom-half cache table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ * @param[out] entry
+ * Pointer to returning MR cache entry, found in the global cache or newly
+ * created. If creation fails, this is not written.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mlx4_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+ struct mlx4_mr_cache *entry, uintptr_t addr)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr_btree *bt = &mr_ctrl->cache_bh;
+ uint16_t idx;
+ uint32_t lkey;
+
+ /* If local cache table is full, try to double it. */
+ if (unlikely(bt->len == bt->size))
+ mr_btree_expand(bt, bt->size << 1);
+ /* Look up in the global cache. */
+ rte_rwlock_read_lock(&priv->mr.rwlock);
+ lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+ if (lkey != UINT32_MAX) {
+ /* Found. */
+ *entry = (*priv->mr.cache.table)[idx];
+ rte_rwlock_read_unlock(&priv->mr.rwlock);
+ /*
+ * Update local cache. Even if it fails, return the found entry
+ * to update top-half cache. Next time, this entry will be found
+ * in the global cache.
+ */
+ mr_btree_insert(bt, entry);
+ return lkey;
+ }
+ rte_rwlock_read_unlock(&priv->mr.rwlock);
+ /* First time to see the address? Create a new MR. */
+ lkey = mlx4_mr_create(dev, entry, addr);
+ /*
+ * Update the local cache if a new global MR was successfully created.
+ * If creation failed, there's no action to take in this datapath code:
+ * the returned LKey is invalid and will eventually make the HW fail.
+ */
+ if (lkey != UINT32_MAX)
+ mr_btree_insert(bt, entry);
+ return lkey;
+}
+
+/**
+ * Bottom-half of the LKey search on the datapath. First search in cache_bh[];
+ * on a miss, search the global MR cache table and add the new entry to the
+ * per-queue local caches.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mlx4_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+ uintptr_t addr)
+{
+ uint32_t lkey;
+ uint16_t bh_idx = 0;
+ /* Victim in top-half cache to replace with new entry. */
+ struct mlx4_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head];
+
+ /* Binary-search MR translation table. */
+ lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
+ /* Update top-half cache. */
+ if (likely(lkey != UINT32_MAX)) {
+ *repl = (*mr_ctrl->cache_bh.table)[bh_idx];
+ } else {
+ /*
+ * If the local lookup table misses, search the global cache;
+ * the local cache_bh[] will be updated inside if possible, and
+ * the top-half cache entry as well.
+ */
+ lkey = mlx4_mr_lookup_dev(dev, mr_ctrl, repl, addr);
+ if (unlikely(lkey == UINT32_MAX))
+ return UINT32_MAX;
+ }
+ /* Update the most recently used entry. */
+ mr_ctrl->mru = mr_ctrl->head;
+ /* Point to the next victim, the oldest. */
+ mr_ctrl->head = (mr_ctrl->head + 1) % MLX4_MR_CACHE_N;
+ return lkey;
+}
+
+/**
+ * Bottom-half of LKey search on Rx.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr)
+{
+ struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
+ struct priv *priv = rxq->priv;
+
+ DEBUG("Rx queue %u: miss on top-half, mru=%u, head=%u, addr=%p",
+ rxq->stats.idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr);
+ return mlx4_mr_addr2mr_bh(priv->dev, mr_ctrl, addr);
+}
+
+/**
+ * Bottom-half of LKey search on Tx.
*
* @param txq
* Pointer to Tx queue structure.
- * @param[in] mp
- * Memory pool for which a memory region lkey must be added.
- * @param[in] i
- * Index in memory pool (MP) where to add memory region (MR).
+ * @param addr
+ * Search key.
*
* @return
- * Added mr->lkey on success, (uint32_t)-1 on failure.
+ * Searched LKey on success, UINT32_MAX on no match.
*/
uint32_t
-mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
+mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
{
+ struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ struct priv *priv = txq->priv;
+
+ DEBUG("Tx queue %u: miss on top-half, mru=%u, head=%u, addr=%p",
+ txq->stats.idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr);
+ return mlx4_mr_addr2mr_bh(priv->dev, mr_ctrl, addr);
+}
+
+/**
+ * Flush all of the local cache entries.
+ *
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ */
+void
+mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl)
+{
+ /* Reset the most-recently-used index. */
+ mr_ctrl->mru = 0;
+ /* Reset the linear search array. */
+ mr_ctrl->head = 0;
+ memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
+ /* Reset the B-tree table. */
+ mr_ctrl->cache_bh.len = 1;
+ mr_ctrl->cache_bh.overflow = 0;
+ /* Update the generation number. */
+ mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
+ DEBUG("mr_ctrl(%p): flushed, cur_gen=%d",
+ (void *)mr_ctrl, mr_ctrl->cur_gen);
+}
+
+/* Called during rte_mempool_mem_iter() by mlx4_mr_update_mp(). */
+static void
+mlx4_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque,
+ struct rte_mempool_memhdr *memhdr,
+ unsigned mem_idx __rte_unused)
+{
+ struct mr_update_mp_data *data = opaque;
+ uint32_t lkey;
+
+ /* Stop iterating if a previous chunk already failed. */
+ if (data->ret < 0)
+ return;
+ /* Register address of the chunk and update local caches. */
+ lkey = mlx4_mr_addr2mr_bh(data->dev, data->mr_ctrl,
+ (uintptr_t)memhdr->addr);
+ if (lkey == UINT32_MAX)
+ data->ret = -1;
+}
+
+/**
+ * Register all the memory chunks of a Mempool.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ * @param mp
+ * Pointer to the Mempool to register.
+ *
+ * @return
+ * 0 on success, -1 on failure.
+ */
+int
+mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+ struct rte_mempool *mp)
+{
+ struct mr_update_mp_data data = {
+ .dev = dev,
+ .mr_ctrl = mr_ctrl,
+ .ret = 0,
+ };
+
+ rte_mempool_mem_iter(mp, mlx4_mr_update_mp_cb, &data);
+ return data.ret;
+}
+
+#ifndef NDEBUG
+/**
+ * Dump all the created MRs and the global cache entries.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx4_mr_dump_dev(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
struct mlx4_mr *mr;
+ int mr_n = 0;
+ int chunk_n = 0;
+
+ rte_rwlock_read_lock(&priv->mr.rwlock);
+ /* Iterate all the existing MRs. */
+ LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+ unsigned int n;
+
+ DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
+ dev->data->port_id, mr_n++,
+ rte_cpu_to_be_32(mr->ibv_mr->lkey),
+ mr->ms_n, mr->ms_bmp_n);
+ if (mr->ms_n == 0)
+ continue;
+ for (n = 0; n < mr->ms_bmp_n; ) {
+ struct mlx4_mr_cache ret = { 0, };
- /* Add a new entry, register MR first. */
- DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx4_mr_get(txq->priv, mp);
- if (unlikely(mr == NULL)) {
- DEBUG("%p: unable to configure MR, mlx4_mr_get() failed",
- (void *)txq);
- return (uint32_t)-1;
+ n = mr_find_next_chunk(mr, &ret, n);
+ if (!ret.end)
+ break;
+ DEBUG(" chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
+ chunk_n++, ret.start, ret.end);
+ }
}
- if (unlikely(i == RTE_DIM(txq->mp2mr))) {
- /* Table is full, remove oldest entry. */
- DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
- --i;
- mlx4_mr_put(txq->mp2mr[0].mr);
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ DEBUG("port %u dumping global cache", dev->data->port_id);
+ mlx4_mr_btree_dump(&priv->mr.cache);
+ rte_rwlock_read_unlock(&priv->mr.rwlock);
+}
+#endif
+
+/**
+ * Release all the created MRs and resources. Remove the device from the
+ * memory callback list.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx4_mr_release(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr *mr_next = LIST_FIRST(&priv->mr.mr_list);
+
+ /* Remove from memory callback device list. */
+ rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+ LIST_REMOVE(priv, mem_event_cb);
+ rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+#ifndef NDEBUG
+ mlx4_mr_dump_dev(dev);
+#endif
+ rte_rwlock_write_lock(&priv->mr.rwlock);
+ /* Detach from MR list and move to free list. */
+ while (mr_next != NULL) {
+ struct mlx4_mr *mr = mr_next;
+
+ mr_next = LIST_NEXT(mr, mr);
+ LIST_REMOVE(mr, mr);
+ LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
}
- /* Store the new entry. */
- txq->mp2mr[i].mp = mp;
- txq->mp2mr[i].mr = mr;
- txq->mp2mr[i].lkey = mr->lkey;
- DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
+ LIST_INIT(&priv->mr.mr_list);
+ /* Free global cache. */
+ mlx4_mr_btree_free(&priv->mr.cache);
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+ /* Free all remaining MRs. */
+ mlx4_mr_garbage_collect(dev);
}
diff --git a/drivers/net/mlx4/mlx4_mr.h b/drivers/net/mlx4/mlx4_mr.h
new file mode 100644
index 00000000..37a365a8
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_mr.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX4_MR_H_
+#define RTE_PMD_MLX4_MR_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_eal_memconfig.h>
+#include <rte_ethdev.h>
+#include <rte_rwlock.h>
+#include <rte_bitmap.h>
+
+/* Size of per-queue MR cache array for linear search. */
+#define MLX4_MR_CACHE_N 8
+
+/* Size of MR cache table for binary search. */
+#define MLX4_MR_BTREE_CACHE_N 256
+
+/* Memory Region object. */
+struct mlx4_mr {
+ LIST_ENTRY(mlx4_mr) mr; /**< Pointer to the prev/next entry. */
+ struct ibv_mr *ibv_mr; /* Verbs Memory Region. */
+ const struct rte_memseg_list *msl;
+ int ms_base_idx; /* Start index of msl->memseg_arr[]. */
+ int ms_n; /* Number of memsegs in use. */
+ uint32_t ms_bmp_n; /* Number of bits in memsegs bit-mask. */
+ struct rte_bitmap *ms_bmp; /* Bit-mask of memsegs belonging to the MR. */
+};
+
+/* Cache entry for Memory Region. */
+struct mlx4_mr_cache {
+ uintptr_t start; /* Start address of MR. */
+ uintptr_t end; /* End address of MR. */
+ uint32_t lkey; /* rte_cpu_to_be_32(ibv_mr->lkey). */
+} __rte_packed;
+
+/* MR Cache table for Binary search. */
+struct mlx4_mr_btree {
+ uint16_t len; /* Number of entries. */
+ uint16_t size; /* Total number of entries. */
+ int overflow; /* Mark failure of table expansion. */
+ struct mlx4_mr_cache (*table)[];
+} __rte_packed;
+
+/* Per-queue MR control descriptor. */
+struct mlx4_mr_ctrl {
+ uint32_t *dev_gen_ptr; /* Generation number of device to poll. */
+ uint32_t cur_gen; /* Generation number saved to flush caches. */
+ uint16_t mru; /* Index of last hit entry in top-half cache. */
+ uint16_t head; /* Index of the oldest entry in top-half cache. */
+ struct mlx4_mr_cache cache[MLX4_MR_CACHE_N]; /* Cache for top-half. */
+ struct mlx4_mr_btree cache_bh; /* Cache for bottom-half. */
+} __rte_packed;
+
+extern struct mlx4_dev_list mlx4_mem_event_cb_list;
+extern rte_rwlock_t mlx4_mem_event_rwlock;
+
+/* First entry must be NULL for comparison. */
+#define mlx4_mr_btree_len(bt) ((bt)->len - 1)
+
+int mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket);
+void mlx4_mr_btree_free(struct mlx4_mr_btree *bt);
+void mlx4_mr_btree_dump(struct mlx4_mr_btree *bt);
+void mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
+ size_t len, void *arg);
+int mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+ struct rte_mempool *mp);
+void mlx4_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx4_mr_release(struct rte_eth_dev *dev);
+
+/**
+ * Look up the LKey in the given lookup table by linear search. First check the
+ * last-hit entry. On a miss, the entire array is searched. If found, update the
+ * last-hit index and return the LKey.
+ *
+ * @param lkp_tbl
+ * Pointer to lookup table.
+ * @param[in,out] cached_idx
+ * Pointer to last-hit index.
+ * @param n
+ * Size of lookup table.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx4_mr_lookup_cache(struct mlx4_mr_cache *lkp_tbl, uint16_t *cached_idx,
+ uint16_t n, uintptr_t addr)
+{
+ uint16_t idx;
+
+ if (likely(addr >= lkp_tbl[*cached_idx].start &&
+ addr < lkp_tbl[*cached_idx].end))
+ return lkp_tbl[*cached_idx].lkey;
+ for (idx = 0; idx < n && lkp_tbl[idx].start != 0; ++idx) {
+ if (addr >= lkp_tbl[idx].start &&
+ addr < lkp_tbl[idx].end) {
+ /* Found. */
+ *cached_idx = idx;
+ return lkp_tbl[idx].lkey;
+ }
+ }
+ return UINT32_MAX;
+}
+
+#endif /* RTE_PMD_MLX4_MR_H_ */
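Taken together, the helpers above give the datapath a three-level lookup: the per-queue linear cache, the per-queue bottom-half B-tree and the global cache, all invalidated through the device generation counter. A minimal sketch of how a Tx routine might combine them, assuming the txq layout used elsewhere in this patch and that the non-static helpers from mlx4_mr.c are visible (the helper name is hypothetical; the driver's actual inline lives in mlx4_rxtx.h):

	static __rte_always_inline uint32_t
	mlx4_tx_addr2mr_sketch(struct txq *txq, uintptr_t addr)
	{
		struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
		uint32_t lkey;

		/* Flush stale entries if the device generation has moved on. */
		if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
			mlx4_mr_flush_local_cache(mr_ctrl);
		/* Top-half: linear search over the most recent hits. */
		lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru,
					    MLX4_MR_CACHE_N, addr);
		if (likely(lkey != UINT32_MAX))
			return lkey;
		/* Bottom-half: B-tree, then global cache, then MR creation. */
		return mlx4_tx_addr2mr_bh(txq, addr);
	}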
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index 153dda52..aef77ba0 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
#ifndef MLX4_PRM_H_
@@ -19,6 +19,7 @@
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
+#include "mlx4_autoconf.h"
/* ConnectX-3 Tx queue basic block. */
#define MLX4_TXBB_SHIFT 6
@@ -40,6 +41,7 @@
/* Work queue element (WQE) flags. */
#define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
#define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
/* CQE checksum flags. */
enum {
@@ -51,6 +53,7 @@ enum {
};
/* CQE status flags. */
+#define MLX4_CQE_STATUS_IPV6F (1 << 12)
#define MLX4_CQE_STATUS_IPV4 (1 << 22)
#define MLX4_CQE_STATUS_IPV4F (1 << 23)
#define MLX4_CQE_STATUS_IPV6 (1 << 24)
@@ -97,6 +100,19 @@ struct mlx4_cq {
int arm_sn; /**< Rx event counter. */
};
+#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg {
+ rte_be32_t mss_hdr_size;
+ rte_be32_t header[];
+};
+#endif
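The Tx path later in this patch fills mss_hdr_size by packing the MSS into the upper 16 bits and the header length into the lower 16 bits, converted to big-endian; for example (values assumed for illustration only):

	/* MSS of 1448 bytes and a 54-byte L2+L3+L4 header (assumed values). */
	tseg->mss_hdr_size = rte_cpu_to_be_32((1448 << 16) | 54);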
+
/**
* Retrieve a CQE entry from a CQ.
*
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 7a036ed8..9737da2e 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -88,7 +88,7 @@ mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
*/
struct mlx4_rss *
mlx4_rss_get(struct priv *priv, uint64_t fields,
- uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+ const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
uint16_t queues, const uint16_t queue_id[])
{
struct mlx4_rss *rss;
@@ -336,6 +336,14 @@ mlx4_rss_init(struct priv *priv)
unsigned int i;
int ret;
+ if (priv->rss_init)
+ return 0;
+ if (priv->dev->data->nb_rx_queues > priv->hw_rss_max_qps) {
+ ERROR("RSS does not support more than %d queues",
+ priv->hw_rss_max_qps);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
/* Prepare range for RSS contexts before creating the first WQ. */
ret = mlx4_glue->dv_set_context_attr
(priv->ctx,
@@ -418,6 +426,7 @@ wq_num_check:
}
wq_num_prev = wq_num;
}
+ priv->rss_init = 1;
return 0;
error:
ERROR("cannot initialize common RSS resources (queue %u): %s: %s",
@@ -446,6 +455,8 @@ mlx4_rss_deinit(struct priv *priv)
{
unsigned int i;
+ if (!priv->rss_init)
+ return;
for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) {
struct rxq *rxq = priv->dev->data->rx_queues[i];
@@ -454,6 +465,7 @@ mlx4_rss_deinit(struct priv *priv)
mlx4_rxq_detach(rxq);
}
}
+ priv->rss_init = 0;
}
/**
@@ -482,6 +494,7 @@ mlx4_rxq_attach(struct rxq *rxq)
}
struct priv *priv = rxq->priv;
+ struct rte_eth_dev *dev = priv->dev;
const uint32_t elts_n = 1 << rxq->elts_n;
const uint32_t sges_n = 1 << rxq->sges_n;
struct rte_mbuf *(*elts)[elts_n] = rxq->elts;
@@ -491,6 +504,8 @@ mlx4_rxq_attach(struct rxq *rxq)
const char *msg;
struct ibv_cq *cq = NULL;
struct ibv_wq *wq = NULL;
+ uint32_t create_flags = 0;
+ uint32_t comp_mask = 0;
volatile struct mlx4_wqe_data_seg (*wqes)[];
unsigned int i;
int ret;
@@ -503,6 +518,11 @@ mlx4_rxq_attach(struct rxq *rxq)
msg = "CQ creation failure";
goto error;
}
+ /* By default, FCS (CRC) is stripped by hardware. */
+ if (rxq->crc_present) {
+ create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
+ comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+ }
wq = mlx4_glue->create_wq
(priv->ctx,
&(struct ibv_wq_init_attr){
@@ -511,6 +531,8 @@ mlx4_rxq_attach(struct rxq *rxq)
.max_sge = sges_n,
.pd = priv->pd,
.cq = cq,
+ .comp_mask = comp_mask,
+ .create_flags = create_flags,
});
if (!wq) {
ret = errno ? errno : EINVAL;
@@ -537,6 +559,11 @@ mlx4_rxq_attach(struct rxq *rxq)
msg = "failed to obtain device information from WQ/CQ objects";
goto error;
}
+ /* Pre-register Rx mempool. */
+ DEBUG("port %u Rx queue %u registering mp %s having %u chunks",
+ priv->dev->data->port_id, rxq->stats.idx,
+ rxq->mp->name, rxq->mp->nb_mem_chunks);
+ mlx4_mr_update_mp(dev, &rxq->mr_ctrl, rxq->mp);
wqes = (volatile struct mlx4_wqe_data_seg (*)[])
((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset);
for (i = 0; i != RTE_DIM(*elts); ++i) {
@@ -568,7 +595,7 @@ mlx4_rxq_attach(struct rxq *rxq)
.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
uintptr_t)),
.byte_count = rte_cpu_to_be_32(buf->data_len),
- .lkey = rte_cpu_to_be_32(rxq->mr->lkey),
+ .lkey = mlx4_rx_mb2mr(rxq, buf),
};
(*elts)[i] = buf;
}
@@ -597,6 +624,7 @@ error:
claim_zero(mlx4_glue->destroy_wq(wq));
if (cq)
claim_zero(mlx4_glue->destroy_cq(cq));
+ --rxq->usecnt;
rte_errno = ret;
ERROR("error while attaching Rx queue %p: %s: %s",
(void *)rxq, msg, strerror(ret));
@@ -650,7 +678,9 @@ uint64_t
mlx4_get_rx_queue_offloads(struct priv *priv)
{
uint64_t offloads = DEV_RX_OFFLOAD_SCATTER |
- DEV_RX_OFFLOAD_CRC_STRIP;
+ DEV_RX_OFFLOAD_CRC_STRIP |
+ DEV_RX_OFFLOAD_KEEP_CRC |
+ DEV_RX_OFFLOAD_JUMBO_FRAME;
if (priv->hw_csum)
offloads |= DEV_RX_OFFLOAD_CHECKSUM;
@@ -676,26 +706,6 @@ mlx4_get_rx_port_offloads(struct priv *priv)
}
/**
- * Checks if the per-queue offload configuration is valid.
- *
- * @param priv
- * Pointer to private structure.
- * @param requested
- * Per-queue offloads configuration.
- *
- * @return
- * Nonzero when configuration is valid.
- */
-static int
-mlx4_check_rx_queue_offloads(struct priv *priv, uint64_t requested)
-{
- uint64_t mandatory = priv->dev->data->dev_conf.rxmode.offloads;
- uint64_t supported = mlx4_get_rx_port_offloads(priv);
-
- return !((mandatory ^ requested) & supported);
-}
-
-/**
* DPDK callback to configure a Rx queue.
*
* @param dev
@@ -736,20 +746,14 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
},
};
int ret;
+ uint32_t crc_present;
+ uint64_t offloads;
+
+ offloads = conf->offloads | dev->data->dev_conf.rxmode.offloads;
- (void)conf; /* Thresholds configuration (ignored). */
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
- if (!mlx4_check_rx_queue_offloads(priv, conf->offloads)) {
- rte_errno = ENOTSUP;
- ERROR("%p: Rx queue offloads 0x%" PRIx64 " don't match port "
- "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64,
- (void *)dev, conf->offloads,
- dev->data->dev_conf.rxmode.offloads,
- (mlx4_get_rx_port_offloads(priv) |
- mlx4_get_rx_queue_offloads(priv)));
- return -rte_errno;
- }
+
if (idx >= dev->data->nb_rx_queues) {
rte_errno = EOVERFLOW;
ERROR("%p: queue index out of range (%u >= %u)",
@@ -774,6 +778,23 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
" to the next power of two (%u)",
(void *)dev, idx, desc);
}
+ /* By default, FCS (CRC) is stripped by hardware. */
+ crc_present = 0;
+ if (rte_eth_dev_must_keep_crc(offloads)) {
+ if (priv->hw_fcs_strip) {
+ crc_present = 1;
+ } else {
+ WARN("%p: CRC stripping has been disabled but will still"
+ " be performed by hardware, make sure MLNX_OFED and"
+ " firmware are up to date",
+ (void *)dev);
+ }
+ }
+ DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
+ " incoming frames to hide it",
+ (void *)dev,
+ crc_present ? "disabled" : "enabled",
+ crc_present << 2);
/* Allocate and initialize Rx queue. */
mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket);
if (!rxq) {
@@ -790,9 +811,10 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
.elts = elts,
/* Toggle Rx checksum offload if hardware supports it. */
.csum = priv->hw_csum &&
- (conf->offloads & DEV_RX_OFFLOAD_CHECKSUM),
+ (offloads & DEV_RX_OFFLOAD_CHECKSUM),
.csum_l2tun = priv->hw_csum_l2tun &&
- (conf->offloads & DEV_RX_OFFLOAD_CHECKSUM),
+ (offloads & DEV_RX_OFFLOAD_CHECKSUM),
+ .crc_present = crc_present,
.l2tun_offload = priv->hw_csum_l2tun,
.stats = {
.idx = idx,
@@ -804,7 +826,7 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
(mb_len - RTE_PKTMBUF_HEADROOM)) {
;
- } else if (conf->offloads & DEV_RX_OFFLOAD_SCATTER) {
+ } else if (offloads & DEV_RX_OFFLOAD_SCATTER) {
uint32_t size =
RTE_PKTMBUF_HEADROOM +
dev->data->dev_conf.rxmode.max_rx_pkt_len;
@@ -847,11 +869,9 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1 << rxq->sges_n);
goto error;
}
- /* Use the entire Rx mempool as the memory region. */
- rxq->mr = mlx4_mr_get(priv, mp);
- if (!rxq->mr) {
- ERROR("%p: MR creation failure: %s",
- (void *)dev, strerror(rte_errno));
+ if (mlx4_mr_btree_init(&rxq->mr_ctrl.cache_bh,
+ MLX4_MR_BTREE_CACHE_N, socket)) {
+ /* rte_errno is already set. */
goto error;
}
if (dev->data->dev_conf.intr_conf.rxq) {
@@ -911,7 +931,6 @@ mlx4_rx_queue_release(void *dpdk_rxq)
assert(!rxq->rq_db);
if (rxq->channel)
claim_zero(mlx4_glue->destroy_comp_channel(rxq->channel));
- if (rxq->mr)
- mlx4_mr_put(rxq->mr);
+ mlx4_mr_btree_free(&rxq->mr_ctrl.cache_bh);
rte_free(rxq);
}
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 8ca8b77c..8c88effc 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -38,10 +38,29 @@
* DWORD (32 byte) of a TXBB.
*/
struct pv {
- volatile struct mlx4_wqe_data_seg *dseg;
+ union {
+ volatile struct mlx4_wqe_data_seg *dseg;
+ volatile uint32_t *dst;
+ };
uint32_t val;
};
+/** A helper structure for TSO packet handling. */
+struct tso_info {
+ /** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
+ struct pv *pv;
+ /** Current entry in the pv array. */
+ int pv_counter;
+ /** Total size of the WQE including padding. */
+ uint32_t wqe_size;
+ /** Size of TSO header to prepend to each packet to send. */
+ uint16_t tso_header_size;
+ /** Total size of the TSO segment in the WQE. */
+ uint16_t wqe_tso_seg_size;
+ /** Raw WQE size in units of 16 Bytes and without padding. */
+ uint8_t fence_size;
+};
+
/** A table to translate Rx completion flags to packet type. */
uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
/*
@@ -52,49 +71,58 @@ uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
* bit[4] - MLX4_CQE_STATUS_TCP
* bit[3] - MLX4_CQE_STATUS_IPV4OPT
* bit[2] - MLX4_CQE_STATUS_IPV6
- * bit[1] - MLX4_CQE_STATUS_IPV4F
+ * bit[1] - MLX4_CQE_STATUS_IPF
* bit[0] - MLX4_CQE_STATUS_IPV4
* giving a total of up to 256 entries.
*/
+ /* L2 */
[0x00] = RTE_PTYPE_L2_ETHER,
+ /* L3 */
[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_L4_NONFRAG,
[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_L4_FRAG,
[0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_L4_FRAG,
- [0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
- [0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT,
+ [0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_NONFRAG,
+ [0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG,
+ [0x08] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_NONFRAG,
+ [0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_NONFRAG,
[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
RTE_PTYPE_L4_FRAG,
+ [0x0b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_FRAG,
+ /* TCP */
[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_L4_TCP,
- [0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_L4_TCP,
[0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_L4_TCP,
+ [0x16] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG,
[0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
RTE_PTYPE_L4_TCP,
[0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
RTE_PTYPE_L4_TCP,
- [0x1a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
- RTE_PTYPE_L4_TCP,
+ /* UDP */
[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_L4_UDP,
- [0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_L4_UDP,
[0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_L4_UDP,
+ [0x26] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG,
[0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
RTE_PTYPE_L4_UDP,
[0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
RTE_PTYPE_L4_UDP,
- [0x2a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
- RTE_PTYPE_L4_UDP,
/* Tunneled - L3 IPV6 */
[0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_FRAG,
@@ -102,65 +130,58 @@ uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_FRAG,
[0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
[0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT,
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_NONFRAG,
[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT,
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_NONFRAG,
[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG,
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x8b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_FRAG,
/* Tunneled - L3 IPV6, TCP */
[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_TCP,
- [0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_TCP,
- [0x93] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_TCP,
[0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_TCP,
+ [0x96] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
[0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT |
- RTE_PTYPE_INNER_L4_TCP,
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP,
[0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT |
- RTE_PTYPE_INNER_L4_TCP,
- [0x9a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_TCP,
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP,
/* Tunneled - L3 IPV6, UDP */
- [0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
- [0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ [0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG |
RTE_PTYPE_INNER_L4_UDP,
- [0xa3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_UDP,
- [0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ [0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_UDP,
- [0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ [0xa6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT |
RTE_PTYPE_INNER_L4_UDP,
- [0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ [0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT |
RTE_PTYPE_INNER_L4_UDP,
- [0xaa] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_UDP,
/* Tunneled - L3 IPV4 */
[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_FRAG,
@@ -168,65 +189,54 @@ uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_FRAG,
[0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_NONFRAG,
+ [0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
[0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT,
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_NONFRAG,
[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT,
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_NONFRAG,
[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT |
RTE_PTYPE_INNER_L4_FRAG,
+ [0xcb] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_FRAG,
/* Tunneled - L3 IPV4, TCP */
- [0xd0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_TCP,
[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_TCP,
- [0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_TCP,
- [0xd3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_TCP,
[0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_TCP,
+ [0xd6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
[0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT |
RTE_PTYPE_INNER_L4_TCP,
[0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT |
RTE_PTYPE_INNER_L4_TCP,
- [0xda] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_TCP,
/* Tunneled - L3 IPV4, UDP */
- [0xe0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_UDP,
[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_UDP,
- [0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_UDP,
- [0xe3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L4_FRAG |
- RTE_PTYPE_INNER_L4_UDP,
[0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
RTE_PTYPE_INNER_L4_UDP,
+ [0xe6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
[0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP,
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_UDP,
[0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP,
- [0xea] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
- RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
RTE_PTYPE_INNER_L4_UDP,
};
@@ -263,7 +273,7 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, volatile uint32_t *start,
} while (start != (volatile uint32_t *)sq->eob);
start = (volatile uint32_t *)sq->buf;
/* Flip invalid stamping ownership. */
- stamp ^= RTE_BE32(0x1 << MLX4_SQ_OWNER_BIT);
+ stamp ^= RTE_BE32(1u << MLX4_SQ_OWNER_BIT);
sq->stamp = stamp;
if (start == end)
return size;
@@ -344,24 +354,6 @@ mlx4_txq_complete(struct txq *txq, const unsigned int elts_m,
}
/**
- * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- * Pointer to mbuf.
- *
- * @return
- * Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-mlx4_txq_mb2mp(struct rte_mbuf *buf)
-{
- if (unlikely(RTE_MBUF_INDIRECT(buf)))
- return rte_mbuf_from_indirect(buf)->pool;
- return buf->pool;
-}
-
-/**
* Write Tx data segment to the SQ.
*
* @param dseg
@@ -378,7 +370,7 @@ mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
uint32_t lkey, uintptr_t addr, rte_be32_t byte_count)
{
dseg->addr = rte_cpu_to_be_64(addr);
- dseg->lkey = rte_cpu_to_be_32(lkey);
+ dseg->lkey = lkey;
#if RTE_CACHE_LINE_SIZE < 64
/*
* Need a barrier here before writing the byte_count
@@ -395,6 +387,342 @@ mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
}
/**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ * Pointer to the first packet mbuf.
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param tinfo
+ * Pointer to a structure to fill the info with.
+ *
+ * @return
+ * 0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+ struct txq *txq,
+ struct tso_info *tinfo)
+{
+ struct mlx4_sq *sq = &txq->msq;
+ const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+ (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+ tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
+ if (tunneled)
+ tinfo->tso_header_size +=
+ buf->outer_l2_len + buf->outer_l3_len;
+ if (unlikely(buf->tso_segsz == 0 ||
+ tinfo->tso_header_size == 0 ||
+ tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
+ tinfo->tso_header_size > buf->data_len))
+ return -EINVAL;
+ /*
+ * Calculate the WQE TSO segment size
+ * Note:
+ * 1. An LSO segment must be padded such that the subsequent data
+ * segment is 16-byte aligned.
+ * 2. The start address of the TSO segment is always 16 Bytes aligned.
+ */
+ tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+ tinfo->tso_header_size,
+ sizeof(struct mlx4_wqe_data_seg));
+ tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+ tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+ buf->nb_segs;
+ tinfo->wqe_size =
+ RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+ MLX4_TXBB_SIZE);
+ /* Validate WQE size and WQE space in the send queue. */
+ if (sq->remain_size < tinfo->wqe_size ||
+ tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+ return -ENOMEM;
+ /* Init pv. */
+ tinfo->pv = (struct pv *)txq->bounce_buf;
+ tinfo->pv_counter = 0;
+ return 0;
+}
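A worked sizing example, assuming MLX4_SEG_SHIFT is 4 (16-byte WQE segment units) and with made-up values: a 58-byte TSO header on a 2-segment mbuf chain, a 16-byte control segment, a 4-byte LSO descriptor and 64-byte TXBBs give:

	wqe_tso_seg_size = RTE_ALIGN(4 + 58, 16)               =  64
	fence_size       = ((16 + 64) >> MLX4_SEG_SHIFT) + 2   =   7  /* 16-byte units */
	wqe_size         = RTE_ALIGN(7 << MLX4_SEG_SHIFT, 64)  = 128  /* two TXBBs */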
+
+/**
+ * Fill the TSO WQE data segments with info on the buffers to transmit.
+ *
+ * @param buf
+ * Pointer to the first packet mbuf.
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param tinfo
+ * Pointer to TSO info to use.
+ * @param dseg
+ * Pointer to the first data segment in the TSO WQE.
+ * @param ctrl
+ * Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ * Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
+ struct txq *txq,
+ struct tso_info *tinfo,
+ volatile struct mlx4_wqe_data_seg *dseg,
+ volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+ uint32_t lkey;
+ int nb_segs = buf->nb_segs;
+ int nb_segs_txbb;
+ struct mlx4_sq *sq = &txq->msq;
+ struct rte_mbuf *sbuf = buf;
+ struct pv *pv = tinfo->pv;
+ int *pv_counter = &tinfo->pv_counter;
+ volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
+ (volatile struct mlx4_wqe_ctrl_seg *)
+ ((volatile uint8_t *)ctrl + tinfo->wqe_size);
+ uint16_t data_len = sbuf->data_len - tinfo->tso_header_size;
+ uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t,
+ tinfo->tso_header_size);
+
+ do {
+ /* How many dseg entries do we have in the current TXBB? */
+ nb_segs_txbb = (MLX4_TXBB_SIZE -
+ ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
+ MLX4_SEG_SHIFT;
+ switch (nb_segs_txbb) {
+#ifndef NDEBUG
+ default:
+ /* Should never happen. */
+ rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
+ (void *)txq, nb_segs_txbb);
+ /* rte_panic never returns. */
+ break;
+#endif /* NDEBUG */
+ case 4:
+ /* Memory region key for this memory pool. */
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
+ if (unlikely(lkey == (uint32_t)-1))
+ goto err;
+ dseg->addr = rte_cpu_to_be_64(data_addr);
+ dseg->lkey = lkey;
+ /*
+ * This data segment starts at the beginning of a new
+ * TXBB, so we need to postpone its byte_count writing
+ * for later.
+ */
+ pv[*pv_counter].dseg = dseg;
+ /*
+ * Zero length segment is treated as inline segment
+ * with zero data.
+ */
+ pv[(*pv_counter)++].val =
+ rte_cpu_to_be_32(data_len ?
+ data_len :
+ 0x80000000);
+ if (--nb_segs == 0)
+ return ctrl_next;
+ /* Prepare next buf info */
+ sbuf = sbuf->next;
+ dseg++;
+ data_len = sbuf->data_len;
+ data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+ /* fallthrough */
+ case 3:
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
+ if (unlikely(lkey == (uint32_t)-1))
+ goto err;
+ mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+ rte_cpu_to_be_32(data_len ?
+ data_len :
+ 0x80000000));
+ if (--nb_segs == 0)
+ return ctrl_next;
+ /* Prepare next buf info */
+ sbuf = sbuf->next;
+ dseg++;
+ data_len = sbuf->data_len;
+ data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+ /* fallthrough */
+ case 2:
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
+ if (unlikely(lkey == (uint32_t)-1))
+ goto err;
+ mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+ rte_cpu_to_be_32(data_len ?
+ data_len :
+ 0x80000000));
+ if (--nb_segs == 0)
+ return ctrl_next;
+ /* Prepare next buf info */
+ sbuf = sbuf->next;
+ dseg++;
+ data_len = sbuf->data_len;
+ data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+ /* fallthrough */
+ case 1:
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
+ if (unlikely(lkey == (uint32_t)-1))
+ goto err;
+ mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
+ rte_cpu_to_be_32(data_len ?
+ data_len :
+ 0x80000000));
+ if (--nb_segs == 0)
+ return ctrl_next;
+ /* Prepare next buf info */
+ sbuf = sbuf->next;
+ dseg++;
+ data_len = sbuf->data_len;
+ data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+ /* fallthrough */
+ }
+ /* Wrap dseg if it points at the end of the queue. */
+ if ((volatile uint8_t *)dseg >= sq->eob)
+ dseg = (volatile struct mlx4_wqe_data_seg *)
+ ((volatile uint8_t *)dseg - sq->size);
+ } while (true);
+err:
+ return NULL;
+}
+
+/**
+ * Copy the packet's L2, L3 and L4 headers into the WQE.
+ *
+ * This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ * Pointer to the first packet mbuf.
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param tinfo
+ * Pointer to TSO info to use.
+ * @param ctrl
+ * Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ * Pointer to the first data segment of the TSO WQE on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+ struct txq *txq,
+ struct tso_info *tinfo,
+ volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+ volatile struct mlx4_wqe_lso_seg *tseg =
+ (volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+ struct mlx4_sq *sq = &txq->msq;
+ struct pv *pv = tinfo->pv;
+ int *pv_counter = &tinfo->pv_counter;
+ int remain_size = tinfo->tso_header_size;
+ char *from = rte_pktmbuf_mtod(buf, char *);
+ uint16_t txbb_avail_space;
+ /* Union to overcome volatile constraints when copying TSO header. */
+ union {
+ volatile uint8_t *vto;
+ uint8_t *to;
+ } thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+ /*
+ * TSO data always starts at offset 20 from the beginning of the TXBB
+ * (16-byte ctrl + 4-byte TSO descriptor). Since each TXBB is 64-byte
+ * aligned, we can write the first 44 TSO header bytes without worrying
+ * about TxQ wrapping or overwriting the first TXBB 32-bit word.
+ */
+ txbb_avail_space = MLX4_TXBB_SIZE -
+ (sizeof(struct mlx4_wqe_ctrl_seg) +
+ sizeof(struct mlx4_wqe_lso_seg));
+ while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
+ /* Copy to end of txbb. */
+ rte_memcpy(thdr.to, from, txbb_avail_space);
+ from += txbb_avail_space;
+ thdr.to += txbb_avail_space;
+ /* New TXBB, check for TxQ wrap. */
+ if (thdr.to >= sq->eob)
+ thdr.vto = sq->buf;
+ /* New TXBB, stash the first 32bits for later use. */
+ pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+ pv[(*pv_counter)++].val = *(uint32_t *)from;
+ from += sizeof(uint32_t);
+ thdr.to += sizeof(uint32_t);
+ remain_size -= txbb_avail_space + sizeof(uint32_t);
+ /* Avail space in new TXBB is TXBB size - 4 */
+ txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+ }
+ if (remain_size > txbb_avail_space) {
+ rte_memcpy(thdr.to, from, txbb_avail_space);
+ from += txbb_avail_space;
+ thdr.to += txbb_avail_space;
+ remain_size -= txbb_avail_space;
+ /* New TXBB, check for TxQ wrap. */
+ if (thdr.to >= sq->eob)
+ thdr.vto = sq->buf;
+ pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+ rte_memcpy(&pv[*pv_counter].val, from, remain_size);
+ (*pv_counter)++;
+ } else if (remain_size) {
+ rte_memcpy(thdr.to, from, remain_size);
+ }
+ tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+ tinfo->tso_header_size);
+ /* Calculate data segment location */
+ return (volatile struct mlx4_wqe_data_seg *)
+ ((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
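
As a rough illustration of the arithmetic in the comment above, the sketch below (assumed constants only; it is not the PMD's copy loop) counts how a TSO header of a given length is spread across TXBBs: 44 bytes fit after the control and LSO descriptors of the first TXBB, and each following TXBB contributes up to a full 64 bytes, of which the first 4 are written last via the stashed-DWORD mechanism.

/* Header-space accounting sketch; values mirror the driver's layout. */
#include <stdio.h>

#define TXBB_SIZE 64   /* stands in for MLX4_TXBB_SIZE */
#define CTRL_SIZE 16   /* control segment size */
#define LSO_DESC   4   /* LSO descriptor (mss_hdr_size) */

int main(void)
{
    int hdr_len = 86;  /* e.g. Ethernet + IPv4 + TCP with options */
    int room = TXBB_SIZE - (CTRL_SIZE + LSO_DESC);   /* 44 bytes */
    int remain = hdr_len;
    int txbb = 0;

    while (remain > 0) {
        int chunk = remain < room ? remain : room;

        printf("TXBB %d: %d header byte(s)%s\n", txbb, chunk,
               txbb ? " (first 4 deferred)" : "");
        remain -= chunk;
        txbb++;
        room = TXBB_SIZE;   /* later TXBBs are fully available */
    }
    return 0;
}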
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ * Pointer to the first packet mbuf.
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param ctrl
+ * Pointer to the WQE control segment.
+ *
+ * @return
+ * Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+ volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+ volatile struct mlx4_wqe_data_seg *dseg;
+ volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+ struct mlx4_sq *sq = &txq->msq;
+ struct tso_info tinfo;
+ struct pv *pv;
+ int pv_counter;
+ int ret;
+
+ ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+ if (unlikely(ret))
+ goto error;
+ dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+ if (unlikely(dseg == NULL))
+ goto error;
+ if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+ dseg = (volatile struct mlx4_wqe_data_seg *)
+ ((uintptr_t)dseg - sq->size);
+ ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+ if (unlikely(ctrl_next == NULL))
+ goto error;
+ /* Write the first DWORD of each TXBB saved earlier. */
+ if (likely(tinfo.pv_counter)) {
+ pv = tinfo.pv;
+ pv_counter = tinfo.pv_counter;
+ /* Need a barrier here before writing the first TXBB word. */
+ rte_io_wmb();
+ do {
+ --pv_counter;
+ *pv[pv_counter].dst = pv[pv_counter].val;
+ } while (pv_counter > 0);
+ }
+ ctrl->fence_size = tinfo.fence_size;
+ sq->remain_size -= tinfo.wqe_size;
+ return ctrl_next;
+error:
+ txq->stats.odropped++;
+ return NULL;
+}
+
+/**
* Write data segments of multi-segment packet.
*
* @param buf
@@ -437,7 +765,7 @@ mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
goto txbb_tail_segs;
txbb_head_seg:
/* Memory region key (big endian) for this memory pool. */
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
if (unlikely(lkey == (uint32_t)-1)) {
DEBUG("%p: unable to get MP <-> MR association",
(void *)txq);
@@ -449,7 +777,7 @@ txbb_head_seg:
dseg = (volatile struct mlx4_wqe_data_seg *)
sq->buf;
dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t));
- dseg->lkey = rte_cpu_to_be_32(lkey);
+ dseg->lkey = lkey;
/*
* This data segment starts at the beginning of a new
* TXBB, so we need to postpone its byte_count writing
@@ -469,7 +797,7 @@ txbb_tail_segs:
/* Jump to default if there are more than two segments remaining. */
switch (nb_segs) {
default:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
if (unlikely(lkey == (uint32_t)-1)) {
DEBUG("%p: unable to get MP <-> MR association",
(void *)txq);
@@ -485,7 +813,7 @@ txbb_tail_segs:
nb_segs--;
/* fallthrough */
case 2:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
if (unlikely(lkey == (uint32_t)-1)) {
DEBUG("%p: unable to get MP <-> MR association",
(void *)txq);
@@ -501,7 +829,7 @@ txbb_tail_segs:
nb_segs--;
/* fallthrough */
case 1:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
if (unlikely(lkey == (uint32_t)-1)) {
DEBUG("%p: unable to get MP <-> MR association",
(void *)txq);
@@ -587,6 +915,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t flags16[2];
} srcrb;
uint32_t lkey;
+ bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
/* Clean up old buffer. */
if (likely(elt->buf != NULL)) {
@@ -605,13 +934,22 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
} while (tmp != NULL);
}
RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
- if (buf->nb_segs == 1) {
+ if (tso) {
+ /* Change opcode to TSO */
+ owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+ owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+ ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+ if (!ctrl_next) {
+ elt->buf = NULL;
+ break;
+ }
+ } else if (buf->nb_segs == 1) {
/* Validate WQE space in the send queue. */
if (sq->remain_size < MLX4_TXBB_SIZE) {
elt->buf = NULL;
break;
}
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+ lkey = mlx4_tx_mb2mr(txq, buf);
if (unlikely(lkey == (uint32_t)-1)) {
/* MR does not exist. */
DEBUG("%p: unable to get MP <-> MR association",
@@ -639,7 +977,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
ctrl_next = (volatile struct mlx4_wqe_ctrl_seg *)
((volatile uint8_t *)ctrl_next - sq->size);
/* Flip HW valid ownership. */
- sq->owner_opcode ^= 0x1 << MLX4_SQ_OWNER_BIT;
+ sq->owner_opcode ^= 1u << MLX4_SQ_OWNER_BIT;
}
/*
* For raw Ethernet, the SOLICIT flag is used to indicate
@@ -746,11 +1084,13 @@ rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe,
* bit[4] - MLX4_CQE_STATUS_TCP
* bit[3] - MLX4_CQE_STATUS_IPV4OPT
* bit[2] - MLX4_CQE_STATUS_IPV6
- * bit[1] - MLX4_CQE_STATUS_IPV4F
+ * bit[1] - MLX4_CQE_STATUS_IPF
* bit[0] - MLX4_CQE_STATUS_IPV4
* giving a total of up to 256 entries.
*/
idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22);
+ if (status & MLX4_CQE_STATUS_IPV6)
+ idx |= ((status & MLX4_CQE_STATUS_IPV6F) >> 11);
return mlx4_ptype_table[idx];
}
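
For reference, a standalone sketch of the index construction used above. The bit positions below are plausible stand-ins chosen to match the comment's index layout, not the real CQE definitions: status bits are shifted down into a small table index, and the IPv6 fragment flag is folded into the fragment bit only when the IPv6 bit is set.

#include <stdint.h>
#include <stdio.h>

#define ST_IPV4       (1u << 22)   /* assumed positions for the sketch */
#define ST_IPF        (1u << 23)
#define ST_IPV6       (1u << 24)
#define ST_IPV4OPT    (1u << 25)
#define ST_TCP        (1u << 26)
#define ST_UDP        (1u << 27)
#define ST_IPV6F      (1u << 12)   /* IPv6 fragment flag, lower word */
#define ST_PTYPE_MASK (0x3fu << 22)

static uint8_t ptype_table[256];   /* index -> RTE_PTYPE_* in the PMD */

int main(void)
{
    uint32_t status = ST_IPV6 | ST_TCP | ST_IPV6F;
    unsigned int idx = 0;

    /* Bits [27:22] of the status become index bits [5:0]. */
    idx |= (status & ST_PTYPE_MASK) >> 22;
    /* Fold the IPv6 fragment flag into the fragment bit of the index. */
    if (status & ST_IPV6)
        idx |= (status & ST_IPV6F) >> 11;
    printf("table index = %u -> ptype %u\n", idx, (unsigned)ptype_table[idx]);
    return 0;
}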
@@ -934,11 +1274,14 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
goto skip;
}
pkt = seg;
+ assert(len >= (rxq->crc_present << 2));
/* Update packet information. */
pkt->packet_type =
rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload);
pkt->ol_flags = PKT_RX_RSS_HASH;
pkt->hash.rss = cqe->immed_rss_invalid;
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
pkt->pkt_len = len;
if (rxq->csum | rxq->csum_l2tun) {
uint32_t flags =
@@ -963,6 +1306,9 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
* changes.
*/
scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx4_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ scat->lkey = mlx4_rx_mb2mr(rxq, rep);
if (len > seg->data_len) {
len -= seg->data_len;
++pkt->nb_segs;
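
The new crc_present handling above subtracts the 4-byte FCS from the completion length when the device keeps it. Below is a tiny standalone sketch of that accounting (ETHER_CRC_LEN is replaced by a local constant; this is not driver code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CRC_LEN 4   /* stands in for ETHER_CRC_LEN */

static uint32_t rx_pkt_len(uint32_t cqe_byte_cnt, int crc_present)
{
    /* The completion length always covers at least the CRC itself. */
    assert(cqe_byte_cnt >= (uint32_t)(crc_present ? CRC_LEN : 0));
    return crc_present ? cqe_byte_cnt - CRC_LEN : cqe_byte_cnt;
}

int main(void)
{
    printf("64 bytes on the wire, CRC kept by HW  -> pkt_len %u\n",
           rx_pkt_len(64, 1));
    printf("60 bytes reported, CRC stripped by HW -> pkt_len %u\n",
           rx_pkt_len(60, 0));
    return 0;
}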
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index c12bd39a..ffa8abfc 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
#ifndef MLX4_RXTX_H_
@@ -25,6 +25,7 @@
#include "mlx4.h"
#include "mlx4_prm.h"
+#include "mlx4_mr.h"
/** Rx queue counters. */
struct mlx4_rxq_stats {
@@ -39,7 +40,6 @@ struct mlx4_rxq_stats {
struct rxq {
struct priv *priv; /**< Back pointer to private data. */
struct rte_mempool *mp; /**< Memory pool for allocations. */
- struct mlx4_mr *mr; /**< Memory region. */
struct ibv_cq *cq; /**< Completion queue. */
struct ibv_wq *wq; /**< Work queue. */
struct ibv_comp_channel *channel; /**< Rx completion channel. */
@@ -47,11 +47,13 @@ struct rxq {
uint16_t port_id; /**< Port ID for incoming packets. */
uint16_t sges_n; /**< Number of segments per packet (log2 value). */
uint16_t elts_n; /**< Mbuf queue size (log2 value). */
+ struct mlx4_mr_ctrl mr_ctrl; /**< MR control descriptor. */
struct rte_mbuf *(*elts)[]; /**< Rx elements. */
volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */
volatile uint32_t *rq_db; /**< RQ doorbell record. */
uint32_t csum:1; /**< Enable checksum offloading. */
uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+ uint32_t crc_present:1; /**< CRC must be subtracted. */
uint32_t l2tun_offload:1; /**< L2 tunnel offload is enabled. */
struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
struct mlx4_rxq_stats stats; /**< Rx queue counters. */
@@ -83,12 +85,12 @@ struct txq_elt {
};
};
-/** Rx queue counters. */
+/** Tx queue counters. */
struct mlx4_txq_stats {
unsigned int idx; /**< Mapping index. */
uint64_t opackets; /**< Total of successfully sent packets. */
uint64_t obytes; /**< Total of successfully sent bytes. */
- uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+ uint64_t odropped; /**< Total number of packets that failed to transmit. */
};
/** Tx queue descriptor. */
@@ -100,6 +102,7 @@ struct txq {
int elts_comp_cd; /**< Countdown for next completion. */
unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
unsigned int elts_n; /**< (*elts)[] length. */
+ struct mlx4_mr_ctrl mr_ctrl; /**< MR control descriptor. */
struct txq_elt (*elts)[]; /**< Tx elements. */
struct mlx4_txq_stats stats; /**< Tx queue counters. */
uint32_t max_inline; /**< Max inline send size. */
@@ -108,11 +111,6 @@ struct txq {
uint32_t lb:1; /**< Whether packets should be looped back by eSwitch. */
uint8_t *bounce_buf;
/**< Memory used for storing the first DWORD of data TXBBs. */
- struct {
- const struct rte_mempool *mp; /**< Cached memory pool. */
- struct mlx4_mr *mr; /**< Memory region (for mp). */
- uint32_t lkey; /**< mr->lkey copy. */
- } mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
struct priv *priv; /**< Back pointer to private data. */
unsigned int socket; /**< CPU socket ID for allocations. */
struct ibv_cq *cq; /**< Completion queue. */
@@ -126,7 +124,7 @@ uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
int mlx4_rss_init(struct priv *priv);
void mlx4_rss_deinit(struct priv *priv);
struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
- uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+ const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
uint16_t queues, const uint16_t queue_id[]);
void mlx4_rss_put(struct mlx4_rss *rss);
int mlx4_rss_attach(struct mlx4_rss *rss);
@@ -160,34 +158,70 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
const struct rte_eth_txconf *conf);
void mlx4_tx_queue_release(void *dpdk_txq);
+/* mlx4_mr.c */
+
+void mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl);
+uint32_t mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr);
+uint32_t mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr);
+
/**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Call mlx4_txq_add_mr() if MP is not registered yet.
+ * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx
+ * as mempool is pre-configured and static.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ * @param addr
+ * Address to search.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx4_rx_addr2mr(struct rxq *rxq, uintptr_t addr)
+{
+ struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
+ uint32_t lkey;
+
+ /* Linear search on MR cache array. */
+ lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru,
+ MLX4_MR_CACHE_N, addr);
+ if (likely(lkey != UINT32_MAX))
+ return lkey;
+ /* Take slower bottom-half (Binary Search) on miss. */
+ return mlx4_rx_addr2mr_bh(rxq, addr);
+}
+
+#define mlx4_rx_mb2mr(rxq, mb) mlx4_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr))
+
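
The Rx lookup above is a linear scan of a small per-queue cache followed by a slower binary-search fallback. The self-contained sketch below models that two-level scheme; struct names, sizes and helpers are invented for illustration and do not match the PMD's mlx4_mr_* API.

#include <stdint.h>
#include <stdio.h>

#define CACHE_N 8

struct mr_entry { uintptr_t start, end; uint32_t lkey; };

struct mr_cache {
    struct mr_entry cache[CACHE_N];   /* small per-queue cache */
    unsigned int mru;                 /* most-recently-used slot */
    const struct mr_entry *bh;        /* sorted "bottom-half" table */
    unsigned int bh_n;
};

static uint32_t
lookup_linear(struct mr_cache *c, uintptr_t addr)
{
    /* Scan starting from the most recently used entry. */
    for (unsigned int n = 0, i = c->mru; n < CACHE_N; n++, i = (i + 1) % CACHE_N)
        if (addr >= c->cache[i].start && addr < c->cache[i].end) {
            c->mru = i;
            return c->cache[i].lkey;
        }
    return UINT32_MAX;
}

static uint32_t
lookup_bh(const struct mr_cache *c, uintptr_t addr)
{
    unsigned int lo = 0, hi = c->bh_n;

    /* Binary search over sorted, non-overlapping address ranges. */
    while (lo < hi) {
        unsigned int mid = (lo + hi) / 2;

        if (addr < c->bh[mid].start)
            hi = mid;
        else if (addr >= c->bh[mid].end)
            lo = mid + 1;
        else
            return c->bh[mid].lkey;
    }
    return UINT32_MAX;
}

static uint32_t
addr2lkey(struct mr_cache *c, uintptr_t addr)
{
    uint32_t lkey = lookup_linear(c, addr);

    return lkey != UINT32_MAX ? lkey : lookup_bh(c, addr);
}

int main(void)
{
    static const struct mr_entry table[] = {
        { 0x1000, 0x2000, 11 }, { 0x3000, 0x5000, 22 },
    };
    struct mr_cache c = { .bh = table, .bh_n = 2 };

    printf("lkey: %u\n", addr2lkey(&c, 0x3456));
    return 0;
}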
+/**
+ * Query LKey from a packet buffer for Tx. If not found, add the mempool.
*
* @param txq
* Pointer to Tx queue structure.
- * @param[in] mp
- * Memory pool for which a memory region lkey must be returned.
+ * @param addr
+ * Address to search.
*
* @return
- * mr->lkey on success, (uint32_t)-1 on failure.
+ * Searched LKey on success, UINT32_MAX on no match.
*/
-static inline uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+static __rte_always_inline uint32_t
+mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
{
- unsigned int i;
-
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (unlikely(txq->mp2mr[i].mp == NULL)) {
- /* Unknown MP, add a new MR for it. */
- break;
- }
- if (txq->mp2mr[i].mp == mp) {
- /* MP found MP. */
- return txq->mp2mr[i].lkey;
- }
- }
- return mlx4_txq_add_mr(txq, mp, i);
+ struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ uint32_t lkey;
+
+ /* Check generation bit to see if there's any change on existing MRs. */
+ if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
+ mlx4_mr_flush_local_cache(mr_ctrl);
+ /* Linear search on MR cache array. */
+ lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru,
+ MLX4_MR_CACHE_N, addr);
+ if (likely(lkey != UINT32_MAX))
+ return lkey;
+ /* Take slower bottom-half (binary search) on miss. */
+ return mlx4_tx_addr2mr_bh(txq, addr);
}
+#define mlx4_tx_mb2mr(txq, mb) mlx4_tx_addr2mr(txq, (uintptr_t)((mb)->buf_addr))
+
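
The Tx variant additionally compares a device-wide generation counter against its cached copy before trusting the local cache, so entries invalidated by a memory event are dropped lazily on the next lookup. A minimal sketch of that pattern follows; the field and function names are assumptions for the example, not the PMD's.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct local_cache {
    uint32_t cur_gen;                 /* generation the cache was built at */
    const uint32_t *dev_gen_ptr;      /* points at the device counter */
    uint32_t entries[8];              /* stand-in for cached lkeys */
};

static void
flush_local_cache(struct local_cache *c)
{
    memset(c->entries, 0xff, sizeof(c->entries));   /* drop stale entries */
    c->cur_gen = *c->dev_gen_ptr;                   /* resynchronize */
}

static void
before_lookup(struct local_cache *c)
{
    /* Cheap check on the hot path; flush only when the device changed. */
    if (*c->dev_gen_ptr != c->cur_gen)
        flush_local_cache(c);
}

int main(void)
{
    uint32_t dev_gen = 1;
    struct local_cache c = { .cur_gen = 1, .dev_gen_ptr = &dev_gen };

    before_lookup(&c);                /* no flush: generations match  */
    dev_gen++;                        /* device-level MR change       */
    before_lookup(&c);                /* flush: local cache was stale */
    printf("cache generation: %u\n", c.cur_gen);
    return 0;
}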
#endif /* MLX4_RXTX_H_ */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 071b2d5d..9aa7440d 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -63,64 +63,6 @@ mlx4_txq_free_elts(struct txq *txq)
txq->elts_tail = txq->elts_head;
}
-struct txq_mp2mr_mbuf_check_data {
- int ret;
-};
-
-/**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
- *
- * @param[in] mp
- * The mempool pointer
- * @param[in] arg
- * Context data (struct mlx4_txq_mp2mr_mbuf_check_data). Contains the
- * return value.
- * @param[in] obj
- * Object address.
- * @param index
- * Object index, unused.
- */
-static void
-mlx4_txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
- uint32_t index)
-{
- struct txq_mp2mr_mbuf_check_data *data = arg;
- struct rte_mbuf *buf = obj;
-
- (void)index;
- /*
- * Check whether mbuf structure fits element size and whether mempool
- * pointer is valid.
- */
- if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
- data->ret = -1;
-}
-
-/**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a Tx queue.
- *
- * @param[in] mp
- * Memory Pool to register.
- * @param *arg
- * Pointer to Tx queue structure.
- */
-static void
-mlx4_txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
- struct txq *txq = arg;
- struct txq_mp2mr_mbuf_check_data data = {
- .ret = 0,
- };
-
- /* Register mempool only if the first element looks like a mbuf. */
- if (rte_mempool_obj_iter(mp, mlx4_txq_mp2mr_mbuf_check, &data) == 0 ||
- data.ret == -1)
- return;
- mlx4_txq_mp2mr(txq, mp);
-}
-
/**
* Retrieves information needed in order to directly access the Tx queue.
*
@@ -144,9 +86,9 @@ mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
uint32_t headroom_size = 2048 + (1 << dqp->sq.wqe_shift);
/* Continuous headroom size bytes must always stay freed. */
sq->remain_size = sq->size - headroom_size;
- sq->owner_opcode = MLX4_OPCODE_SEND | (0 << MLX4_SQ_OWNER_BIT);
+ sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT);
sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
- (0 << MLX4_SQ_OWNER_BIT));
+ (0u << MLX4_SQ_OWNER_BIT));
sq->db = dqp->sdb;
sq->doorbell_qpn = dqp->doorbell_qpn;
cq->buf = dcq->buf.buf;
@@ -174,32 +116,18 @@ mlx4_get_tx_port_offloads(struct priv *priv)
DEV_TX_OFFLOAD_UDP_CKSUM |
DEV_TX_OFFLOAD_TCP_CKSUM);
}
- if (priv->hw_csum_l2tun)
+ if (priv->tso)
+ offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+ if (priv->hw_csum_l2tun) {
offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+ if (priv->tso)
+ offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+ DEV_TX_OFFLOAD_GRE_TNL_TSO);
+ }
return offloads;
}
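
The capability-to-offload mapping above can be read as: TSO support alone enables plain TCP TSO, while tunnel TSO is advertised only when both TSO and L2-tunnel checksum support are present. A standalone sketch with made-up flag values follows (the real DEV_TX_OFFLOAD_* constants come from rte_ethdev.h):

#include <stdint.h>
#include <stdio.h>

#define OFF_TCP_TSO        (1ull << 0)
#define OFF_OUTER_IP_CKSUM (1ull << 1)
#define OFF_VXLAN_TNL_TSO  (1ull << 2)
#define OFF_GRE_TNL_TSO    (1ull << 3)

struct caps { int tso; int hw_csum_l2tun; };

static uint64_t tx_port_offloads(const struct caps *c)
{
    uint64_t offloads = 0;

    if (c->tso)
        offloads |= OFF_TCP_TSO;
    if (c->hw_csum_l2tun) {
        offloads |= OFF_OUTER_IP_CKSUM;
        /* Tunnel TSO needs both TSO and tunnel checksum support. */
        if (c->tso)
            offloads |= OFF_VXLAN_TNL_TSO | OFF_GRE_TNL_TSO;
    }
    return offloads;
}

int main(void)
{
    struct caps c = { .tso = 1, .hw_csum_l2tun = 1 };

    printf("offload mask: 0x%llx\n",
           (unsigned long long)tx_port_offloads(&c));
    return 0;
}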
/**
- * Checks if the per-queue offload configuration is valid.
- *
- * @param priv
- * Pointer to private structure.
- * @param requested
- * Per-queue offloads configuration.
- *
- * @return
- * Nonzero when configuration is valid.
- */
-static int
-mlx4_check_tx_queue_offloads(struct priv *priv, uint64_t requested)
-{
- uint64_t mandatory = priv->dev->data->dev_conf.txmode.offloads;
- uint64_t supported = mlx4_get_tx_port_offloads(priv);
-
- return !((mandatory ^ requested) & supported);
-}
-
-/**
* DPDK callback to configure a Tx queue.
*
* @param dev
@@ -246,23 +174,13 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
},
};
int ret;
+ uint64_t offloads;
+
+ offloads = conf->offloads | dev->data->dev_conf.txmode.offloads;
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
- /*
- * Don't verify port offloads for application which
- * use the old API.
- */
- if ((conf->txq_flags & ETH_TXQ_FLAGS_IGNORE) &&
- !mlx4_check_tx_queue_offloads(priv, conf->offloads)) {
- rte_errno = ENOTSUP;
- ERROR("%p: Tx queue offloads 0x%" PRIx64 " don't match port "
- "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64,
- (void *)dev, conf->offloads,
- dev->data->dev_conf.txmode.offloads,
- mlx4_get_tx_port_offloads(priv));
- return -rte_errno;
- }
+
if (idx >= dev->data->nb_tx_queues) {
rte_errno = EOVERFLOW;
ERROR("%p: queue index out of range (%u >= %u)",
@@ -313,11 +231,11 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
.elts_comp_cd_init =
RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
.csum = priv->hw_csum &&
- (conf->offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
+ (offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
DEV_TX_OFFLOAD_UDP_CKSUM |
DEV_TX_OFFLOAD_TCP_CKSUM)),
.csum_l2tun = priv->hw_csum_l2tun &&
- (conf->offloads &
+ (offloads &
DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM),
/* Enable Tx loopback for VF devices. */
.lb = !!priv->vf,
@@ -404,8 +322,13 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
/* Save first wqe pointer in the first element. */
(&(*txq->elts)[0])->wqe =
(volatile struct mlx4_wqe_ctrl_seg *)txq->msq.buf;
- /* Pre-register known mempools. */
- rte_mempool_walk(mlx4_txq_mp2mr_iter, txq);
+ if (mlx4_mr_btree_init(&txq->mr_ctrl.cache_bh,
+ MLX4_MR_BTREE_CACHE_N, socket)) {
+ /* rte_errno is already set. */
+ goto error;
+ }
+ /* Save the pointer to the global generation number for memory event checks. */
+ txq->mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
DEBUG("%p: adding Tx queue %p to list", (void *)dev, (void *)txq);
dev->data->tx_queues[idx] = txq;
return 0;
@@ -446,11 +369,6 @@ mlx4_tx_queue_release(void *dpdk_txq)
claim_zero(mlx4_glue->destroy_qp(txq->qp));
if (txq->cq)
claim_zero(mlx4_glue->destroy_cq(txq->cq));
- for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) {
- if (!txq->mp2mr[i].mp)
- break;
- assert(txq->mp2mr[i].mr);
- mlx4_mr_put(txq->mp2mr[i].mr);
- }
+ mlx4_mr_btree_free(&txq->mr_ctrl.cache_bh);
rte_free(txq);
}
diff --git a/drivers/net/mlx4/mlx4_utils.c b/drivers/net/mlx4/mlx4_utils.c
index d10812ec..a727d703 100644
--- a/drivers/net/mlx4/mlx4_utils.c
+++ b/drivers/net/mlx4/mlx4_utils.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index 9fdbacad..86abb3b7 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
#ifndef MLX4_UTILS_H_