aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/mlx4
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/mlx4')
-rw-r--r--drivers/net/mlx4/Makefile43
-rw-r--r--drivers/net/mlx4/mlx4.c202
-rw-r--r--drivers/net/mlx4/mlx4.h51
-rw-r--r--drivers/net/mlx4/mlx4_ethdev.c208
-rw-r--r--drivers/net/mlx4/mlx4_flow.c240
-rw-r--r--drivers/net/mlx4/mlx4_flow.h6
-rw-r--r--drivers/net/mlx4/mlx4_glue.c2
-rw-r--r--drivers/net/mlx4/mlx4_glue.h2
-rw-r--r--drivers/net/mlx4/mlx4_intr.c2
-rw-r--r--drivers/net/mlx4/mlx4_mr.c1247
-rw-r--r--drivers/net/mlx4/mlx4_mr.h122
-rw-r--r--drivers/net/mlx4/mlx4_prm.h2
-rw-r--r--drivers/net/mlx4/mlx4_rxq.c99
-rw-r--r--drivers/net/mlx4/mlx4_rxtx.c44
-rw-r--r--drivers/net/mlx4/mlx4_rxtx.h92
-rw-r--r--drivers/net/mlx4/mlx4_txq.c122
-rw-r--r--drivers/net/mlx4/mlx4_utils.c2
-rw-r--r--drivers/net/mlx4/mlx4_utils.h2
18 files changed, 1750 insertions, 738 deletions
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index cc800493..73f9d405 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -1,33 +1,6 @@
-# BSD LICENSE
-#
+# SPDX-License-Identifier: BSD-3-Clause
# Copyright 2012 6WIND S.A.
-# Copyright 2012 Mellanox
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of 6WIND S.A. nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Copyright 2012 Mellanox Technologies, Ltd
include $(RTE_SDK)/mk/rte.vars.mk
@@ -64,6 +37,7 @@ CFLAGS += -D_BSD_SOURCE
CFLAGS += -D_DEFAULT_SOURCE
CFLAGS += -D_XOPEN_SOURCE=600
CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y)
CFLAGS += -DMLX4_GLUE='"$(LIB_GLUE)"'
CFLAGS += -DMLX4_GLUE_VERSION='"$(LIB_GLUE_VERSION)"'
@@ -95,10 +69,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif
-ifdef CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE
-CFLAGS += -DMLX4_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE)
-endif
-
include $(RTE_SDK)/mk/rte.lib.mk
# Generate and clean-up mlx4_autoconf.h.
@@ -132,8 +102,13 @@ ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y)
$(LIB): $(LIB_GLUE)
+ifeq ($(LINK_USING_CC),1)
+GLUE_LDFLAGS := $(call linkerprefix,$(LDFLAGS))
+else
+GLUE_LDFLAGS := $(LDFLAGS)
+endif
$(LIB_GLUE): mlx4_glue.o
- $Q $(LD) $(LDFLAGS) $(EXTRA_LDFLAGS) \
+ $Q $(LD) $(GLUE_LDFLAGS) $(EXTRA_LDFLAGS) \
-Wl,-h,$(LIB_GLUE) \
-s -shared -o $@ $< -libverbs -lmlx4
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index ee93dafe..a29814b3 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2012 6WIND S.A.
- * Copyright 2012 Mellanox
+ * Copyright 2012 Mellanox Technologies, Ltd
*/
/**
@@ -44,9 +44,15 @@
#include "mlx4.h"
#include "mlx4_glue.h"
#include "mlx4_flow.h"
+#include "mlx4_mr.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
+struct mlx4_dev_list mlx4_mem_event_cb_list =
+ LIST_HEAD_INITIALIZER(mlx4_mem_event_cb_list);
+
+rte_rwlock_t mlx4_mem_event_rwlock = RTE_RWLOCK_INITIALIZER;
+
/** Configuration structure for device arguments. */
struct mlx4_conf {
struct {
@@ -61,6 +67,8 @@ const char *pmd_mlx4_init_params[] = {
NULL,
};
+static void mlx4_dev_stop(struct rte_eth_dev *dev);
+
/**
* DPDK callback for Ethernet device configuration.
*
@@ -123,6 +131,9 @@ mlx4_dev_start(struct rte_eth_dev *dev)
(void *)dev, strerror(-ret));
goto err;
}
+#ifndef NDEBUG
+ mlx4_mr_dump_dev(dev);
+#endif
ret = mlx4_rxq_intr_enable(priv);
if (ret) {
ERROR("%p: interrupt handler installation failed",
@@ -143,8 +154,7 @@ mlx4_dev_start(struct rte_eth_dev *dev)
dev->rx_pkt_burst = mlx4_rx_burst;
return 0;
err:
- /* Rollback. */
- priv->started = 0;
+ mlx4_dev_stop(dev);
return ret;
}
@@ -194,10 +204,12 @@ mlx4_dev_close(struct rte_eth_dev *dev)
dev->tx_pkt_burst = mlx4_tx_burst_removed;
rte_wmb();
mlx4_flow_clean(priv);
+ mlx4_rss_deinit(priv);
for (i = 0; i != dev->data->nb_rx_queues; ++i)
mlx4_rx_queue_release(dev->data->rx_queues[i]);
for (i = 0; i != dev->data->nb_tx_queues; ++i)
mlx4_tx_queue_release(dev->data->tx_queues[i]);
+ mlx4_mr_release(dev);
if (priv->pd != NULL) {
assert(priv->ctx != NULL);
claim_zero(mlx4_glue->dealloc_pd(priv->pd));
@@ -385,6 +397,99 @@ free_kvlist:
return ret;
}
+/**
+ * Interpret RSS capabilities reported by device.
+ *
+ * This function returns the set of usable Verbs RSS hash fields, kernel
+ * quirks taken into account.
+ *
+ * @param ctx
+ * Verbs context.
+ * @param pd
+ * Verbs protection domain.
+ * @param device_attr_ex
+ * Extended device attributes to interpret.
+ *
+ * @return
+ * Usable RSS hash fields mask in Verbs format.
+ */
+static uint64_t
+mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
+ struct ibv_device_attr_ex *device_attr_ex)
+{
+ uint64_t hw_rss_sup = device_attr_ex->rss_caps.rx_hash_fields_mask;
+ struct ibv_cq *cq = NULL;
+ struct ibv_wq *wq = NULL;
+ struct ibv_rwq_ind_table *ind = NULL;
+ struct ibv_qp *qp = NULL;
+
+ if (!hw_rss_sup) {
+ WARN("no RSS capabilities reported; disabling support for UDP"
+ " RSS and inner VXLAN RSS");
+ return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
+ IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 |
+ IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP;
+ }
+ if (!(hw_rss_sup & IBV_RX_HASH_INNER))
+ return hw_rss_sup;
+ /*
+ * Although reported as supported, missing code in some Linux
+ * versions (v4.15, v4.16) prevents the creation of hash QPs with
+ * inner capability.
+ *
+ * There is no choice but to attempt to instantiate a temporary RSS
+ * context in order to confirm its support.
+ */
+ cq = mlx4_glue->create_cq(ctx, 1, NULL, NULL, 0);
+ wq = cq ? mlx4_glue->create_wq
+ (ctx,
+ &(struct ibv_wq_init_attr){
+ .wq_type = IBV_WQT_RQ,
+ .max_wr = 1,
+ .max_sge = 1,
+ .pd = pd,
+ .cq = cq,
+ }) : NULL;
+ ind = wq ? mlx4_glue->create_rwq_ind_table
+ (ctx,
+ &(struct ibv_rwq_ind_table_init_attr){
+ .log_ind_tbl_size = 0,
+ .ind_tbl = &wq,
+ .comp_mask = 0,
+ }) : NULL;
+ qp = ind ? mlx4_glue->create_qp_ex
+ (ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .comp_mask =
+ (IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_RX_HASH |
+ IBV_QP_INIT_ATTR_IND_TABLE),
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .pd = pd,
+ .rwq_ind_tbl = ind,
+ .rx_hash_conf = {
+ .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE,
+ .rx_hash_key = mlx4_rss_hash_key_default,
+ .rx_hash_fields_mask = hw_rss_sup,
+ },
+ }) : NULL;
+ if (!qp) {
+ WARN("disabling unusable inner RSS capability due to kernel"
+ " quirk");
+ hw_rss_sup &= ~IBV_RX_HASH_INNER;
+ } else {
+ claim_zero(mlx4_glue->destroy_qp(qp));
+ }
+ if (ind)
+ claim_zero(mlx4_glue->destroy_rwq_ind_table(ind));
+ if (wq)
+ claim_zero(mlx4_glue->destroy_wq(wq));
+ if (cq)
+ claim_zero(mlx4_glue->destroy_cq(cq));
+ return hw_rss_sup;
+}
+
static struct rte_pci_driver mlx4_driver;
/**
@@ -562,22 +667,15 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
(device_attr.vendor_part_id ==
PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO);
DEBUG("L2 tunnel checksum offloads are %ssupported",
- (priv->hw_csum_l2tun ? "" : "not "));
- priv->hw_rss_sup = device_attr_ex.rss_caps.rx_hash_fields_mask;
- if (!priv->hw_rss_sup) {
- WARN("no RSS capabilities reported; disabling support"
- " for UDP RSS and inner VXLAN RSS");
- /* Fake support for all possible RSS hash fields. */
- priv->hw_rss_sup = ~UINT64_C(0);
- priv->hw_rss_sup = mlx4_conv_rss_hf(priv, -1);
- /* Filter out known unsupported fields. */
- priv->hw_rss_sup &=
- ~(uint64_t)(IBV_RX_HASH_SRC_PORT_UDP |
- IBV_RX_HASH_DST_PORT_UDP |
- IBV_RX_HASH_INNER);
- }
+ priv->hw_csum_l2tun ? "" : "not ");
+ priv->hw_rss_sup = mlx4_hw_rss_sup(priv->ctx, priv->pd,
+ &device_attr_ex);
DEBUG("supported RSS hash fields mask: %016" PRIx64,
priv->hw_rss_sup);
+ priv->hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
+ IBV_RAW_PACKET_CAP_SCATTER_FCS);
+ DEBUG("FCS stripping toggling is %ssupported",
+ priv->hw_fcs_strip ? "" : "not ");
/* Configure the first MAC address by default. */
if (mlx4_get_mac(priv, &mac.addr_bytes)) {
ERROR("cannot get MAC address, is mlx4_en loaded?"
@@ -649,6 +747,24 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
/* Update link status once if waiting for LSC. */
if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
mlx4_link_update(eth_dev, 0);
+ /*
+ * Once the device is added to the list of memory event
+ * callback, its global MR cache table cannot be expanded
+ * on the fly because of deadlock. If it overflows, lookup
+ * should be done by searching MR list linearly, which is slow.
+ */
+ err = mlx4_mr_btree_init(&priv->mr.cache,
+ MLX4_MR_BTREE_CACHE_N * 2,
+ eth_dev->device->numa_node);
+ if (err) {
+ /* rte_errno is already set. */
+ goto port_error;
+ }
+ /* Add device to memory callback list. */
+ rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+ LIST_INSERT_HEAD(&mlx4_mem_event_cb_list, priv, mem_event_cb);
+ rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+ rte_eth_dev_probing_finish(eth_dev);
continue;
port_error:
rte_free(priv);
@@ -708,11 +824,53 @@ static struct rte_pci_driver mlx4_driver = {
#ifdef RTE_LIBRTE_MLX4_DLOPEN_DEPS
/**
+ * Suffix RTE_EAL_PMD_PATH with "-glue".
+ *
+ * This function performs a sanity check on RTE_EAL_PMD_PATH before
+ * suffixing its last component.
+ *
+ * @param buf[out]
+ * Output buffer, should be large enough otherwise NULL is returned.
+ * @param size
+ * Size of @p out.
+ *
+ * @return
+ * Pointer to @p buf or @p NULL in case suffix cannot be appended.
+ */
+static char *
+mlx4_glue_path(char *buf, size_t size)
+{
+ static const char *const bad[] = { "/", ".", "..", NULL };
+ const char *path = RTE_EAL_PMD_PATH;
+ size_t len = strlen(path);
+ size_t off;
+ int i;
+
+ while (len && path[len - 1] == '/')
+ --len;
+ for (off = len; off && path[off - 1] != '/'; --off)
+ ;
+ for (i = 0; bad[i]; ++i)
+ if (!strncmp(path + off, bad[i], (int)(len - off)))
+ goto error;
+ i = snprintf(buf, size, "%.*s-glue", (int)len, path);
+ if (i == -1 || (size_t)i >= size)
+ goto error;
+ return buf;
+error:
+ ERROR("unable to append \"-glue\" to last component of"
+ " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
+ " please re-configure DPDK");
+ return NULL;
+}
+
+/**
* Initialization routine for run-time dependency on rdma-core.
*/
static int
mlx4_glue_init(void)
{
+ char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
const char *path[] = {
/*
* A basic security check is necessary before trusting
@@ -720,7 +878,13 @@ mlx4_glue_init(void)
*/
(geteuid() == getuid() && getegid() == getgid() ?
getenv("MLX4_GLUE_PATH") : NULL),
- RTE_EAL_PMD_PATH,
+ /*
+ * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
+ * variant, otherwise let dlopen() look up libraries on its
+ * own.
+ */
+ (*RTE_EAL_PMD_PATH ?
+ mlx4_glue_path(glue_path, sizeof(glue_path)) : ""),
};
unsigned int i = 0;
void *handle = NULL;
@@ -828,6 +992,8 @@ rte_mlx4_pmd_init(void)
}
mlx4_glue->fork_init();
rte_pci_register(&mlx4_driver);
+ rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
+ mlx4_mr_mem_event_cb, NULL);
}
RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 19c8a223..300cb4d7 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2012 6WIND S.A.
- * Copyright 2012 Mellanox
+ * Copyright 2012 Mellanox Technologies, Ltd
*/
#ifndef RTE_PMD_MLX4_H_
@@ -23,7 +23,9 @@
#include <rte_ether.h>
#include <rte_interrupts.h>
#include <rte_mempool.h>
-#include <rte_spinlock.h>
+#include <rte_rwlock.h>
+
+#include "mlx4_mr.h"
#ifndef IBV_RX_HASH_INNER
/** This is not necessarily defined by supported RDMA core versions. */
@@ -42,17 +44,6 @@
/** Fixed RSS hash key size in bytes. Cannot be modified. */
#define MLX4_RSS_HASH_KEY_SIZE 40
-/**
- * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
- * from which buffers are to be transmitted will have to be mapped by this
- * driver to their own Memory Region (MR). This is a slow operation.
- *
- * This value is always 1 for RX queues.
- */
-#ifndef MLX4_PMD_TX_MP_CACHE
-#define MLX4_PMD_TX_MP_CACHE 8
-#endif
-
/** Interrupt alarm timeout value in microseconds. */
#define MLX4_INTR_ALARM_TIMEOUT 100000
@@ -78,20 +69,12 @@ struct rxq;
struct txq;
struct rte_flow;
-/** Memory region descriptor. */
-struct mlx4_mr {
- LIST_ENTRY(mlx4_mr) next; /**< Next entry in list. */
- uintptr_t start; /**< Base address for memory region. */
- uintptr_t end; /**< End address for memory region. */
- uint32_t lkey; /**< L_Key extracted from @p mr. */
- uint32_t refcnt; /**< Reference count for this object. */
- struct priv *priv; /**< Back pointer to private data. */
- struct ibv_mr *mr; /**< Memory region associated with @p mp. */
- struct rte_mempool *mp; /**< Target memory pool (mempool). */
-};
+LIST_HEAD(mlx4_dev_list, priv);
+LIST_HEAD(mlx4_mr_list, mlx4_mr);
/** Private data structure. */
struct priv {
+ LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
struct rte_eth_dev *dev; /**< Ethernet device. */
struct ibv_context *ctx; /**< Verbs context. */
struct ibv_device_attr device_attr; /**< Device properties. */
@@ -103,15 +86,22 @@ struct priv {
uint32_t vf:1; /**< This is a VF device. */
uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */
uint32_t isolated:1; /**< Toggle isolated mode. */
+ uint32_t rss_init:1; /**< Common RSS context is initialized. */
uint32_t hw_csum:1; /**< Checksum offload is supported. */
uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
+ uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
+ struct {
+ uint32_t dev_gen; /* Generation number to flush local caches. */
+ rte_rwlock_t rwlock; /* MR Lock. */
+ struct mlx4_mr_btree cache; /* Global MR cache table. */
+ struct mlx4_mr_list mr_list; /* Registered MR list. */
+ struct mlx4_mr_list mr_free_list; /* Freed MR list. */
+ } mr;
LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */
LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */
- LIST_HEAD(, mlx4_mr) mr; /**< Registered memory regions. */
- rte_spinlock_t mr_lock; /**< Lock for @p mr access. */
struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
/**< Configured MAC addresses. Unused entries are zeroed. */
};
@@ -131,7 +121,7 @@ void mlx4_allmulticast_disable(struct rte_eth_dev *dev);
void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
uint32_t index, uint32_t vmdq);
-void mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr);
+int mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr);
int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
void mlx4_stats_reset(struct rte_eth_dev *dev);
@@ -154,11 +144,4 @@ void mlx4_rxq_intr_disable(struct priv *priv);
int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
-/* mlx4_mr.c */
-
-struct mlx4_mr *mlx4_mr_get(struct priv *priv, struct rte_mempool *mp);
-void mlx4_mr_put(struct mlx4_mr *mr);
-uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
- uint32_t i);
-
#endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c
index 3bc69273..30deb3ef 100644
--- a/drivers/net/mlx4/mlx4_ethdev.c
+++ b/drivers/net/mlx4/mlx4_ethdev.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -39,6 +39,7 @@
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_pci.h>
+#include <rte_string_fns.h>
#include "mlx4.h"
#include "mlx4_flow.h"
@@ -120,7 +121,7 @@ try_dev_id:
goto try_dev_id;
dev_port_prev = dev_port;
if (dev_port == (priv->port - 1u))
- snprintf(match, sizeof(match), "%s", name);
+ strlcpy(match, name, sizeof(match));
}
closedir(dir);
if (match[0] == '\0') {
@@ -132,167 +133,6 @@ try_dev_id:
}
/**
- * Read from sysfs entry.
- *
- * @param[in] priv
- * Pointer to private structure.
- * @param[in] entry
- * Entry name relative to sysfs path.
- * @param[out] buf
- * Data output buffer.
- * @param size
- * Buffer size.
- *
- * @return
- * Number of bytes read on success, negative errno value otherwise and
- * rte_errno is set.
- */
-static int
-mlx4_sysfs_read(const struct priv *priv, const char *entry,
- char *buf, size_t size)
-{
- char ifname[IF_NAMESIZE];
- FILE *file;
- int ret;
-
- ret = mlx4_get_ifname(priv, &ifname);
- if (ret)
- return ret;
-
- MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
- ifname, entry);
-
- file = fopen(path, "rb");
- if (file == NULL) {
- rte_errno = errno;
- return -rte_errno;
- }
- ret = fread(buf, 1, size, file);
- if ((size_t)ret < size && ferror(file)) {
- rte_errno = EIO;
- ret = -rte_errno;
- } else {
- ret = size;
- }
- fclose(file);
- return ret;
-}
-
-/**
- * Write to sysfs entry.
- *
- * @param[in] priv
- * Pointer to private structure.
- * @param[in] entry
- * Entry name relative to sysfs path.
- * @param[in] buf
- * Data buffer.
- * @param size
- * Buffer size.
- *
- * @return
- * Number of bytes written on success, negative errno value otherwise and
- * rte_errno is set.
- */
-static int
-mlx4_sysfs_write(const struct priv *priv, const char *entry,
- char *buf, size_t size)
-{
- char ifname[IF_NAMESIZE];
- FILE *file;
- int ret;
-
- ret = mlx4_get_ifname(priv, &ifname);
- if (ret)
- return ret;
-
- MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
- ifname, entry);
-
- file = fopen(path, "wb");
- if (file == NULL) {
- rte_errno = errno;
- return -rte_errno;
- }
- ret = fwrite(buf, 1, size, file);
- if ((size_t)ret < size || ferror(file)) {
- rte_errno = EIO;
- ret = -rte_errno;
- } else {
- ret = size;
- }
- fclose(file);
- return ret;
-}
-
-/**
- * Get unsigned long sysfs property.
- *
- * @param priv
- * Pointer to private structure.
- * @param[in] name
- * Entry name relative to sysfs path.
- * @param[out] value
- * Value output buffer.
- *
- * @return
- * 0 on success, negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx4_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
-{
- int ret;
- unsigned long value_ret;
- char value_str[32];
-
- ret = mlx4_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
- if (ret < 0) {
- DEBUG("cannot read %s value from sysfs: %s",
- name, strerror(rte_errno));
- return ret;
- }
- value_str[ret] = '\0';
- errno = 0;
- value_ret = strtoul(value_str, NULL, 0);
- if (errno) {
- rte_errno = errno;
- DEBUG("invalid %s value `%s': %s", name, value_str,
- strerror(rte_errno));
- return -rte_errno;
- }
- *value = value_ret;
- return 0;
-}
-
-/**
- * Set unsigned long sysfs property.
- *
- * @param priv
- * Pointer to private structure.
- * @param[in] name
- * Entry name relative to sysfs path.
- * @param value
- * Value to set.
- *
- * @return
- * 0 on success, negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx4_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
-{
- int ret;
- MKSTR(value_str, "%lu", value);
-
- ret = mlx4_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
- if (ret < 0) {
- DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
- name, value_str, value, strerror(rte_errno));
- return ret;
- }
- return 0;
-}
-
-/**
* Perform ifreq ioctl() on associated Ethernet device.
*
* @param[in] priv
@@ -361,12 +201,12 @@ mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
int
mlx4_mtu_get(struct priv *priv, uint16_t *mtu)
{
- unsigned long ulong_mtu = 0;
- int ret = mlx4_get_sysfs_ulong(priv, "mtu", &ulong_mtu);
+ struct ifreq request;
+ int ret = mlx4_ifreq(priv, SIOCGIFMTU, &request);
if (ret)
return ret;
- *mtu = ulong_mtu;
+ *mtu = request.ifr_mtu;
return 0;
}
@@ -385,20 +225,13 @@ int
mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
struct priv *priv = dev->data->dev_private;
- uint16_t new_mtu;
- int ret = mlx4_set_sysfs_ulong(priv, "mtu", mtu);
+ struct ifreq request = { .ifr_mtu = mtu, };
+ int ret = mlx4_ifreq(priv, SIOCSIFMTU, &request);
if (ret)
return ret;
- ret = mlx4_mtu_get(priv, &new_mtu);
- if (ret)
- return ret;
- if (new_mtu == mtu) {
- priv->mtu = mtu;
- return 0;
- }
- rte_errno = EINVAL;
- return -rte_errno;
+ priv->mtu = mtu;
+ return 0;
}
/**
@@ -417,14 +250,14 @@ mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
static int
mlx4_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
{
- unsigned long tmp = 0;
- int ret = mlx4_get_sysfs_ulong(priv, "flags", &tmp);
+ struct ifreq request;
+ int ret = mlx4_ifreq(priv, SIOCGIFFLAGS, &request);
if (ret)
return ret;
- tmp &= keep;
- tmp |= (flags & (~keep));
- return mlx4_set_sysfs_ulong(priv, "flags", tmp);
+ request.ifr_flags &= keep;
+ request.ifr_flags |= flags & ~keep;
+ return mlx4_ifreq(priv, SIOCSIFFLAGS, &request);
}
/**
@@ -701,11 +534,14 @@ mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
* Pointer to Ethernet device structure.
* @param mac_addr
* MAC address to register.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
-void
+int
mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
{
- mlx4_mac_addr_add(dev, mac_addr, 0, 0);
+ return mlx4_mac_addr_add(dev, mac_addr, 0, 0);
}
/**
@@ -723,7 +559,6 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
unsigned int max;
char ifname[IF_NAMESIZE];
- info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
/* FIXME: we should ask the device for these values. */
info->min_rx_bufsize = 32;
info->max_rx_pktlen = 65536;
@@ -752,6 +587,7 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
ETH_LINK_SPEED_20G |
ETH_LINK_SPEED_40G |
ETH_LINK_SPEED_56G;
+ info->flow_type_rss_offloads = mlx4_conv_rss_types(priv, 0, 1);
}
/**
@@ -878,7 +714,7 @@ mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete)
}
link_speed = ethtool_cmd_speed(&edata);
if (link_speed == -1)
- dev_link.link_speed = 0;
+ dev_link.link_speed = ETH_SPEED_NUM_NONE;
else
dev_link.link_speed = link_speed;
dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 2d55bfe0..b40e7e5c 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -76,69 +76,94 @@ struct mlx4_drop {
};
/**
- * Convert DPDK RSS hash fields to their Verbs equivalent.
+ * Convert supported RSS hash field types between DPDK and Verbs formats.
*
- * This function returns the supported (default) set when @p rss_hf has
- * special value (uint64_t)-1.
+ * This function returns the supported (default) set when @p types has
+ * special value 0.
*
* @param priv
* Pointer to private structure.
- * @param rss_hf
- * Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ * @param types
+ * Depending on @p verbs_to_dpdk, hash types in either DPDK (see struct
+ * rte_eth_rss_conf) or Verbs format.
+ * @param verbs_to_dpdk
+ * A zero value converts @p types from DPDK to Verbs, a nonzero value
+ * performs the reverse operation.
*
* @return
- * A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1
- * otherwise and rte_errno is set.
+ * Converted RSS hash fields on success, (uint64_t)-1 otherwise and
+ * rte_errno is set.
*/
uint64_t
-mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
+mlx4_conv_rss_types(struct priv *priv, uint64_t types, int verbs_to_dpdk)
{
- enum { IPV4, IPV6, TCP, UDP, };
- const uint64_t in[] = {
- [IPV4] = (ETH_RSS_IPV4 |
- ETH_RSS_FRAG_IPV4 |
- ETH_RSS_NONFRAG_IPV4_TCP |
- ETH_RSS_NONFRAG_IPV4_UDP |
- ETH_RSS_NONFRAG_IPV4_OTHER),
- [IPV6] = (ETH_RSS_IPV6 |
- ETH_RSS_FRAG_IPV6 |
- ETH_RSS_NONFRAG_IPV6_TCP |
- ETH_RSS_NONFRAG_IPV6_UDP |
- ETH_RSS_NONFRAG_IPV6_OTHER |
- ETH_RSS_IPV6_EX |
- ETH_RSS_IPV6_TCP_EX |
- ETH_RSS_IPV6_UDP_EX),
- [TCP] = (ETH_RSS_NONFRAG_IPV4_TCP |
- ETH_RSS_NONFRAG_IPV6_TCP |
- ETH_RSS_IPV6_TCP_EX),
- [UDP] = (ETH_RSS_NONFRAG_IPV4_UDP |
- ETH_RSS_NONFRAG_IPV6_UDP |
- ETH_RSS_IPV6_UDP_EX),
+ enum {
+ INNER,
+ IPV4, IPV4_1, IPV4_2, IPV6, IPV6_1, IPV6_2, IPV6_3,
+ TCP, UDP,
+ IPV4_TCP, IPV4_UDP, IPV6_TCP, IPV6_TCP_1, IPV6_UDP, IPV6_UDP_1,
};
- const uint64_t out[RTE_DIM(in)] = {
- [IPV4] = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
- [IPV6] = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6,
- [TCP] = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP,
- [UDP] = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP,
+ enum {
+ VERBS_IPV4 = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
+ VERBS_IPV6 = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6,
+ VERBS_TCP = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP,
+ VERBS_UDP = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP,
};
+ static const uint64_t dpdk[] = {
+ [INNER] = 0,
+ [IPV4] = ETH_RSS_IPV4,
+ [IPV4_1] = ETH_RSS_FRAG_IPV4,
+ [IPV4_2] = ETH_RSS_NONFRAG_IPV4_OTHER,
+ [IPV6] = ETH_RSS_IPV6,
+ [IPV6_1] = ETH_RSS_FRAG_IPV6,
+ [IPV6_2] = ETH_RSS_NONFRAG_IPV6_OTHER,
+ [IPV6_3] = ETH_RSS_IPV6_EX,
+ [TCP] = 0,
+ [UDP] = 0,
+ [IPV4_TCP] = ETH_RSS_NONFRAG_IPV4_TCP,
+ [IPV4_UDP] = ETH_RSS_NONFRAG_IPV4_UDP,
+ [IPV6_TCP] = ETH_RSS_NONFRAG_IPV6_TCP,
+ [IPV6_TCP_1] = ETH_RSS_IPV6_TCP_EX,
+ [IPV6_UDP] = ETH_RSS_NONFRAG_IPV6_UDP,
+ [IPV6_UDP_1] = ETH_RSS_IPV6_UDP_EX,
+ };
+ static const uint64_t verbs[RTE_DIM(dpdk)] = {
+ [INNER] = IBV_RX_HASH_INNER,
+ [IPV4] = VERBS_IPV4,
+ [IPV4_1] = VERBS_IPV4,
+ [IPV4_2] = VERBS_IPV4,
+ [IPV6] = VERBS_IPV6,
+ [IPV6_1] = VERBS_IPV6,
+ [IPV6_2] = VERBS_IPV6,
+ [IPV6_3] = VERBS_IPV6,
+ [TCP] = VERBS_TCP,
+ [UDP] = VERBS_UDP,
+ [IPV4_TCP] = VERBS_IPV4 | VERBS_TCP,
+ [IPV4_UDP] = VERBS_IPV4 | VERBS_UDP,
+ [IPV6_TCP] = VERBS_IPV6 | VERBS_TCP,
+ [IPV6_TCP_1] = VERBS_IPV6 | VERBS_TCP,
+ [IPV6_UDP] = VERBS_IPV6 | VERBS_UDP,
+ [IPV6_UDP_1] = VERBS_IPV6 | VERBS_UDP,
+ };
+ const uint64_t *in = verbs_to_dpdk ? verbs : dpdk;
+ const uint64_t *out = verbs_to_dpdk ? dpdk : verbs;
uint64_t seen = 0;
uint64_t conv = 0;
unsigned int i;
- for (i = 0; i != RTE_DIM(in); ++i)
- if (rss_hf & in[i]) {
- seen |= rss_hf & in[i];
+ if (!types) {
+ if (!verbs_to_dpdk)
+ return priv->hw_rss_sup;
+ types = priv->hw_rss_sup;
+ }
+ for (i = 0; i != RTE_DIM(dpdk); ++i)
+ if (in[i] && (types & in[i]) == in[i]) {
+ seen |= types & in[i];
conv |= out[i];
}
- if ((conv & priv->hw_rss_sup) == conv) {
- if (rss_hf == (uint64_t)-1) {
- /* Include inner RSS by default if supported. */
- conv |= priv->hw_rss_sup & IBV_RX_HASH_INNER;
- return conv;
- }
- if (!(rss_hf & ~seen))
- return conv;
- }
+ if ((verbs_to_dpdk || (conv & priv->hw_rss_sup) == conv) &&
+ !(types & ~seen))
+ return conv;
rte_errno = ENOTSUP;
return (uint64_t)-1;
}
@@ -362,6 +387,9 @@ error:
* Additional mlx4-specific constraints on supported fields:
*
* - No support for partial masks.
+ * - Due to HW/FW limitation, flow rule priority is not taken into account
+ * when matching UDP destination ports, doing is therefore only supported
+ * at the highest priority level (0).
*
* @param[in, out] flow
* Flow rule handle to update.
@@ -393,6 +421,11 @@ mlx4_flow_merge_udp(struct rte_flow *flow,
msg = "mlx4 does not support matching partial UDP fields";
goto error;
}
+ if (mask && mask->hdr.dst_port && flow->priority) {
+ msg = "combining UDP destination port matching with a nonzero"
+ " priority level is not supported";
+ goto error;
+ }
if (!flow->ibv_attr)
return 0;
++flow->ibv_attr->num_of_specs;
@@ -637,6 +670,7 @@ mlx4_flow_prepare(struct priv *priv,
struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
struct rte_flow *flow = &temp;
const char *msg = NULL;
+ int overlap;
if (attr->group)
return rte_flow_error_set
@@ -651,12 +685,18 @@ mlx4_flow_prepare(struct priv *priv,
return rte_flow_error_set
(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
NULL, "egress is not supported");
+ if (attr->transfer)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+ NULL, "transfer is not supported");
if (!attr->ingress)
return rte_flow_error_set
(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
NULL, "only ingress is supported");
fill:
+ overlap = 0;
proc = mlx4_flow_proc_item_list;
+ flow->priority = attr->priority;
/* Go over pattern. */
for (item = pattern; item->type; ++item) {
const struct mlx4_flow_proc_item *next = NULL;
@@ -702,14 +742,24 @@ fill:
}
/* Go over actions list. */
for (action = actions; action->type; ++action) {
+ /* This one may appear anywhere multiple times. */
+ if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+ continue;
+ /* Fate-deciding actions may appear exactly once. */
+ if (overlap) {
+ msg = "cannot combine several fate-deciding actions,"
+ " choose between DROP, QUEUE or RSS";
+ goto exit_action_not_supported;
+ }
+ overlap = 1;
switch (action->type) {
const struct rte_flow_action_queue *queue;
const struct rte_flow_action_rss *rss;
- const struct rte_eth_rss_conf *rss_conf;
+ const uint8_t *rss_key;
+ uint32_t rss_key_len;
+ uint64_t fields;
unsigned int i;
- case RTE_FLOW_ACTION_TYPE_VOID:
- continue;
case RTE_FLOW_ACTION_TYPE_DROP:
flow->drop = 1;
break;
@@ -736,54 +786,68 @@ fill:
break;
rss = action->conf;
/* Default RSS configuration if none is provided. */
- rss_conf =
- rss->rss_conf ?
- rss->rss_conf :
- &(struct rte_eth_rss_conf){
- .rss_key = mlx4_rss_hash_key_default,
- .rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
- .rss_hf = -1,
- };
+ if (rss->key_len) {
+ rss_key = rss->key;
+ rss_key_len = rss->key_len;
+ } else {
+ rss_key = mlx4_rss_hash_key_default;
+ rss_key_len = MLX4_RSS_HASH_KEY_SIZE;
+ }
/* Sanity checks. */
- for (i = 0; i < rss->num; ++i)
+ for (i = 0; i < rss->queue_num; ++i)
if (rss->queue[i] >=
priv->dev->data->nb_rx_queues)
break;
- if (i != rss->num) {
+ if (i != rss->queue_num) {
msg = "queue index target beyond number of"
" configured Rx queues";
goto exit_action_not_supported;
}
- if (!rte_is_power_of_2(rss->num)) {
+ if (!rte_is_power_of_2(rss->queue_num)) {
msg = "for RSS, mlx4 requires the number of"
" queues to be a power of two";
goto exit_action_not_supported;
}
- if (rss_conf->rss_key_len !=
- sizeof(flow->rss->key)) {
+ if (rss_key_len != sizeof(flow->rss->key)) {
msg = "mlx4 supports exactly one RSS hash key"
" length: "
MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
goto exit_action_not_supported;
}
- for (i = 1; i < rss->num; ++i)
+ for (i = 1; i < rss->queue_num; ++i)
if (rss->queue[i] - rss->queue[i - 1] != 1)
break;
- if (i != rss->num) {
+ if (i != rss->queue_num) {
msg = "mlx4 requires RSS contexts to use"
" consecutive queue indices only";
goto exit_action_not_supported;
}
- if (rss->queue[0] % rss->num) {
+ if (rss->queue[0] % rss->queue_num) {
msg = "mlx4 requires the first queue of a RSS"
" context to be aligned on a multiple"
" of the context size";
goto exit_action_not_supported;
}
+ if (rss->func &&
+ rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+ msg = "the only supported RSS hash function"
+ " is Toeplitz";
+ goto exit_action_not_supported;
+ }
+ if (rss->level) {
+ msg = "a nonzero RSS encapsulation level is"
+ " not supported";
+ goto exit_action_not_supported;
+ }
+ rte_errno = 0;
+ fields = mlx4_conv_rss_types(priv, rss->types, 0);
+ if (fields == (uint64_t)-1 && rte_errno) {
+ msg = "unsupported RSS hash type requested";
+ goto exit_action_not_supported;
+ }
flow->rss = mlx4_rss_get
- (priv,
- mlx4_conv_rss_hf(priv, rss_conf->rss_hf),
- rss_conf->rss_key, rss->num, rss->queue);
+ (priv, fields, rss_key, rss->queue_num,
+ rss->queue);
if (!flow->rss) {
msg = "either invalid parameters or not enough"
" resources for additional multi-queue"
@@ -795,10 +859,9 @@ fill:
goto exit_action_not_supported;
}
}
- if (!flow->rss && !flow->drop)
- return rte_flow_error_set
- (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
- NULL, "no valid action");
+ /* When fate is unknown, drop traffic. */
+ if (!overlap)
+ flow->drop = 1;
/* Validation ends here. */
if (!addr) {
if (flow->rss)
@@ -820,11 +883,14 @@ fill:
},
};
- if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec)))
+ if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec))) {
+ if (temp.rss)
+ mlx4_rss_put(temp.rss);
return rte_flow_error_set
(error, -rte_errno,
RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
"flow rule handle allocation failure");
+ }
/* Most fields will be updated by second pass. */
*flow = (struct rte_flow){
.ibv_attr = temp.ibv_attr,
@@ -1264,14 +1330,20 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
*/
uint32_t queues =
rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
- alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
- [offsetof(struct rte_flow_action_rss, queue) +
- sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
- struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+ uint16_t queue[queues];
+ struct rte_flow_action_rss action_rss = {
+ .func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+ .level = 0,
+ .types = 0,
+ .key_len = MLX4_RSS_HASH_KEY_SIZE,
+ .queue_num = queues,
+ .key = mlx4_rss_hash_key_default,
+ .queue = queue,
+ };
struct rte_flow_action actions[] = {
{
.type = RTE_FLOW_ACTION_TYPE_RSS,
- .conf = rss_conf,
+ .conf = &action_rss,
},
{
.type = RTE_FLOW_ACTION_TYPE_END,
@@ -1293,12 +1365,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
if (!queues)
goto error;
/* Prepare default RSS configuration. */
- *rss_conf = (struct rte_flow_action_rss){
- .rss_conf = NULL, /* Rely on default fallback settings. */
- .num = queues,
- };
for (i = 0; i != queues; ++i)
- rss_conf->queue[i] = i;
+ queue[i] = i;
/*
* Set up VLAN item if filtering is enabled and at least one VLAN
* filter is configured.
@@ -1357,7 +1425,7 @@ next_vlan:
if (j != sizeof(mac->addr_bytes))
continue;
if (flow->rss->queues != queues ||
- memcmp(flow->rss->queue_id, rss_conf->queue,
+ memcmp(flow->rss->queue_id, action_rss.queue,
queues * sizeof(flow->rss->queue_id[0])))
continue;
break;
@@ -1397,7 +1465,7 @@ next_vlan:
if (flow && flow->internal) {
assert(flow->rss);
if (flow->rss->queues != queues ||
- memcmp(flow->rss->queue_id, rss_conf->queue,
+ memcmp(flow->rss->queue_id, action_rss.queue,
queues * sizeof(flow->rss->queue_id[0])))
flow = NULL;
}
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index 00188a65..2917ebe9 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
#ifndef RTE_PMD_MLX4_FLOW_H_
@@ -42,12 +42,14 @@ struct rte_flow {
uint32_t promisc:1; /**< This rule matches everything. */
uint32_t allmulti:1; /**< This rule matches all multicast traffic. */
uint32_t drop:1; /**< This rule drops packets. */
+ uint32_t priority; /**< Flow rule priority. */
struct mlx4_rss *rss; /**< Rx target. */
};
/* mlx4_flow.c */
-uint64_t mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf);
+uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t types,
+ int verbs_to_dpdk);
int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
void mlx4_flow_clean(struct priv *priv);
int mlx4_filter_ctrl(struct rte_eth_dev *dev,
diff --git a/drivers/net/mlx4/mlx4_glue.c b/drivers/net/mlx4/mlx4_glue.c
index 3b79d320..67b3bfac 100644
--- a/drivers/net/mlx4/mlx4_glue.c
+++ b/drivers/net/mlx4/mlx4_glue.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2018 6WIND S.A.
- * Copyright 2018 Mellanox
+ * Copyright 2018 Mellanox Technologies, Ltd
*/
#include <stddef.h>
diff --git a/drivers/net/mlx4/mlx4_glue.h b/drivers/net/mlx4/mlx4_glue.h
index 368f906b..668ca867 100644
--- a/drivers/net/mlx4/mlx4_glue.h
+++ b/drivers/net/mlx4/mlx4_glue.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2018 6WIND S.A.
- * Copyright 2018 Mellanox
+ * Copyright 2018 Mellanox Technologies, Ltd
*/
#ifndef MLX4_GLUE_H_
diff --git a/drivers/net/mlx4/mlx4_intr.c b/drivers/net/mlx4/mlx4_intr.c
index 2141992e..eeb982a0 100644
--- a/drivers/net/mlx4/mlx4_intr.c
+++ b/drivers/net/mlx4/mlx4_intr.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 9a1e4de3..d23d3c61 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -30,237 +30,1152 @@
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_mempool.h>
-#include <rte_spinlock.h>
+#include <rte_rwlock.h>
#include "mlx4_glue.h"
+#include "mlx4_mr.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
-struct mlx4_check_mempool_data {
+struct mr_find_contig_memsegs_data {
+ uintptr_t addr;
+ uintptr_t start;
+ uintptr_t end;
+ const struct rte_memseg_list *msl;
+};
+
+struct mr_update_mp_data {
+ struct rte_eth_dev *dev;
+ struct mlx4_mr_ctrl *mr_ctrl;
int ret;
- char *start;
- char *end;
};
/**
- * Called by mlx4_check_mempool() when iterating the memory chunks.
- *
- * @param[in] mp
- * Pointer to memory pool (unused).
- * @param[in, out] data
- * Pointer to shared buffer with mlx4_check_mempool().
- * @param[in] memhdr
- * Pointer to mempool chunk header.
- * @param mem_idx
- * Mempool element index (unused).
+ * Expand B-tree table to a given size. Can't be called with holding
+ * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc().
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ * @param n
+ * Number of entries for expansion.
+ *
+ * @return
+ * 0 on success, -1 on failure.
*/
-static void
-mlx4_check_mempool_cb(struct rte_mempool *mp, void *opaque,
- struct rte_mempool_memhdr *memhdr,
- unsigned int mem_idx)
+static int
+mr_btree_expand(struct mlx4_mr_btree *bt, int n)
+{
+ void *mem;
+ int ret = 0;
+
+ if (n <= bt->size)
+ return ret;
+ /*
+ * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is
+ * used inside if there's no room to expand. Because this is a quite
+ * rare case and a part of very slow path, it is very acceptable.
+ * Initially cache_bh[] will be given practically enough space and once
+ * it is expanded, expansion wouldn't be needed again ever.
+ */
+ mem = rte_realloc(bt->table, n * sizeof(struct mlx4_mr_cache), 0);
+ if (mem == NULL) {
+ /* Not an error, B-tree search will be skipped. */
+ WARN("failed to expand MR B-tree (%p) table", (void *)bt);
+ ret = -1;
+ } else {
+ DEBUG("expanded MR B-tree table (size=%u)", n);
+ bt->table = mem;
+ bt->size = n;
+ }
+ return ret;
+}
+
+/**
+ * Look up LKey from given B-tree lookup table, store the last index and return
+ * searched LKey.
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ * @param[out] idx
+ * Pointer to index. Even on search failure, returns index where it stops
+ * searching so that index can be used when inserting a new entry.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mr_btree_lookup(struct mlx4_mr_btree *bt, uint16_t *idx, uintptr_t addr)
+{
+ struct mlx4_mr_cache *lkp_tbl;
+ uint16_t n;
+ uint16_t base = 0;
+
+ assert(bt != NULL);
+ lkp_tbl = *bt->table;
+ n = bt->len;
+ /* First entry must be NULL for comparison. */
+ assert(bt->len > 0 || (lkp_tbl[0].start == 0 &&
+ lkp_tbl[0].lkey == UINT32_MAX));
+ /* Binary search. */
+ do {
+ register uint16_t delta = n >> 1;
+
+ if (addr < lkp_tbl[base + delta].start) {
+ n = delta;
+ } else {
+ base += delta;
+ n -= delta;
+ }
+ } while (n > 1);
+ assert(addr >= lkp_tbl[base].start);
+ *idx = base;
+ if (addr < lkp_tbl[base].end)
+ return lkp_tbl[base].lkey;
+ /* Not found. */
+ return UINT32_MAX;
+}
+
+/**
+ * Insert an entry to B-tree lookup table.
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ * @param entry
+ * Pointer to new entry to insert.
+ *
+ * @return
+ * 0 on success, -1 on failure.
+ */
+static int
+mr_btree_insert(struct mlx4_mr_btree *bt, struct mlx4_mr_cache *entry)
{
- struct mlx4_check_mempool_data *data = opaque;
+ struct mlx4_mr_cache *lkp_tbl;
+ uint16_t idx = 0;
+ size_t shift;
- (void)mp;
- (void)mem_idx;
- /* It already failed, skip the next chunks. */
- if (data->ret != 0)
+ assert(bt != NULL);
+ assert(bt->len <= bt->size);
+ assert(bt->len > 0);
+ lkp_tbl = *bt->table;
+ /* Find out the slot for insertion. */
+ if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
+ DEBUG("abort insertion to B-tree(%p): already exist at"
+ " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+ (void *)bt, idx, entry->start, entry->end, entry->lkey);
+ /* Already exist, return. */
+ return 0;
+ }
+ /* If table is full, return error. */
+ if (unlikely(bt->len == bt->size)) {
+ bt->overflow = 1;
+ return -1;
+ }
+ /* Insert entry. */
+ ++idx;
+ shift = (bt->len - idx) * sizeof(struct mlx4_mr_cache);
+ if (shift)
+ memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
+ lkp_tbl[idx] = *entry;
+ bt->len++;
+ DEBUG("inserted B-tree(%p)[%u],"
+ " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+ (void *)bt, idx, entry->start, entry->end, entry->lkey);
+ return 0;
+}
+
+/**
+ * Initialize B-tree and allocate memory for lookup table.
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ * @param n
+ * Number of entries to allocate.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket)
+{
+ if (bt == NULL) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ memset(bt, 0, sizeof(*bt));
+ bt->table = rte_calloc_socket("B-tree table",
+ n, sizeof(struct mlx4_mr_cache),
+ 0, socket);
+ if (bt->table == NULL) {
+ rte_errno = ENOMEM;
+ ERROR("failed to allocate memory for btree cache on socket %d",
+ socket);
+ return -rte_errno;
+ }
+ bt->size = n;
+ /* First entry must be NULL for binary search. */
+ (*bt->table)[bt->len++] = (struct mlx4_mr_cache) {
+ .lkey = UINT32_MAX,
+ };
+ DEBUG("initialized B-tree %p with table %p",
+ (void *)bt, (void *)bt->table);
+ return 0;
+}
+
+/**
+ * Free B-tree resources.
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ */
+void
+mlx4_mr_btree_free(struct mlx4_mr_btree *bt)
+{
+ if (bt == NULL)
return;
- /* It is the first chunk. */
- if (data->start == NULL && data->end == NULL) {
- data->start = memhdr->addr;
- data->end = data->start + memhdr->len;
+ DEBUG("freeing B-tree %p with table %p", (void *)bt, (void *)bt->table);
+ rte_free(bt->table);
+ memset(bt, 0, sizeof(*bt));
+}
+
+#ifndef NDEBUG
+/**
+ * Dump all the entries in a B-tree
+ *
+ * @param bt
+ * Pointer to B-tree structure.
+ */
+void
+mlx4_mr_btree_dump(struct mlx4_mr_btree *bt)
+{
+ int idx;
+ struct mlx4_mr_cache *lkp_tbl;
+
+ if (bt == NULL)
return;
+ lkp_tbl = *bt->table;
+ for (idx = 0; idx < bt->len; ++idx) {
+ struct mlx4_mr_cache *entry = &lkp_tbl[idx];
+
+ DEBUG("B-tree(%p)[%u],"
+ " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
+ (void *)bt, idx, entry->start, entry->end, entry->lkey);
}
- if (data->end == memhdr->addr) {
- data->end += memhdr->len;
- return;
+}
+#endif
+
+/**
+ * Find virtually contiguous memory chunk in a given MR.
+ *
+ * @param dev
+ * Pointer to MR structure.
+ * @param[out] entry
+ * Pointer to returning MR cache entry. If not found, this will not be
+ * updated.
+ * @param start_idx
+ * Start index of the memseg bitmap.
+ *
+ * @return
+ * Next index to go on lookup.
+ */
+static int
+mr_find_next_chunk(struct mlx4_mr *mr, struct mlx4_mr_cache *entry,
+ int base_idx)
+{
+ uintptr_t start = 0;
+ uintptr_t end = 0;
+ uint32_t idx = 0;
+
+ for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
+ if (rte_bitmap_get(mr->ms_bmp, idx)) {
+ const struct rte_memseg_list *msl;
+ const struct rte_memseg *ms;
+
+ msl = mr->msl;
+ ms = rte_fbarray_get(&msl->memseg_arr,
+ mr->ms_base_idx + idx);
+ assert(msl->page_sz == ms->hugepage_sz);
+ if (!start)
+ start = ms->addr_64;
+ end = ms->addr_64 + ms->hugepage_sz;
+ } else if (start) {
+ /* Passed the end of a fragment. */
+ break;
+ }
}
- if (data->start == (char *)memhdr->addr + memhdr->len) {
- data->start -= memhdr->len;
- return;
+ if (start) {
+ /* Found one chunk. */
+ entry->start = start;
+ entry->end = end;
+ entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
}
- /* Error, mempool is not virtually contiguous. */
- data->ret = -1;
+ return idx;
}
/**
- * Check if a mempool can be used: it must be virtually contiguous.
+ * Insert a MR to the global B-tree cache. It may fail due to low-on-memory.
+ * Then, this entry will have to be searched by mr_lookup_dev_list() in
+ * mlx4_mr_create() on miss.
*
- * @param[in] mp
- * Pointer to memory pool.
- * @param[out] start
- * Pointer to the start address of the mempool virtual memory area.
- * @param[out] end
- * Pointer to the end address of the mempool virtual memory area.
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr
+ * Pointer to MR to insert.
*
* @return
- * 0 on success (mempool is virtually contiguous), -1 on error.
+ * 0 on success, -1 on failure.
*/
static int
-mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, uintptr_t *end)
+mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx4_mr *mr)
{
- struct mlx4_check_mempool_data data;
+ struct priv *priv = dev->data->dev_private;
+ unsigned int n;
- memset(&data, 0, sizeof(data));
- rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &data);
- *start = (uintptr_t)data.start;
- *end = (uintptr_t)data.end;
- return data.ret;
+ DEBUG("port %u inserting MR(%p) to global cache",
+ dev->data->port_id, (void *)mr);
+ for (n = 0; n < mr->ms_bmp_n; ) {
+ struct mlx4_mr_cache entry = { 0, };
+
+ /* Find a contiguous chunk and advance the index. */
+ n = mr_find_next_chunk(mr, &entry, n);
+ if (!entry.end)
+ break;
+ if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
+ /*
+ * Overflowed, but the global table cannot be expanded
+ * because of deadlock.
+ */
+ return -1;
+ }
+ }
+ return 0;
}
/**
- * Obtain a memory region from a memory pool.
+ * Look up address in the original global MR list.
*
- * If a matching memory region already exists, it is returned with its
- * reference count incremented, otherwise a new one is registered.
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] entry
+ * Pointer to returning MR cache entry. If no match, this will not be updated.
+ * @param addr
+ * Search key.
*
- * @param priv
- * Pointer to private structure.
- * @param mp
- * Pointer to memory pool.
+ * @return
+ * Found MR on match, NULL otherwise.
+ */
+static struct mlx4_mr *
+mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+ uintptr_t addr)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr *mr;
+
+ /* Iterate all the existing MRs. */
+ LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+ unsigned int n;
+
+ if (mr->ms_n == 0)
+ continue;
+ for (n = 0; n < mr->ms_bmp_n; ) {
+ struct mlx4_mr_cache ret = { 0, };
+
+ n = mr_find_next_chunk(mr, &ret, n);
+ if (addr >= ret.start && addr < ret.end) {
+ /* Found. */
+ *entry = ret;
+ return mr;
+ }
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Look up address on device.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] entry
+ * Pointer to returning MR cache entry. If no match, this will not be updated.
+ * @param addr
+ * Search key.
*
* @return
- * Memory region pointer, NULL in case of error and rte_errno is set.
+ * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
*/
-struct mlx4_mr *
-mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
+static uint32_t
+mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+ uintptr_t addr)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- uintptr_t start;
- uintptr_t end;
- unsigned int i;
+ struct priv *priv = dev->data->dev_private;
+ uint16_t idx;
+ uint32_t lkey = UINT32_MAX;
struct mlx4_mr *mr;
- if (mlx4_check_mempool(mp, &start, &end) != 0) {
- rte_errno = EINVAL;
- ERROR("mempool %p: not virtually contiguous",
- (void *)mp);
- return NULL;
+ /*
+ * If the global cache has overflowed since it failed to expand the
+ * B-tree table, it can't have all the existing MRs. Then, the address
+ * has to be searched by traversing the original MR list instead, which
+ * is very slow path. Otherwise, the global cache is all inclusive.
+ */
+ if (!unlikely(priv->mr.cache.overflow)) {
+ lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+ if (lkey != UINT32_MAX)
+ *entry = (*priv->mr.cache.table)[idx];
+ } else {
+ /* Falling back to the slowest path. */
+ mr = mr_lookup_dev_list(dev, entry, addr);
+ if (mr != NULL)
+ lkey = entry->lkey;
}
- DEBUG("mempool %p area start=%p end=%p size=%zu",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- /* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
-
- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
+ assert(lkey == UINT32_MAX || (addr >= entry->start &&
+ addr < entry->end));
+ return lkey;
+}
+
+/**
+ * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free()
+ * can raise memory free event and the callback function will spin on the lock.
+ *
+ * @param mr
+ * Pointer to MR to free.
+ */
+static void
+mr_free(struct mlx4_mr *mr)
+{
+ if (mr == NULL)
+ return;
+ DEBUG("freeing MR(%p):", (void *)mr);
+ if (mr->ibv_mr != NULL)
+ claim_zero(mlx4_glue->dereg_mr(mr->ibv_mr));
+ if (mr->ms_bmp != NULL)
+ rte_bitmap_free(mr->ms_bmp);
+ rte_free(mr);
+}
+
+/**
+ * Releass resources of detached MR having no online entry.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr *mr_next;
+ struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
+
+ /*
+ * MR can't be freed with holding the lock because rte_free() could call
+ * memory free callback function. This will be a deadlock situation.
+ */
+ rte_rwlock_write_lock(&priv->mr.rwlock);
+ /* Detach the whole free list and release it after unlocking. */
+ free_list = priv->mr.mr_free_list;
+ LIST_INIT(&priv->mr.mr_free_list);
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+ /* Release resources. */
+ mr_next = LIST_FIRST(&free_list);
+ while (mr_next != NULL) {
+ struct mlx4_mr *mr = mr_next;
+
+ mr_next = LIST_NEXT(mr, mr);
+ mr_free(mr);
}
- DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- rte_spinlock_lock(&priv->mr_lock);
- LIST_FOREACH(mr, &priv->mr, next)
- if (mp == mr->mp && start >= mr->start && end <= mr->end)
- break;
- if (mr) {
- ++mr->refcnt;
- goto release;
+}
+
+/* Called during rte_memseg_contig_walk() by mlx4_mr_create(). */
+static int
+mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg)
+{
+ struct mr_find_contig_memsegs_data *data = arg;
+
+ if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
+ return 0;
+ /* Found, save it and stop walking. */
+ data->start = ms->addr_64;
+ data->end = ms->addr_64 + len;
+ data->msl = msl;
+ return 1;
+}
+
+/**
+ * Create a new global Memroy Region (MR) for a missing virtual address.
+ * Register entire virtually contiguous memory chunk around the address.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] entry
+ * Pointer to returning MR cache entry, found in the global cache or newly
+ * created. If failed to create one, this will not be updated.
+ * @param addr
+ * Target virtual address to register.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
+ */
+static uint32_t
+mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
+ uintptr_t addr)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ const struct rte_memseg_list *msl;
+ const struct rte_memseg *ms;
+ struct mlx4_mr *mr = NULL;
+ size_t len;
+ uint32_t ms_n;
+ uint32_t bmp_size;
+ void *bmp_mem;
+ int ms_idx_shift = -1;
+ unsigned int n;
+ struct mr_find_contig_memsegs_data data = {
+ .addr = addr,
+ };
+ struct mr_find_contig_memsegs_data data_re;
+
+ DEBUG("port %u creating a MR using address (%p)",
+ dev->data->port_id, (void *)addr);
+ /*
+ * Release detached MRs if any. This can't be called with holding either
+ * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
+ * been detached by the memory free event but it couldn't be released
+ * inside the callback due to deadlock. As a result, releasing resources
+ * is quite opportunistic.
+ */
+ mlx4_mr_garbage_collect(dev);
+ /*
+ * Find out a contiguous virtual address chunk in use, to which the
+ * given address belongs, in order to register maximum range. In the
+ * best case where mempools are not dynamically recreated and
+ * '--socket-mem' is speicified as an EAL option, it is very likely to
+ * have only one MR(LKey) per a socket and per a hugepage-size even
+ * though the system memory is highly fragmented.
+ */
+ if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
+ WARN("port %u unable to find virtually contiguous"
+ " chunk for address (%p)."
+ " rte_memseg_contig_walk() failed.",
+ dev->data->port_id, (void *)addr);
+ rte_errno = ENXIO;
+ goto err_nolock;
}
- mr = rte_malloc(__func__, sizeof(*mr), 0);
- if (!mr) {
+alloc_resources:
+ /* Addresses must be page-aligned. */
+ assert(rte_is_aligned((void *)data.start, data.msl->page_sz));
+ assert(rte_is_aligned((void *)data.end, data.msl->page_sz));
+ msl = data.msl;
+ ms = rte_mem_virt2memseg((void *)data.start, msl);
+ len = data.end - data.start;
+ assert(msl->page_sz == ms->hugepage_sz);
+ /* Number of memsegs in the range. */
+ ms_n = len / msl->page_sz;
+ DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
+ " page_sz=0x%" PRIx64 ", ms_n=%u",
+ dev->data->port_id, (void *)addr,
+ data.start, data.end, msl->page_sz, ms_n);
+ /* Size of memory for bitmap. */
+ bmp_size = rte_bitmap_get_memory_footprint(ms_n);
+ mr = rte_zmalloc_socket(NULL,
+ RTE_ALIGN_CEIL(sizeof(*mr),
+ RTE_CACHE_LINE_SIZE) +
+ bmp_size,
+ RTE_CACHE_LINE_SIZE, msl->socket_id);
+ if (mr == NULL) {
+ WARN("port %u unable to allocate memory for a new MR of"
+ " address (%p).",
+ dev->data->port_id, (void *)addr);
rte_errno = ENOMEM;
- goto release;
+ goto err_nolock;
}
- *mr = (struct mlx4_mr){
- .start = start,
- .end = end,
- .refcnt = 1,
- .priv = priv,
- .mr = mlx4_glue->reg_mr(priv->pd, (void *)start, end - start,
- IBV_ACCESS_LOCAL_WRITE),
- .mp = mp,
- };
- if (mr->mr) {
- mr->lkey = mr->mr->lkey;
- LIST_INSERT_HEAD(&priv->mr, mr, next);
- } else {
- rte_free(mr);
- mr = NULL;
- rte_errno = errno ? errno : EINVAL;
+ mr->msl = msl;
+ /*
+ * Save the index of the first memseg and initialize memseg bitmap. To
+ * see if a memseg of ms_idx in the memseg-list is still valid, check:
+ * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx)
+ */
+ mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
+ mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
+ if (mr->ms_bmp == NULL) {
+ WARN("port %u unable to initialize bitamp for a new MR of"
+ " address (%p).",
+ dev->data->port_id, (void *)addr);
+ rte_errno = EINVAL;
+ goto err_nolock;
+ }
+ /*
+ * Should recheck whether the extended contiguous chunk is still valid.
+ * Because memory_hotplug_lock can't be held if there's any memory
+ * related calls in a critical path, resource allocation above can't be
+ * locked. If the memory has been changed at this point, try again with
+ * just single page. If not, go on with the big chunk atomically from
+ * here.
+ */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ data_re = data;
+ if (len > msl->page_sz &&
+ !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
+ WARN("port %u unable to find virtually contiguous"
+ " chunk for address (%p)."
+ " rte_memseg_contig_walk() failed.",
+ dev->data->port_id, (void *)addr);
+ rte_errno = ENXIO;
+ goto err_memlock;
+ }
+ if (data.start != data_re.start || data.end != data_re.end) {
+ /*
+ * The extended contiguous chunk has been changed. Try again
+ * with single memseg instead.
+ */
+ data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
+ data.end = data.start + msl->page_sz;
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ mr_free(mr);
+ goto alloc_resources;
}
-release:
- rte_spinlock_unlock(&priv->mr_lock);
- return mr;
+ assert(data.msl == data_re.msl);
+ rte_rwlock_write_lock(&priv->mr.rwlock);
+ /*
+ * Check the address is really missing. If other thread already created
+ * one or it is not found due to overflow, abort and return.
+ */
+ if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) {
+ /*
+ * Insert to the global cache table. It may fail due to
+ * low-on-memory. Then, this entry will have to be searched
+ * here again.
+ */
+ mr_btree_insert(&priv->mr.cache, entry);
+ DEBUG("port %u found MR for %p on final lookup, abort",
+ dev->data->port_id, (void *)addr);
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ /*
+ * Must be unlocked before calling rte_free() because
+ * mlx4_mr_mem_event_free_cb() can be called inside.
+ */
+ mr_free(mr);
+ return entry->lkey;
+ }
+ /*
+ * Trim start and end addresses for verbs MR. Set bits for registering
+ * memsegs but exclude already registered ones. Bitmap can be
+ * fragmented.
+ */
+ for (n = 0; n < ms_n; ++n) {
+ uintptr_t start;
+ struct mlx4_mr_cache ret = { 0, };
+
+ start = data_re.start + n * msl->page_sz;
+ /* Exclude memsegs already registered by other MRs. */
+ if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) {
+ /*
+ * Start from the first unregistered memseg in the
+ * extended range.
+ */
+ if (ms_idx_shift == -1) {
+ mr->ms_base_idx += n;
+ data.start = start;
+ ms_idx_shift = n;
+ }
+ data.end = start + msl->page_sz;
+ rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
+ ++mr->ms_n;
+ }
+ }
+ len = data.end - data.start;
+ mr->ms_bmp_n = len / msl->page_sz;
+ assert(ms_idx_shift + mr->ms_bmp_n <= ms_n);
+ /*
+ * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be
+ * called with holding the memory lock because it doesn't use
+ * mlx4_alloc_buf_extern() which eventually calls rte_malloc_socket()
+ * through mlx4_alloc_verbs_buf().
+ */
+ mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)data.start, len,
+ IBV_ACCESS_LOCAL_WRITE);
+ if (mr->ibv_mr == NULL) {
+ WARN("port %u fail to create a verbs MR for address (%p)",
+ dev->data->port_id, (void *)addr);
+ rte_errno = EINVAL;
+ goto err_mrlock;
+ }
+ assert((uintptr_t)mr->ibv_mr->addr == data.start);
+ assert(mr->ibv_mr->length == len);
+ LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+ DEBUG("port %u MR CREATED (%p) for %p:\n"
+ " [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
+ " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
+ dev->data->port_id, (void *)mr, (void *)addr,
+ data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey),
+ mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
+ /* Insert to the global cache table. */
+ mr_insert_dev_cache(dev, mr);
+ /* Fill in output data. */
+ mr_lookup_dev(dev, entry, addr);
+ /* Lookup can't fail. */
+ assert(entry->lkey != UINT32_MAX);
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return entry->lkey;
+err_mrlock:
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+err_memlock:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+err_nolock:
+ /*
+ * In case of error, as this can be called in a datapath, a warning
+ * message per an error is preferable instead. Must be unlocked before
+ * calling rte_free() because mlx4_mr_mem_event_free_cb() can be called
+ * inside.
+ */
+ mr_free(mr);
+ return UINT32_MAX;
}
/**
- * Release a memory region.
+ * Rebuild the global B-tree cache of device from the original MR list.
*
- * This function decrements its reference count and destroys it after
- * reaching 0.
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mr_rebuild_dev_cache(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr *mr;
+
+ DEBUG("port %u rebuild dev cache[]", dev->data->port_id);
+ /* Flush cache to rebuild. */
+ priv->mr.cache.len = 1;
+ priv->mr.cache.overflow = 0;
+ /* Iterate all the existing MRs. */
+ LIST_FOREACH(mr, &priv->mr.mr_list, mr)
+ if (mr_insert_dev_cache(dev, mr) < 0)
+ return;
+}
+
+/**
+ * Callback for memory free event. Iterate freed memsegs and check whether it
+ * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a
+ * result, the MR would be fragmented. If it becomes empty, the MR will be freed
+ * later by mlx4_mr_garbage_collect().
*
- * Note to avoid race conditions given this function may be used from the
- * data plane, it's extremely important that each user holds its own
- * reference.
+ * The global cache must be rebuilt if there's any change and this event has to
+ * be propagated to dataplane threads to flush the local caches.
*
- * @param mr
- * Memory region to release.
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param addr
+ * Address of freed memory.
+ * @param len
+ * Size of freed memory.
+ */
+static void
+mlx4_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
+{
+ struct priv *priv = dev->data->dev_private;
+ const struct rte_memseg_list *msl;
+ struct mlx4_mr *mr;
+ int ms_n;
+ int i;
+ int rebuild = 0;
+
+ DEBUG("port %u free callback: addr=%p, len=%zu",
+ dev->data->port_id, addr, len);
+ msl = rte_mem_virt2memseg_list(addr);
+ /* addr and len must be page-aligned. */
+ assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz));
+ assert(len == RTE_ALIGN(len, msl->page_sz));
+ ms_n = len / msl->page_sz;
+ rte_rwlock_write_lock(&priv->mr.rwlock);
+ /* Clear bits of freed memsegs from MR. */
+ for (i = 0; i < ms_n; ++i) {
+ const struct rte_memseg *ms;
+ struct mlx4_mr_cache entry;
+ uintptr_t start;
+ int ms_idx;
+ uint32_t pos;
+
+ /* Find MR having this memseg. */
+ start = (uintptr_t)addr + i * msl->page_sz;
+ mr = mr_lookup_dev_list(dev, &entry, start);
+ if (mr == NULL)
+ continue;
+ ms = rte_mem_virt2memseg((void *)start, msl);
+ assert(ms != NULL);
+ assert(msl->page_sz == ms->hugepage_sz);
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ pos = ms_idx - mr->ms_base_idx;
+ assert(rte_bitmap_get(mr->ms_bmp, pos));
+ assert(pos < mr->ms_bmp_n);
+ DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p",
+ dev->data->port_id, (void *)mr, pos, (void *)start);
+ rte_bitmap_clear(mr->ms_bmp, pos);
+ if (--mr->ms_n == 0) {
+ LIST_REMOVE(mr, mr);
+ LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+ DEBUG("port %u remove MR(%p) from list",
+ dev->data->port_id, (void *)mr);
+ }
+ /*
+ * MR is fragmented or will be freed. the global cache must be
+ * rebuilt.
+ */
+ rebuild = 1;
+ }
+ if (rebuild) {
+ mr_rebuild_dev_cache(dev);
+ /*
+ * Flush local caches by propagating invalidation across cores.
+ * rte_smp_wmb() is enough to synchronize this event. If one of
+ * freed memsegs is seen by other core, that means the memseg
+ * has been allocated by allocator, which will come after this
+ * free call. Therefore, this store instruction (incrementing
+ * generation below) will be guaranteed to be seen by other core
+ * before the core sees the newly allocated memory.
+ */
+ ++priv->mr.dev_gen;
+ DEBUG("broadcasting local cache flush, gen=%d",
+ priv->mr.dev_gen);
+ rte_smp_wmb();
+ }
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+#ifndef NDEBUG
+ if (rebuild)
+ mlx4_mr_dump_dev(dev);
+#endif
+}
+
+/**
+ * Callback for memory event.
+ *
+ * @param event_type
+ * Memory event type.
+ * @param addr
+ * Address of memory.
+ * @param len
+ * Size of memory.
*/
void
-mlx4_mr_put(struct mlx4_mr *mr)
+mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
+ size_t len, void *arg __rte_unused)
{
- struct priv *priv = mr->priv;
-
- rte_spinlock_lock(&priv->mr_lock);
- assert(mr->refcnt);
- if (--mr->refcnt)
- goto release;
- LIST_REMOVE(mr, next);
- claim_zero(mlx4_glue->dereg_mr(mr->mr));
- rte_free(mr);
-release:
- rte_spinlock_unlock(&priv->mr_lock);
+ struct priv *priv;
+
+ switch (event_type) {
+ case RTE_MEM_EVENT_FREE:
+ rte_rwlock_read_lock(&mlx4_mem_event_rwlock);
+ /* Iterate all the existing mlx4 devices. */
+ LIST_FOREACH(priv, &mlx4_mem_event_cb_list, mem_event_cb)
+ mlx4_mr_mem_event_free_cb(priv->dev, addr, len);
+ rte_rwlock_read_unlock(&mlx4_mem_event_rwlock);
+ break;
+ case RTE_MEM_EVENT_ALLOC:
+ default:
+ break;
+ }
}
/**
- * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
- * If mp2mr[] is full, remove an entry first.
+ * Look up address in the global MR cache table. If not found, create a new MR.
+ * Insert the found/created entry to local bottom-half cache table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ * @param[out] entry
+ * Pointer to returning MR cache entry, found in the global cache or newly
+ * created. If failed to create one, this is not written.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mlx4_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+ struct mlx4_mr_cache *entry, uintptr_t addr)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr_btree *bt = &mr_ctrl->cache_bh;
+ uint16_t idx;
+ uint32_t lkey;
+
+ /* If local cache table is full, try to double it. */
+ if (unlikely(bt->len == bt->size))
+ mr_btree_expand(bt, bt->size << 1);
+ /* Look up in the global cache. */
+ rte_rwlock_read_lock(&priv->mr.rwlock);
+ lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
+ if (lkey != UINT32_MAX) {
+ /* Found. */
+ *entry = (*priv->mr.cache.table)[idx];
+ rte_rwlock_read_unlock(&priv->mr.rwlock);
+ /*
+ * Update local cache. Even if it fails, return the found entry
+ * to update top-half cache. Next time, this entry will be found
+ * in the global cache.
+ */
+ mr_btree_insert(bt, entry);
+ return lkey;
+ }
+ rte_rwlock_read_unlock(&priv->mr.rwlock);
+ /* First time to see the address? Create a new MR. */
+ lkey = mlx4_mr_create(dev, entry, addr);
+ /*
+ * Update the local cache if successfully created a new global MR. Even
+ * if failed to create one, there's no action to take in this datapath
+ * code. As returning LKey is invalid, this will eventually make HW
+ * fail.
+ */
+ if (lkey != UINT32_MAX)
+ mr_btree_insert(bt, entry);
+ return lkey;
+}
+
+/**
+ * Bottom-half of LKey search on datapath. Firstly search in cache_bh[] and if
+ * misses, search in the global MR cache table and update the new entry to
+ * per-queue local caches.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mlx4_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+ uintptr_t addr)
+{
+ uint32_t lkey;
+ uint16_t bh_idx = 0;
+ /* Victim in top-half cache to replace with new entry. */
+ struct mlx4_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head];
+
+ /* Binary-search MR translation table. */
+ lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
+ /* Update top-half cache. */
+ if (likely(lkey != UINT32_MAX)) {
+ *repl = (*mr_ctrl->cache_bh.table)[bh_idx];
+ } else {
+ /*
+ * If missed in local lookup table, search in the global cache
+ * and local cache_bh[] will be updated inside if possible.
+ * Top-half cache entry will also be updated.
+ */
+ lkey = mlx4_mr_lookup_dev(dev, mr_ctrl, repl, addr);
+ if (unlikely(lkey == UINT32_MAX))
+ return UINT32_MAX;
+ }
+ /* Update the most recently used entry. */
+ mr_ctrl->mru = mr_ctrl->head;
+ /* Point to the next victim, the oldest. */
+ mr_ctrl->head = (mr_ctrl->head + 1) % MLX4_MR_CACHE_N;
+ return lkey;
+}
+
+/**
+ * Bottom-half of LKey search on Rx.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr)
+{
+ struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
+ struct priv *priv = rxq->priv;
+
+ DEBUG("Rx queue %u: miss on top-half, mru=%u, head=%u, addr=%p",
+ rxq->stats.idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr);
+ return mlx4_mr_addr2mr_bh(priv->dev, mr_ctrl, addr);
+}
+
+/**
+ * Bottom-half of LKey search on Tx.
*
* @param txq
* Pointer to Tx queue structure.
- * @param[in] mp
- * Memory pool for which a memory region lkey must be added.
- * @param[in] i
- * Index in memory pool (MP) where to add memory region (MR).
+ * @param addr
+ * Search key.
*
* @return
- * Added mr->lkey on success, (uint32_t)-1 on failure.
+ * Searched LKey on success, UINT32_MAX on no match.
*/
uint32_t
-mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
+mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
{
+ struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ struct priv *priv = txq->priv;
+
+ DEBUG("Tx queue %u: miss on top-half, mru=%u, head=%u, addr=%p",
+ txq->stats.idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr);
+ return mlx4_mr_addr2mr_bh(priv->dev, mr_ctrl, addr);
+}
+
+/**
+ * Flush all of the local cache entries.
+ *
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ */
+void
+mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl)
+{
+ /* Reset the most-recently-used index. */
+ mr_ctrl->mru = 0;
+ /* Reset the linear search array. */
+ mr_ctrl->head = 0;
+ memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
+ /* Reset the B-tree table. */
+ mr_ctrl->cache_bh.len = 1;
+ mr_ctrl->cache_bh.overflow = 0;
+ /* Update the generation number. */
+ mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
+ DEBUG("mr_ctrl(%p): flushed, cur_gen=%d",
+ (void *)mr_ctrl, mr_ctrl->cur_gen);
+}
+
+/* Called during rte_mempool_mem_iter() by mlx4_mr_update_mp(). */
+static void
+mlx4_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque,
+ struct rte_mempool_memhdr *memhdr,
+ unsigned mem_idx __rte_unused)
+{
+ struct mr_update_mp_data *data = opaque;
+ uint32_t lkey;
+
+ /* Stop iteration if failed in the previous walk. */
+ if (data->ret < 0)
+ return;
+ /* Register address of the chunk and update local caches. */
+ lkey = mlx4_mr_addr2mr_bh(data->dev, data->mr_ctrl,
+ (uintptr_t)memhdr->addr);
+ if (lkey == UINT32_MAX)
+ data->ret = -1;
+}
+
+/**
+ * Register entire memory chunks in a Mempool.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mr_ctrl
+ * Pointer to per-queue MR control structure.
+ * @param mp
+ * Pointer to registering Mempool.
+ *
+ * @return
+ * 0 on success, -1 on failure.
+ */
+int
+mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+ struct rte_mempool *mp)
+{
+ struct mr_update_mp_data data = {
+ .dev = dev,
+ .mr_ctrl = mr_ctrl,
+ .ret = 0,
+ };
+
+ rte_mempool_mem_iter(mp, mlx4_mr_update_mp_cb, &data);
+ return data.ret;
+}
+
+#ifndef NDEBUG
+/**
+ * Dump all the created MRs and the global cache entries.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx4_mr_dump_dev(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
struct mlx4_mr *mr;
+ int mr_n = 0;
+ int chunk_n = 0;
+
+ rte_rwlock_read_lock(&priv->mr.rwlock);
+ /* Iterate all the existing MRs. */
+ LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
+ unsigned int n;
+
+ DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
+ dev->data->port_id, mr_n++,
+ rte_cpu_to_be_32(mr->ibv_mr->lkey),
+ mr->ms_n, mr->ms_bmp_n);
+ if (mr->ms_n == 0)
+ continue;
+ for (n = 0; n < mr->ms_bmp_n; ) {
+ struct mlx4_mr_cache ret = { 0, };
- /* Add a new entry, register MR first. */
- DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx4_mr_get(txq->priv, mp);
- if (unlikely(mr == NULL)) {
- DEBUG("%p: unable to configure MR, mlx4_mr_get() failed",
- (void *)txq);
- return (uint32_t)-1;
+ n = mr_find_next_chunk(mr, &ret, n);
+ if (!ret.end)
+ break;
+ DEBUG(" chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
+ chunk_n++, ret.start, ret.end);
+ }
}
- if (unlikely(i == RTE_DIM(txq->mp2mr))) {
- /* Table is full, remove oldest entry. */
- DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
- --i;
- mlx4_mr_put(txq->mp2mr[0].mr);
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ DEBUG("port %u dumping global cache", dev->data->port_id);
+ mlx4_mr_btree_dump(&priv->mr.cache);
+ rte_rwlock_read_unlock(&priv->mr.rwlock);
+}
+#endif
+
+/**
+ * Release all the created MRs and resources. Remove device from memory callback
+ * list.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx4_mr_release(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4_mr *mr_next = LIST_FIRST(&priv->mr.mr_list);
+
+ /* Remove from memory callback device list. */
+ rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+ LIST_REMOVE(priv, mem_event_cb);
+ rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+#ifndef NDEBUG
+ mlx4_mr_dump_dev(dev);
+#endif
+ rte_rwlock_write_lock(&priv->mr.rwlock);
+ /* Detach from MR list and move to free list. */
+ while (mr_next != NULL) {
+ struct mlx4_mr *mr = mr_next;
+
+ mr_next = LIST_NEXT(mr, mr);
+ LIST_REMOVE(mr, mr);
+ LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
}
- /* Store the new entry. */
- txq->mp2mr[i].mp = mp;
- txq->mp2mr[i].mr = mr;
- txq->mp2mr[i].lkey = mr->lkey;
- DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
+ LIST_INIT(&priv->mr.mr_list);
+ /* Free global cache. */
+ mlx4_mr_btree_free(&priv->mr.cache);
+ rte_rwlock_write_unlock(&priv->mr.rwlock);
+ /* Free all remaining MRs. */
+ mlx4_mr_garbage_collect(dev);
}
diff --git a/drivers/net/mlx4/mlx4_mr.h b/drivers/net/mlx4/mlx4_mr.h
new file mode 100644
index 00000000..37a365a8
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_mr.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX4_MR_H_
+#define RTE_PMD_MLX4_MR_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_eal_memconfig.h>
+#include <rte_ethdev.h>
+#include <rte_rwlock.h>
+#include <rte_bitmap.h>
+
+/* Size of per-queue MR cache array for linear search. */
+#define MLX4_MR_CACHE_N 8
+
+/* Size of MR cache table for binary search. */
+#define MLX4_MR_BTREE_CACHE_N 256
+
+/* Memory Region object. */
+struct mlx4_mr {
+ LIST_ENTRY(mlx4_mr) mr; /**< Pointer to the prev/next entry. */
+ struct ibv_mr *ibv_mr; /* Verbs Memory Region. */
+ const struct rte_memseg_list *msl;
+ int ms_base_idx; /* Start index of msl->memseg_arr[]. */
+ int ms_n; /* Number of memsegs in use. */
+ uint32_t ms_bmp_n; /* Number of bits in memsegs bit-mask. */
+ struct rte_bitmap *ms_bmp; /* Bit-mask of memsegs belonged to MR. */
+};
+
+/* Cache entry for Memory Region. */
+struct mlx4_mr_cache {
+ uintptr_t start; /* Start address of MR. */
+ uintptr_t end; /* End address of MR. */
+ uint32_t lkey; /* rte_cpu_to_be_32(ibv_mr->lkey). */
+} __rte_packed;
+
+/* MR Cache table for Binary search. */
+struct mlx4_mr_btree {
+ uint16_t len; /* Number of entries. */
+ uint16_t size; /* Total number of entries. */
+ int overflow; /* Mark failure of table expansion. */
+ struct mlx4_mr_cache (*table)[];
+} __rte_packed;
+
+/* Per-queue MR control descriptor. */
+struct mlx4_mr_ctrl {
+ uint32_t *dev_gen_ptr; /* Generation number of device to poll. */
+ uint32_t cur_gen; /* Generation number saved to flush caches. */
+ uint16_t mru; /* Index of last hit entry in top-half cache. */
+ uint16_t head; /* Index of the oldest entry in top-half cache. */
+ struct mlx4_mr_cache cache[MLX4_MR_CACHE_N]; /* Cache for top-half. */
+ struct mlx4_mr_btree cache_bh; /* Cache for bottom-half. */
+} __rte_packed;
+
+extern struct mlx4_dev_list mlx4_mem_event_cb_list;
+extern rte_rwlock_t mlx4_mem_event_rwlock;
+
+/* First entry must be NULL for comparison. */
+#define mlx4_mr_btree_len(bt) ((bt)->len - 1)
+
+int mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket);
+void mlx4_mr_btree_free(struct mlx4_mr_btree *bt);
+void mlx4_mr_btree_dump(struct mlx4_mr_btree *bt);
+void mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
+ size_t len, void *arg);
+int mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
+ struct rte_mempool *mp);
+void mlx4_mr_dump_dev(struct rte_eth_dev *dev);
+void mlx4_mr_release(struct rte_eth_dev *dev);
+
+/**
+ * Look up LKey from given lookup table by linear search. Firstly look up the
+ * last-hit entry. If miss, the entire array is searched. If found, update the
+ * last-hit index and return LKey.
+ *
+ * @param lkp_tbl
+ * Pointer to lookup table.
+ * @param[in,out] cached_idx
+ * Pointer to last-hit index.
+ * @param n
+ * Size of lookup table.
+ * @param addr
+ * Search key.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx4_mr_lookup_cache(struct mlx4_mr_cache *lkp_tbl, uint16_t *cached_idx,
+ uint16_t n, uintptr_t addr)
+{
+ uint16_t idx;
+
+ if (likely(addr >= lkp_tbl[*cached_idx].start &&
+ addr < lkp_tbl[*cached_idx].end))
+ return lkp_tbl[*cached_idx].lkey;
+ for (idx = 0; idx < n && lkp_tbl[idx].start != 0; ++idx) {
+ if (addr >= lkp_tbl[idx].start &&
+ addr < lkp_tbl[idx].end) {
+ /* Found. */
+ *cached_idx = idx;
+ return lkp_tbl[idx].lkey;
+ }
+ }
+ return UINT32_MAX;
+}
+
+#endif /* RTE_PMD_MLX4_MR_H_ */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index 153dda52..e15a3c14 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
#ifndef MLX4_PRM_H_
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 7a036ed8..87688c1c 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -88,7 +88,7 @@ mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
*/
struct mlx4_rss *
mlx4_rss_get(struct priv *priv, uint64_t fields,
- uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+ const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
uint16_t queues, const uint16_t queue_id[])
{
struct mlx4_rss *rss;
@@ -336,6 +336,8 @@ mlx4_rss_init(struct priv *priv)
unsigned int i;
int ret;
+ if (priv->rss_init)
+ return 0;
/* Prepare range for RSS contexts before creating the first WQ. */
ret = mlx4_glue->dv_set_context_attr
(priv->ctx,
@@ -418,6 +420,7 @@ wq_num_check:
}
wq_num_prev = wq_num;
}
+ priv->rss_init = 1;
return 0;
error:
ERROR("cannot initialize common RSS resources (queue %u): %s: %s",
@@ -446,6 +449,8 @@ mlx4_rss_deinit(struct priv *priv)
{
unsigned int i;
+ if (!priv->rss_init)
+ return;
for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) {
struct rxq *rxq = priv->dev->data->rx_queues[i];
@@ -454,6 +459,7 @@ mlx4_rss_deinit(struct priv *priv)
mlx4_rxq_detach(rxq);
}
}
+ priv->rss_init = 0;
}
/**
@@ -482,6 +488,7 @@ mlx4_rxq_attach(struct rxq *rxq)
}
struct priv *priv = rxq->priv;
+ struct rte_eth_dev *dev = priv->dev;
const uint32_t elts_n = 1 << rxq->elts_n;
const uint32_t sges_n = 1 << rxq->sges_n;
struct rte_mbuf *(*elts)[elts_n] = rxq->elts;
@@ -491,6 +498,8 @@ mlx4_rxq_attach(struct rxq *rxq)
const char *msg;
struct ibv_cq *cq = NULL;
struct ibv_wq *wq = NULL;
+ uint32_t create_flags = 0;
+ uint32_t comp_mask = 0;
volatile struct mlx4_wqe_data_seg (*wqes)[];
unsigned int i;
int ret;
@@ -503,6 +512,11 @@ mlx4_rxq_attach(struct rxq *rxq)
msg = "CQ creation failure";
goto error;
}
+ /* By default, FCS (CRC) is stripped by hardware. */
+ if (rxq->crc_present) {
+ create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
+ comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+ }
wq = mlx4_glue->create_wq
(priv->ctx,
&(struct ibv_wq_init_attr){
@@ -511,6 +525,8 @@ mlx4_rxq_attach(struct rxq *rxq)
.max_sge = sges_n,
.pd = priv->pd,
.cq = cq,
+ .comp_mask = comp_mask,
+ .create_flags = create_flags,
});
if (!wq) {
ret = errno ? errno : EINVAL;
@@ -537,6 +553,11 @@ mlx4_rxq_attach(struct rxq *rxq)
msg = "failed to obtain device information from WQ/CQ objects";
goto error;
}
+ /* Pre-register Rx mempool. */
+ DEBUG("port %u Rx queue %u registering mp %s having %u chunks",
+ priv->dev->data->port_id, rxq->stats.idx,
+ rxq->mp->name, rxq->mp->nb_mem_chunks);
+ mlx4_mr_update_mp(dev, &rxq->mr_ctrl, rxq->mp);
wqes = (volatile struct mlx4_wqe_data_seg (*)[])
((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset);
for (i = 0; i != RTE_DIM(*elts); ++i) {
@@ -568,7 +589,7 @@ mlx4_rxq_attach(struct rxq *rxq)
.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
uintptr_t)),
.byte_count = rte_cpu_to_be_32(buf->data_len),
- .lkey = rte_cpu_to_be_32(rxq->mr->lkey),
+ .lkey = mlx4_rx_mb2mr(rxq, buf),
};
(*elts)[i] = buf;
}
@@ -597,6 +618,7 @@ error:
claim_zero(mlx4_glue->destroy_wq(wq));
if (cq)
claim_zero(mlx4_glue->destroy_cq(cq));
+ --rxq->usecnt;
rte_errno = ret;
ERROR("error while attaching Rx queue %p: %s: %s",
(void *)rxq, msg, strerror(ret));
@@ -676,26 +698,6 @@ mlx4_get_rx_port_offloads(struct priv *priv)
}
/**
- * Checks if the per-queue offload configuration is valid.
- *
- * @param priv
- * Pointer to private structure.
- * @param requested
- * Per-queue offloads configuration.
- *
- * @return
- * Nonzero when configuration is valid.
- */
-static int
-mlx4_check_rx_queue_offloads(struct priv *priv, uint64_t requested)
-{
- uint64_t mandatory = priv->dev->data->dev_conf.rxmode.offloads;
- uint64_t supported = mlx4_get_rx_port_offloads(priv);
-
- return !((mandatory ^ requested) & supported);
-}
-
-/**
* DPDK callback to configure a Rx queue.
*
* @param dev
@@ -736,20 +738,14 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
},
};
int ret;
+ uint32_t crc_present;
+ uint64_t offloads;
+
+ offloads = conf->offloads | dev->data->dev_conf.rxmode.offloads;
- (void)conf; /* Thresholds configuration (ignored). */
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
- if (!mlx4_check_rx_queue_offloads(priv, conf->offloads)) {
- rte_errno = ENOTSUP;
- ERROR("%p: Rx queue offloads 0x%" PRIx64 " don't match port "
- "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64,
- (void *)dev, conf->offloads,
- dev->data->dev_conf.rxmode.offloads,
- (mlx4_get_rx_port_offloads(priv) |
- mlx4_get_rx_queue_offloads(priv)));
- return -rte_errno;
- }
+
if (idx >= dev->data->nb_rx_queues) {
rte_errno = EOVERFLOW;
ERROR("%p: queue index out of range (%u >= %u)",
@@ -774,6 +770,23 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
" to the next power of two (%u)",
(void *)dev, idx, desc);
}
+ /* By default, FCS (CRC) is stripped by hardware. */
+ if (offloads & DEV_RX_OFFLOAD_CRC_STRIP) {
+ crc_present = 0;
+ } else if (priv->hw_fcs_strip) {
+ crc_present = 1;
+ } else {
+ WARN("%p: CRC stripping has been disabled but will still"
+ " be performed by hardware, make sure MLNX_OFED and"
+ " firmware are up to date",
+ (void *)dev);
+ crc_present = 0;
+ }
+ DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
+ " incoming frames to hide it",
+ (void *)dev,
+ crc_present ? "disabled" : "enabled",
+ crc_present << 2);
/* Allocate and initialize Rx queue. */
mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket);
if (!rxq) {
@@ -790,9 +803,10 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
.elts = elts,
/* Toggle Rx checksum offload if hardware supports it. */
.csum = priv->hw_csum &&
- (conf->offloads & DEV_RX_OFFLOAD_CHECKSUM),
+ (offloads & DEV_RX_OFFLOAD_CHECKSUM),
.csum_l2tun = priv->hw_csum_l2tun &&
- (conf->offloads & DEV_RX_OFFLOAD_CHECKSUM),
+ (offloads & DEV_RX_OFFLOAD_CHECKSUM),
+ .crc_present = crc_present,
.l2tun_offload = priv->hw_csum_l2tun,
.stats = {
.idx = idx,
@@ -804,7 +818,7 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
(mb_len - RTE_PKTMBUF_HEADROOM)) {
;
- } else if (conf->offloads & DEV_RX_OFFLOAD_SCATTER) {
+ } else if (offloads & DEV_RX_OFFLOAD_SCATTER) {
uint32_t size =
RTE_PKTMBUF_HEADROOM +
dev->data->dev_conf.rxmode.max_rx_pkt_len;
@@ -847,11 +861,9 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1 << rxq->sges_n);
goto error;
}
- /* Use the entire Rx mempool as the memory region. */
- rxq->mr = mlx4_mr_get(priv, mp);
- if (!rxq->mr) {
- ERROR("%p: MR creation failure: %s",
- (void *)dev, strerror(rte_errno));
+ if (mlx4_mr_btree_init(&rxq->mr_ctrl.cache_bh,
+ MLX4_MR_BTREE_CACHE_N, socket)) {
+ /* rte_errno is already set. */
goto error;
}
if (dev->data->dev_conf.intr_conf.rxq) {
@@ -911,7 +923,6 @@ mlx4_rx_queue_release(void *dpdk_rxq)
assert(!rxq->rq_db);
if (rxq->channel)
claim_zero(mlx4_glue->destroy_comp_channel(rxq->channel));
- if (rxq->mr)
- mlx4_mr_put(rxq->mr);
+ mlx4_mr_btree_free(&rxq->mr_ctrl.cache_bh);
rte_free(rxq);
}
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 8ca8b77c..a92da66b 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -263,7 +263,7 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, volatile uint32_t *start,
} while (start != (volatile uint32_t *)sq->eob);
start = (volatile uint32_t *)sq->buf;
/* Flip invalid stamping ownership. */
- stamp ^= RTE_BE32(0x1 << MLX4_SQ_OWNER_BIT);
+ stamp ^= RTE_BE32(1u << MLX4_SQ_OWNER_BIT);
sq->stamp = stamp;
if (start == end)
return size;
@@ -344,24 +344,6 @@ mlx4_txq_complete(struct txq *txq, const unsigned int elts_m,
}
/**
- * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- * Pointer to mbuf.
- *
- * @return
- * Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-mlx4_txq_mb2mp(struct rte_mbuf *buf)
-{
- if (unlikely(RTE_MBUF_INDIRECT(buf)))
- return rte_mbuf_from_indirect(buf)->pool;
- return buf->pool;
-}
-
-/**
* Write Tx data segment to the SQ.
*
* @param dseg
@@ -378,7 +360,7 @@ mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
uint32_t lkey, uintptr_t addr, rte_be32_t byte_count)
{
dseg->addr = rte_cpu_to_be_64(addr);
- dseg->lkey = rte_cpu_to_be_32(lkey);
+ dseg->lkey = lkey;
#if RTE_CACHE_LINE_SIZE < 64
/*
* Need a barrier here before writing the byte_count
@@ -437,7 +419,7 @@ mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
goto txbb_tail_segs;
txbb_head_seg:
/* Memory region key (big endian) for this memory pool. */
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
if (unlikely(lkey == (uint32_t)-1)) {
DEBUG("%p: unable to get MP <-> MR association",
(void *)txq);
@@ -449,7 +431,7 @@ txbb_head_seg:
dseg = (volatile struct mlx4_wqe_data_seg *)
sq->buf;
dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t));
- dseg->lkey = rte_cpu_to_be_32(lkey);
+ dseg->lkey = lkey;
/*
* This data segment starts at the beginning of a new
* TXBB, so we need to postpone its byte_count writing
@@ -469,7 +451,7 @@ txbb_tail_segs:
/* Jump to default if there are more than two segments remaining. */
switch (nb_segs) {
default:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
if (unlikely(lkey == (uint32_t)-1)) {
DEBUG("%p: unable to get MP <-> MR association",
(void *)txq);
@@ -485,7 +467,7 @@ txbb_tail_segs:
nb_segs--;
/* fallthrough */
case 2:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
if (unlikely(lkey == (uint32_t)-1)) {
DEBUG("%p: unable to get MP <-> MR association",
(void *)txq);
@@ -501,7 +483,7 @@ txbb_tail_segs:
nb_segs--;
/* fallthrough */
case 1:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
if (unlikely(lkey == (uint32_t)-1)) {
DEBUG("%p: unable to get MP <-> MR association",
(void *)txq);
@@ -611,7 +593,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
elt->buf = NULL;
break;
}
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+ lkey = mlx4_tx_mb2mr(txq, buf);
if (unlikely(lkey == (uint32_t)-1)) {
/* MR does not exist. */
DEBUG("%p: unable to get MP <-> MR association",
@@ -639,7 +621,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
ctrl_next = (volatile struct mlx4_wqe_ctrl_seg *)
((volatile uint8_t *)ctrl_next - sq->size);
/* Flip HW valid ownership. */
- sq->owner_opcode ^= 0x1 << MLX4_SQ_OWNER_BIT;
+ sq->owner_opcode ^= 1u << MLX4_SQ_OWNER_BIT;
}
/*
* For raw Ethernet, the SOLICIT flag is used to indicate
@@ -934,11 +916,14 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
goto skip;
}
pkt = seg;
+ assert(len >= (rxq->crc_present << 2));
/* Update packet information. */
pkt->packet_type =
rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload);
pkt->ol_flags = PKT_RX_RSS_HASH;
pkt->hash.rss = cqe->immed_rss_invalid;
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
pkt->pkt_len = len;
if (rxq->csum | rxq->csum_l2tun) {
uint32_t flags =
@@ -963,6 +948,9 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
* changes.
*/
scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx4_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ scat->lkey = mlx4_rx_mb2mr(rxq, rep);
if (len > seg->data_len) {
len -= seg->data_len;
++pkt->nb_segs;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index c12bd39a..4c025e3a 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
#ifndef MLX4_RXTX_H_
@@ -25,6 +25,7 @@
#include "mlx4.h"
#include "mlx4_prm.h"
+#include "mlx4_mr.h"
/** Rx queue counters. */
struct mlx4_rxq_stats {
@@ -39,7 +40,6 @@ struct mlx4_rxq_stats {
struct rxq {
struct priv *priv; /**< Back pointer to private data. */
struct rte_mempool *mp; /**< Memory pool for allocations. */
- struct mlx4_mr *mr; /**< Memory region. */
struct ibv_cq *cq; /**< Completion queue. */
struct ibv_wq *wq; /**< Work queue. */
struct ibv_comp_channel *channel; /**< Rx completion channel. */
@@ -47,11 +47,13 @@ struct rxq {
uint16_t port_id; /**< Port ID for incoming packets. */
uint16_t sges_n; /**< Number of segments per packet (log2 value). */
uint16_t elts_n; /**< Mbuf queue size (log2 value). */
+ struct mlx4_mr_ctrl mr_ctrl; /* MR control descriptor. */
struct rte_mbuf *(*elts)[]; /**< Rx elements. */
volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */
volatile uint32_t *rq_db; /**< RQ doorbell record. */
uint32_t csum:1; /**< Enable checksum offloading. */
uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+ uint32_t crc_present:1; /**< CRC must be subtracted. */
uint32_t l2tun_offload:1; /**< L2 tunnel offload is enabled. */
struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
struct mlx4_rxq_stats stats; /**< Rx queue counters. */
@@ -83,7 +85,7 @@ struct txq_elt {
};
};
-/** Rx queue counters. */
+/** Tx queue counters. */
struct mlx4_txq_stats {
unsigned int idx; /**< Mapping index. */
uint64_t opackets; /**< Total of successfully sent packets. */
@@ -100,6 +102,7 @@ struct txq {
int elts_comp_cd; /**< Countdown for next completion. */
unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
unsigned int elts_n; /**< (*elts)[] length. */
+ struct mlx4_mr_ctrl mr_ctrl; /* MR control descriptor. */
struct txq_elt (*elts)[]; /**< Tx elements. */
struct mlx4_txq_stats stats; /**< Tx queue counters. */
uint32_t max_inline; /**< Max inline send size. */
@@ -108,11 +111,6 @@ struct txq {
uint32_t lb:1; /**< Whether packets should be looped back by eSwitch. */
uint8_t *bounce_buf;
/**< Memory used for storing the first DWORD of data TXBBs. */
- struct {
- const struct rte_mempool *mp; /**< Cached memory pool. */
- struct mlx4_mr *mr; /**< Memory region (for mp). */
- uint32_t lkey; /**< mr->lkey copy. */
- } mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
struct priv *priv; /**< Back pointer to private data. */
unsigned int socket; /**< CPU socket ID for allocations. */
struct ibv_cq *cq; /**< Completion queue. */
@@ -126,7 +124,7 @@ uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
int mlx4_rss_init(struct priv *priv);
void mlx4_rss_deinit(struct priv *priv);
struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
- uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+ const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
uint16_t queues, const uint16_t queue_id[]);
void mlx4_rss_put(struct mlx4_rss *rss);
int mlx4_rss_attach(struct mlx4_rss *rss);
@@ -160,34 +158,70 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
const struct rte_eth_txconf *conf);
void mlx4_tx_queue_release(void *dpdk_txq);
+/* mlx4_mr.c */
+
+void mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl);
+uint32_t mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr);
+uint32_t mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr);
+
/**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Call mlx4_txq_add_mr() if MP is not registered yet.
+ * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx
+ * as mempool is pre-configured and static.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ * @param addr
+ * Address to search.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx4_rx_addr2mr(struct rxq *rxq, uintptr_t addr)
+{
+ struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
+ uint32_t lkey;
+
+ /* Linear search on MR cache array. */
+ lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru,
+ MLX4_MR_CACHE_N, addr);
+ if (likely(lkey != UINT32_MAX))
+ return lkey;
+ /* Take slower bottom-half (Binary Search) on miss. */
+ return mlx4_rx_addr2mr_bh(rxq, addr);
+}
+
+#define mlx4_rx_mb2mr(rxq, mb) mlx4_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr))
+
+/**
+ * Query LKey from a packet buffer for Tx. If not found, add the mempool.
*
* @param txq
* Pointer to Tx queue structure.
- * @param[in] mp
- * Memory pool for which a memory region lkey must be returned.
+ * @param addr
+ * Address to search.
*
* @return
- * mr->lkey on success, (uint32_t)-1 on failure.
+ * Searched LKey on success, UINT32_MAX on no match.
*/
-static inline uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+static __rte_always_inline uint32_t
+mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
{
- unsigned int i;
-
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (unlikely(txq->mp2mr[i].mp == NULL)) {
- /* Unknown MP, add a new MR for it. */
- break;
- }
- if (txq->mp2mr[i].mp == mp) {
- /* MP found MP. */
- return txq->mp2mr[i].lkey;
- }
- }
- return mlx4_txq_add_mr(txq, mp, i);
+ struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ uint32_t lkey;
+
+ /* Check generation bit to see if there's any change on existing MRs. */
+ if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
+ mlx4_mr_flush_local_cache(mr_ctrl);
+ /* Linear search on MR cache array. */
+ lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru,
+ MLX4_MR_CACHE_N, addr);
+ if (likely(lkey != UINT32_MAX))
+ return lkey;
+ /* Take slower bottom-half (binary search) on miss. */
+ return mlx4_tx_addr2mr_bh(txq, addr);
}
+#define mlx4_tx_mb2mr(rxq, mb) mlx4_tx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr))
+
#endif /* MLX4_RXTX_H_ */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 071b2d5d..6edaadbb 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
@@ -63,64 +63,6 @@ mlx4_txq_free_elts(struct txq *txq)
txq->elts_tail = txq->elts_head;
}
-struct txq_mp2mr_mbuf_check_data {
- int ret;
-};
-
-/**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
- *
- * @param[in] mp
- * The mempool pointer
- * @param[in] arg
- * Context data (struct mlx4_txq_mp2mr_mbuf_check_data). Contains the
- * return value.
- * @param[in] obj
- * Object address.
- * @param index
- * Object index, unused.
- */
-static void
-mlx4_txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
- uint32_t index)
-{
- struct txq_mp2mr_mbuf_check_data *data = arg;
- struct rte_mbuf *buf = obj;
-
- (void)index;
- /*
- * Check whether mbuf structure fits element size and whether mempool
- * pointer is valid.
- */
- if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
- data->ret = -1;
-}
-
-/**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a Tx queue.
- *
- * @param[in] mp
- * Memory Pool to register.
- * @param *arg
- * Pointer to Tx queue structure.
- */
-static void
-mlx4_txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
- struct txq *txq = arg;
- struct txq_mp2mr_mbuf_check_data data = {
- .ret = 0,
- };
-
- /* Register mempool only if the first element looks like a mbuf. */
- if (rte_mempool_obj_iter(mp, mlx4_txq_mp2mr_mbuf_check, &data) == 0 ||
- data.ret == -1)
- return;
- mlx4_txq_mp2mr(txq, mp);
-}
-
/**
* Retrieves information needed in order to directly access the Tx queue.
*
@@ -144,9 +86,9 @@ mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
uint32_t headroom_size = 2048 + (1 << dqp->sq.wqe_shift);
/* Continuous headroom size bytes must always stay freed. */
sq->remain_size = sq->size - headroom_size;
- sq->owner_opcode = MLX4_OPCODE_SEND | (0 << MLX4_SQ_OWNER_BIT);
+ sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT);
sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
- (0 << MLX4_SQ_OWNER_BIT));
+ (0u << MLX4_SQ_OWNER_BIT));
sq->db = dqp->sdb;
sq->doorbell_qpn = dqp->doorbell_qpn;
cq->buf = dcq->buf.buf;
@@ -180,26 +122,6 @@ mlx4_get_tx_port_offloads(struct priv *priv)
}
/**
- * Checks if the per-queue offload configuration is valid.
- *
- * @param priv
- * Pointer to private structure.
- * @param requested
- * Per-queue offloads configuration.
- *
- * @return
- * Nonzero when configuration is valid.
- */
-static int
-mlx4_check_tx_queue_offloads(struct priv *priv, uint64_t requested)
-{
- uint64_t mandatory = priv->dev->data->dev_conf.txmode.offloads;
- uint64_t supported = mlx4_get_tx_port_offloads(priv);
-
- return !((mandatory ^ requested) & supported);
-}
-
-/**
* DPDK callback to configure a Tx queue.
*
* @param dev
@@ -246,23 +168,13 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
},
};
int ret;
+ uint64_t offloads;
+
+ offloads = conf->offloads | dev->data->dev_conf.txmode.offloads;
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
- /*
- * Don't verify port offloads for application which
- * use the old API.
- */
- if ((conf->txq_flags & ETH_TXQ_FLAGS_IGNORE) &&
- !mlx4_check_tx_queue_offloads(priv, conf->offloads)) {
- rte_errno = ENOTSUP;
- ERROR("%p: Tx queue offloads 0x%" PRIx64 " don't match port "
- "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64,
- (void *)dev, conf->offloads,
- dev->data->dev_conf.txmode.offloads,
- mlx4_get_tx_port_offloads(priv));
- return -rte_errno;
- }
+
if (idx >= dev->data->nb_tx_queues) {
rte_errno = EOVERFLOW;
ERROR("%p: queue index out of range (%u >= %u)",
@@ -313,11 +225,11 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
.elts_comp_cd_init =
RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
.csum = priv->hw_csum &&
- (conf->offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
+ (offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
DEV_TX_OFFLOAD_UDP_CKSUM |
DEV_TX_OFFLOAD_TCP_CKSUM)),
.csum_l2tun = priv->hw_csum_l2tun &&
- (conf->offloads &
+ (offloads &
DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM),
/* Enable Tx loopback for VF devices. */
.lb = !!priv->vf,
@@ -404,8 +316,13 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
/* Save first wqe pointer in the first element. */
(&(*txq->elts)[0])->wqe =
(volatile struct mlx4_wqe_ctrl_seg *)txq->msq.buf;
- /* Pre-register known mempools. */
- rte_mempool_walk(mlx4_txq_mp2mr_iter, txq);
+ if (mlx4_mr_btree_init(&txq->mr_ctrl.cache_bh,
+ MLX4_MR_BTREE_CACHE_N, socket)) {
+ /* rte_errno is already set. */
+ goto error;
+ }
+ /* Save pointer of global generation number to check memory event. */
+ txq->mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
DEBUG("%p: adding Tx queue %p to list", (void *)dev, (void *)txq);
dev->data->tx_queues[idx] = txq;
return 0;
@@ -446,11 +363,6 @@ mlx4_tx_queue_release(void *dpdk_txq)
claim_zero(mlx4_glue->destroy_qp(txq->qp));
if (txq->cq)
claim_zero(mlx4_glue->destroy_cq(txq->cq));
- for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) {
- if (!txq->mp2mr[i].mp)
- break;
- assert(txq->mp2mr[i].mr);
- mlx4_mr_put(txq->mp2mr[i].mr);
- }
+ mlx4_mr_btree_free(&txq->mr_ctrl.cache_bh);
rte_free(txq);
}
diff --git a/drivers/net/mlx4/mlx4_utils.c b/drivers/net/mlx4/mlx4_utils.c
index d10812ec..a727d703 100644
--- a/drivers/net/mlx4/mlx4_utils.c
+++ b/drivers/net/mlx4/mlx4_utils.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
/**
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index 9fdbacad..86abb3b7 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
*/
#ifndef MLX4_UTILS_H_