From b63264c8342e6a1b6971c79550d2af2024b6a4de Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 14 Aug 2018 18:52:30 +0100 Subject: New upstream version 18.08 Change-Id: I32fdf5e5016556d9c0a6d88ddaf1fc468961790a Signed-off-by: Luca Boccassi --- drivers/net/mlx4/Makefile | 50 +- drivers/net/mlx4/mlx4.c | 255 ++++++-- drivers/net/mlx4/mlx4.h | 57 +- drivers/net/mlx4/mlx4_ethdev.c | 208 +------ drivers/net/mlx4/mlx4_flow.c | 240 +++++--- drivers/net/mlx4/mlx4_flow.h | 6 +- drivers/net/mlx4/mlx4_glue.c | 2 +- drivers/net/mlx4/mlx4_glue.h | 2 +- drivers/net/mlx4/mlx4_intr.c | 2 +- drivers/net/mlx4/mlx4_mr.c | 1247 ++++++++++++++++++++++++++++++++++------ drivers/net/mlx4/mlx4_mr.h | 122 ++++ drivers/net/mlx4/mlx4_prm.h | 18 +- drivers/net/mlx4/mlx4_rxq.c | 109 ++-- drivers/net/mlx4/mlx4_rxtx.c | 566 ++++++++++++++---- drivers/net/mlx4/mlx4_rxtx.h | 94 ++- drivers/net/mlx4/mlx4_txq.c | 130 +---- drivers/net/mlx4/mlx4_utils.c | 2 +- drivers/net/mlx4/mlx4_utils.h | 2 +- 18 files changed, 2267 insertions(+), 845 deletions(-) create mode 100644 drivers/net/mlx4/mlx4_mr.h (limited to 'drivers/net/mlx4') diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile index cc800493..92e93225 100644 --- a/drivers/net/mlx4/Makefile +++ b/drivers/net/mlx4/Makefile @@ -1,33 +1,6 @@ -# BSD LICENSE -# +# SPDX-License-Identifier: BSD-3-Clause # Copyright 2012 6WIND S.A. -# Copyright 2012 Mellanox -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of 6WIND S.A. nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright 2012 Mellanox Technologies, Ltd include $(RTE_SDK)/mk/rte.vars.mk @@ -64,6 +37,7 @@ CFLAGS += -D_BSD_SOURCE CFLAGS += -D_DEFAULT_SOURCE CFLAGS += -D_XOPEN_SOURCE=600 CFLAGS += $(WERROR_FLAGS) +CFLAGS += -DALLOW_EXPERIMENTAL_API ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y) CFLAGS += -DMLX4_GLUE='"$(LIB_GLUE)"' CFLAGS += -DMLX4_GLUE_VERSION='"$(LIB_GLUE_VERSION)"' @@ -95,10 +69,6 @@ else CFLAGS += -DNDEBUG -UPEDANTIC endif -ifdef CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE -CFLAGS += -DMLX4_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE) -endif - include $(RTE_SDK)/mk/rte.lib.mk # Generate and clean-up mlx4_autoconf.h. 
@@ -115,6 +85,11 @@ mlx4_autoconf.h.new: FORCE mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh $Q $(RM) -f -- '$@' $Q : > '$@' + $Q sh -- '$<' '$@' \ + HAVE_IBV_MLX4_WQE_LSO_SEG \ + infiniband/mlx4dv.h \ + type 'struct mlx4_wqe_lso_seg' \ + $(AUTOCONF_OUTPUT) # Create mlx4_autoconf.h or update it in case it differs from the new one. @@ -132,10 +107,15 @@ ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DLOPEN_DEPS),y) $(LIB): $(LIB_GLUE) +ifeq ($(LINK_USING_CC),1) +GLUE_LDFLAGS := $(call linkerprefix,$(LDFLAGS)) +else +GLUE_LDFLAGS := $(LDFLAGS) +endif $(LIB_GLUE): mlx4_glue.o - $Q $(LD) $(LDFLAGS) $(EXTRA_LDFLAGS) \ + $Q $(LD) $(GLUE_LDFLAGS) $(EXTRA_LDFLAGS) \ -Wl,-h,$(LIB_GLUE) \ - -s -shared -o $@ $< -libverbs -lmlx4 + -shared -o $@ $< -libverbs -lmlx4 mlx4_glue.o: mlx4_autoconf.h diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index ee93dafe..defc0d4b 100644 --- a/drivers/net/mlx4/mlx4.c +++ b/drivers/net/mlx4/mlx4.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2012 6WIND S.A. - * Copyright 2012 Mellanox + * Copyright 2012 Mellanox Technologies, Ltd */ /** @@ -44,9 +44,15 @@ #include "mlx4.h" #include "mlx4_glue.h" #include "mlx4_flow.h" +#include "mlx4_mr.h" #include "mlx4_rxtx.h" #include "mlx4_utils.h" +struct mlx4_dev_list mlx4_mem_event_cb_list = + LIST_HEAD_INITIALIZER(mlx4_mem_event_cb_list); + +rte_rwlock_t mlx4_mem_event_rwlock = RTE_RWLOCK_INITIALIZER; + /** Configuration structure for device arguments. */ struct mlx4_conf { struct { @@ -61,6 +67,8 @@ const char *pmd_mlx4_init_params[] = { NULL, }; +static void mlx4_dev_stop(struct rte_eth_dev *dev); + /** * DPDK callback for Ethernet device configuration. * @@ -123,6 +131,9 @@ mlx4_dev_start(struct rte_eth_dev *dev) (void *)dev, strerror(-ret)); goto err; } +#ifndef NDEBUG + mlx4_mr_dump_dev(dev); +#endif ret = mlx4_rxq_intr_enable(priv); if (ret) { ERROR("%p: interrupt handler installation failed", @@ -143,8 +154,7 @@ mlx4_dev_start(struct rte_eth_dev *dev) dev->rx_pkt_burst = mlx4_rx_burst; return 0; err: - /* Rollback. */ - priv->started = 0; + mlx4_dev_stop(dev); return ret; } @@ -194,10 +204,12 @@ mlx4_dev_close(struct rte_eth_dev *dev) dev->tx_pkt_burst = mlx4_tx_burst_removed; rte_wmb(); mlx4_flow_clean(priv); + mlx4_rss_deinit(priv); for (i = 0; i != dev->data->nb_rx_queues; ++i) mlx4_rx_queue_release(dev->data->rx_queues[i]); for (i = 0; i != dev->data->nb_tx_queues; ++i) mlx4_tx_queue_release(dev->data->tx_queues[i]); + mlx4_mr_release(dev); if (priv->pd != NULL) { assert(priv->ctx != NULL); claim_zero(mlx4_glue->dealloc_pd(priv->pd)); @@ -385,6 +397,99 @@ free_kvlist: return ret; } +/** + * Interpret RSS capabilities reported by device. + * + * This function returns the set of usable Verbs RSS hash fields, kernel + * quirks taken into account. + * + * @param ctx + * Verbs context. + * @param pd + * Verbs protection domain. + * @param device_attr_ex + * Extended device attributes to interpret. + * + * @return + * Usable RSS hash fields mask in Verbs format. 
+ */ +static uint64_t +mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd, + struct ibv_device_attr_ex *device_attr_ex) +{ + uint64_t hw_rss_sup = device_attr_ex->rss_caps.rx_hash_fields_mask; + struct ibv_cq *cq = NULL; + struct ibv_wq *wq = NULL; + struct ibv_rwq_ind_table *ind = NULL; + struct ibv_qp *qp = NULL; + + if (!hw_rss_sup) { + WARN("no RSS capabilities reported; disabling support for UDP" + " RSS and inner VXLAN RSS"); + return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 | + IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 | + IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP; + } + if (!(hw_rss_sup & IBV_RX_HASH_INNER)) + return hw_rss_sup; + /* + * Although reported as supported, missing code in some Linux + * versions (v4.15, v4.16) prevents the creation of hash QPs with + * inner capability. + * + * There is no choice but to attempt to instantiate a temporary RSS + * context in order to confirm its support. + */ + cq = mlx4_glue->create_cq(ctx, 1, NULL, NULL, 0); + wq = cq ? mlx4_glue->create_wq + (ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = 1, + .max_sge = 1, + .pd = pd, + .cq = cq, + }) : NULL; + ind = wq ? mlx4_glue->create_rwq_ind_table + (ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = 0, + .ind_tbl = &wq, + .comp_mask = 0, + }) : NULL; + qp = ind ? mlx4_glue->create_qp_ex + (ctx, + &(struct ibv_qp_init_attr_ex){ + .comp_mask = + (IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_RX_HASH | + IBV_QP_INIT_ATTR_IND_TABLE), + .qp_type = IBV_QPT_RAW_PACKET, + .pd = pd, + .rwq_ind_tbl = ind, + .rx_hash_conf = { + .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE, + .rx_hash_key = mlx4_rss_hash_key_default, + .rx_hash_fields_mask = hw_rss_sup, + }, + }) : NULL; + if (!qp) { + WARN("disabling unusable inner RSS capability due to kernel" + " quirk"); + hw_rss_sup &= ~IBV_RX_HASH_INNER; + } else { + claim_zero(mlx4_glue->destroy_qp(qp)); + } + if (ind) + claim_zero(mlx4_glue->destroy_rwq_ind_table(ind)); + if (wq) + claim_zero(mlx4_glue->destroy_wq(wq)); + if (cq) + claim_zero(mlx4_glue->destroy_cq(cq)); + return hw_rss_sup; +} + static struct rte_pci_driver mlx4_driver; /** @@ -470,14 +575,14 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) ibv_dev = list[i]; DEBUG("device opened"); if (mlx4_glue->query_device(attr_ctx, &device_attr)) { - rte_errno = ENODEV; + err = ENODEV; goto error; } INFO("%u port(s) detected", device_attr.phys_port_cnt); conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1; if (mlx4_args(pci_dev->device.devargs, &conf)) { ERROR("failed to process device arguments"); - rte_errno = EINVAL; + err = EINVAL; goto error; } /* Use all ports when none are defined */ @@ -485,7 +590,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) conf.ports.enabled = conf.ports.present; /* Retrieve extended device attributes. */ if (mlx4_glue->query_device_ex(attr_ctx, NULL, &device_attr_ex)) { - rte_errno = ENODEV; + err = ENODEV; goto error; } assert(device_attr.max_sge >= MLX4_MAX_SGE); @@ -504,18 +609,18 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) DEBUG("using port %u", port); ctx = mlx4_glue->open_device(ibv_dev); if (ctx == NULL) { - rte_errno = ENODEV; + err = ENODEV; goto port_error; } /* Check port status. 
*/ err = mlx4_glue->query_port(ctx, port, &port_attr); if (err) { - rte_errno = err; - ERROR("port query failed: %s", strerror(rte_errno)); + err = ENODEV; + ERROR("port query failed: %s", strerror(err)); goto port_error; } if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { - rte_errno = ENOTSUP; + err = ENOTSUP; ERROR("port %d is not configured in Ethernet mode", port); goto port_error; @@ -525,15 +630,16 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) port, mlx4_glue->port_state_str(port_attr.state), port_attr.state); /* Make asynchronous FD non-blocking to handle interrupts. */ - if (mlx4_fd_set_non_blocking(ctx->async_fd) < 0) { + err = mlx4_fd_set_non_blocking(ctx->async_fd); + if (err) { ERROR("cannot make asynchronous FD non-blocking: %s", - strerror(rte_errno)); + strerror(err)); goto port_error; } /* Allocate protection domain. */ pd = mlx4_glue->alloc_pd(ctx); if (pd == NULL) { - rte_errno = ENOMEM; + err = ENOMEM; ERROR("PD allocation failure"); goto port_error; } @@ -542,7 +648,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) sizeof(*priv), RTE_CACHE_LINE_SIZE); if (priv == NULL) { - rte_errno = ENOMEM; + err = ENOMEM; ERROR("priv allocation failure"); goto port_error; } @@ -562,26 +668,32 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) (device_attr.vendor_part_id == PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO); DEBUG("L2 tunnel checksum offloads are %ssupported", - (priv->hw_csum_l2tun ? "" : "not ")); - priv->hw_rss_sup = device_attr_ex.rss_caps.rx_hash_fields_mask; - if (!priv->hw_rss_sup) { - WARN("no RSS capabilities reported; disabling support" - " for UDP RSS and inner VXLAN RSS"); - /* Fake support for all possible RSS hash fields. */ - priv->hw_rss_sup = ~UINT64_C(0); - priv->hw_rss_sup = mlx4_conv_rss_hf(priv, -1); - /* Filter out known unsupported fields. */ - priv->hw_rss_sup &= - ~(uint64_t)(IBV_RX_HASH_SRC_PORT_UDP | - IBV_RX_HASH_DST_PORT_UDP | - IBV_RX_HASH_INNER); - } + priv->hw_csum_l2tun ? "" : "not "); + priv->hw_rss_sup = mlx4_hw_rss_sup(priv->ctx, priv->pd, + &device_attr_ex); DEBUG("supported RSS hash fields mask: %016" PRIx64, priv->hw_rss_sup); + priv->hw_rss_max_qps = + device_attr_ex.rss_caps.max_rwq_indirection_table_size; + DEBUG("MAX RSS queues %d", priv->hw_rss_max_qps); + priv->hw_fcs_strip = !!(device_attr_ex.raw_packet_caps & + IBV_RAW_PACKET_CAP_SCATTER_FCS); + DEBUG("FCS stripping toggling is %ssupported", + priv->hw_fcs_strip ? "" : "not "); + priv->tso = + ((device_attr_ex.tso_caps.max_tso > 0) && + (device_attr_ex.tso_caps.supported_qpts & + (1 << IBV_QPT_RAW_PACKET))); + if (priv->tso) + priv->tso_max_payload_sz = + device_attr_ex.tso_caps.max_tso; + DEBUG("TSO is %ssupported", + priv->tso ? "" : "not "); /* Configure the first MAC address by default. */ - if (mlx4_get_mac(priv, &mac.addr_bytes)) { + err = mlx4_get_mac(priv, &mac.addr_bytes); + if (err) { ERROR("cannot get MAC address, is mlx4_en loaded?" 
- " (rte_errno: %s)", strerror(rte_errno)); + " (error: %s)", strerror(err)); goto port_error; } INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", @@ -614,8 +726,8 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) eth_dev = rte_eth_dev_allocate(name); } if (eth_dev == NULL) { + err = ENOMEM; ERROR("can not allocate rte ethdev"); - rte_errno = ENOMEM; goto port_error; } eth_dev->data->dev_private = priv; @@ -649,6 +761,24 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) /* Update link status once if waiting for LSC. */ if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) mlx4_link_update(eth_dev, 0); + /* + * Once the device is added to the list of memory event + * callback, its global MR cache table cannot be expanded + * on the fly because of deadlock. If it overflows, lookup + * should be done by searching MR list linearly, which is slow. + */ + err = mlx4_mr_btree_init(&priv->mr.cache, + MLX4_MR_BTREE_CACHE_N * 2, + eth_dev->device->numa_node); + if (err) { + /* rte_errno is already set. */ + goto port_error; + } + /* Add device to memory callback list. */ + rte_rwlock_write_lock(&mlx4_mem_event_rwlock); + LIST_INSERT_HEAD(&mlx4_mem_event_cb_list, priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx4_mem_event_rwlock); + rte_eth_dev_probing_finish(eth_dev); continue; port_error: rte_free(priv); @@ -660,8 +790,6 @@ port_error: rte_eth_dev_release_port(eth_dev); break; } - if (i == device_attr.phys_port_cnt) - return 0; /* * XXX if something went wrong in the loop above, there is a resource * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as @@ -673,8 +801,9 @@ error: claim_zero(mlx4_glue->close_device(attr_ctx)); if (list) mlx4_glue->free_device_list(list); - assert(rte_errno >= 0); - return -rte_errno; + if (err) + rte_errno = err; + return -err; } static const struct rte_pci_id mlx4_pci_id_map[] = { @@ -707,12 +836,54 @@ static struct rte_pci_driver mlx4_driver = { #ifdef RTE_LIBRTE_MLX4_DLOPEN_DEPS +/** + * Suffix RTE_EAL_PMD_PATH with "-glue". + * + * This function performs a sanity check on RTE_EAL_PMD_PATH before + * suffixing its last component. + * + * @param buf[out] + * Output buffer, should be large enough otherwise NULL is returned. + * @param size + * Size of @p out. + * + * @return + * Pointer to @p buf or @p NULL in case suffix cannot be appended. + */ +static char * +mlx4_glue_path(char *buf, size_t size) +{ + static const char *const bad[] = { "/", ".", "..", NULL }; + const char *path = RTE_EAL_PMD_PATH; + size_t len = strlen(path); + size_t off; + int i; + + while (len && path[len - 1] == '/') + --len; + for (off = len; off && path[off - 1] != '/'; --off) + ; + for (i = 0; bad[i]; ++i) + if (!strncmp(path + off, bad[i], (int)(len - off))) + goto error; + i = snprintf(buf, size, "%.*s-glue", (int)len, path); + if (i == -1 || (size_t)i >= size) + goto error; + return buf; +error: + ERROR("unable to append \"-glue\" to last component of" + " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\")," + " please re-configure DPDK"); + return NULL; +} + /** * Initialization routine for run-time dependency on rdma-core. */ static int mlx4_glue_init(void) { + char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; const char *path[] = { /* * A basic security check is necessary before trusting @@ -720,7 +891,13 @@ mlx4_glue_init(void) */ (geteuid() == getuid() && getegid() == getgid() ? 
getenv("MLX4_GLUE_PATH") : NULL), - RTE_EAL_PMD_PATH, + /* + * When RTE_EAL_PMD_PATH is set, use its glue-suffixed + * variant, otherwise let dlopen() look up libraries on its + * own. + */ + (*RTE_EAL_PMD_PATH ? + mlx4_glue_path(glue_path, sizeof(glue_path)) : ""), }; unsigned int i = 0; void *handle = NULL; @@ -790,9 +967,7 @@ glue_error: /** * Driver initialization routine. */ -RTE_INIT(rte_mlx4_pmd_init); -static void -rte_mlx4_pmd_init(void) +RTE_INIT(rte_mlx4_pmd_init) { /* * MLX4_DEVICE_FATAL_CLEANUP tells ibv_destroy functions we @@ -828,6 +1003,8 @@ rte_mlx4_pmd_init(void) } mlx4_glue->fork_init(); rte_pci_register(&mlx4_driver); + rte_mem_event_callback_register("MLX4_MEM_EVENT_CB", + mlx4_mr_mem_event_cb, NULL); } RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 19c8a223..e6fb934f 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2012 6WIND S.A. - * Copyright 2012 Mellanox + * Copyright 2012 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX4_H_ @@ -23,7 +23,9 @@ #include #include #include -#include +#include + +#include "mlx4_mr.h" #ifndef IBV_RX_HASH_INNER /** This is not necessarily defined by supported RDMA core versions. */ @@ -42,20 +44,12 @@ /** Fixed RSS hash key size in bytes. Cannot be modified. */ #define MLX4_RSS_HASH_KEY_SIZE 40 -/** - * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP - * from which buffers are to be transmitted will have to be mapped by this - * driver to their own Memory Region (MR). This is a slow operation. - * - * This value is always 1 for RX queues. - */ -#ifndef MLX4_PMD_TX_MP_CACHE -#define MLX4_PMD_TX_MP_CACHE 8 -#endif - /** Interrupt alarm timeout value in microseconds. */ #define MLX4_INTR_ALARM_TIMEOUT 100000 +/* Maximum packet headers size (L2+L3+L4) for TSO. */ +#define MLX4_MAX_TSO_HEADER 192 + /** Port parameter. */ #define MLX4_PMD_PORT_KVARG "port" @@ -78,20 +72,12 @@ struct rxq; struct txq; struct rte_flow; -/** Memory region descriptor. */ -struct mlx4_mr { - LIST_ENTRY(mlx4_mr) next; /**< Next entry in list. */ - uintptr_t start; /**< Base address for memory region. */ - uintptr_t end; /**< End address for memory region. */ - uint32_t lkey; /**< L_Key extracted from @p mr. */ - uint32_t refcnt; /**< Reference count for this object. */ - struct priv *priv; /**< Back pointer to private data. */ - struct ibv_mr *mr; /**< Memory region associated with @p mp. */ - struct rte_mempool *mp; /**< Target memory pool (mempool). */ -}; +LIST_HEAD(mlx4_dev_list, priv); +LIST_HEAD(mlx4_mr_list, mlx4_mr); /** Private data structure. */ struct priv { + LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */ struct rte_eth_dev *dev; /**< Ethernet device. */ struct ibv_context *ctx; /**< Verbs context. */ struct ibv_device_attr device_attr; /**< Device properties. */ @@ -103,15 +89,25 @@ struct priv { uint32_t vf:1; /**< This is a VF device. */ uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */ uint32_t isolated:1; /**< Toggle isolated mode. */ + uint32_t rss_init:1; /**< Common RSS context is initialized. */ uint32_t hw_csum:1; /**< Checksum offload is supported. */ uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */ + uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */ + uint32_t tso:1; /**< Transmit segmentation offload is supported. */ + uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. 
*/ + uint32_t hw_rss_max_qps; /**< Max Rx Queues supported by RSS. */ uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */ struct rte_intr_handle intr_handle; /**< Port interrupt handle. */ struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */ + struct { + uint32_t dev_gen; /* Generation number to flush local caches. */ + rte_rwlock_t rwlock; /* MR Lock. */ + struct mlx4_mr_btree cache; /* Global MR cache table. */ + struct mlx4_mr_list mr_list; /* Registered MR list. */ + struct mlx4_mr_list mr_free_list; /* Freed MR list. */ + } mr; LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */ LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */ - LIST_HEAD(, mlx4_mr) mr; /**< Registered memory regions. */ - rte_spinlock_t mr_lock; /**< Lock for @p mr access. */ struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES]; /**< Configured MAC addresses. Unused entries are zeroed. */ }; @@ -131,7 +127,7 @@ void mlx4_allmulticast_disable(struct rte_eth_dev *dev); void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index); int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, uint32_t index, uint32_t vmdq); -void mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr); +int mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr); int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on); int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats); void mlx4_stats_reset(struct rte_eth_dev *dev); @@ -154,11 +150,4 @@ void mlx4_rxq_intr_disable(struct priv *priv); int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx); int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx); -/* mlx4_mr.c */ - -struct mlx4_mr *mlx4_mr_get(struct priv *priv, struct rte_mempool *mp); -void mlx4_mr_put(struct mlx4_mr *mr); -uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, - uint32_t i); - #endif /* RTE_PMD_MLX4_H_ */ diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c index 3bc69273..30deb3ef 100644 --- a/drivers/net/mlx4/mlx4_ethdev.c +++ b/drivers/net/mlx4/mlx4_ethdev.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox + * Copyright 2017 Mellanox Technologies, Ltd */ /** @@ -39,6 +39,7 @@ #include #include #include +#include #include "mlx4.h" #include "mlx4_flow.h" @@ -120,7 +121,7 @@ try_dev_id: goto try_dev_id; dev_port_prev = dev_port; if (dev_port == (priv->port - 1u)) - snprintf(match, sizeof(match), "%s", name); + strlcpy(match, name, sizeof(match)); } closedir(dir); if (match[0] == '\0') { @@ -131,167 +132,6 @@ try_dev_id: return 0; } -/** - * Read from sysfs entry. - * - * @param[in] priv - * Pointer to private structure. - * @param[in] entry - * Entry name relative to sysfs path. - * @param[out] buf - * Data output buffer. - * @param size - * Buffer size. - * - * @return - * Number of bytes read on success, negative errno value otherwise and - * rte_errno is set. 
- */ -static int -mlx4_sysfs_read(const struct priv *priv, const char *entry, - char *buf, size_t size) -{ - char ifname[IF_NAMESIZE]; - FILE *file; - int ret; - - ret = mlx4_get_ifname(priv, &ifname); - if (ret) - return ret; - - MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, - ifname, entry); - - file = fopen(path, "rb"); - if (file == NULL) { - rte_errno = errno; - return -rte_errno; - } - ret = fread(buf, 1, size, file); - if ((size_t)ret < size && ferror(file)) { - rte_errno = EIO; - ret = -rte_errno; - } else { - ret = size; - } - fclose(file); - return ret; -} - -/** - * Write to sysfs entry. - * - * @param[in] priv - * Pointer to private structure. - * @param[in] entry - * Entry name relative to sysfs path. - * @param[in] buf - * Data buffer. - * @param size - * Buffer size. - * - * @return - * Number of bytes written on success, negative errno value otherwise and - * rte_errno is set. - */ -static int -mlx4_sysfs_write(const struct priv *priv, const char *entry, - char *buf, size_t size) -{ - char ifname[IF_NAMESIZE]; - FILE *file; - int ret; - - ret = mlx4_get_ifname(priv, &ifname); - if (ret) - return ret; - - MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, - ifname, entry); - - file = fopen(path, "wb"); - if (file == NULL) { - rte_errno = errno; - return -rte_errno; - } - ret = fwrite(buf, 1, size, file); - if ((size_t)ret < size || ferror(file)) { - rte_errno = EIO; - ret = -rte_errno; - } else { - ret = size; - } - fclose(file); - return ret; -} - -/** - * Get unsigned long sysfs property. - * - * @param priv - * Pointer to private structure. - * @param[in] name - * Entry name relative to sysfs path. - * @param[out] value - * Value output buffer. - * - * @return - * 0 on success, negative errno value otherwise and rte_errno is set. - */ -static int -mlx4_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) -{ - int ret; - unsigned long value_ret; - char value_str[32]; - - ret = mlx4_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); - if (ret < 0) { - DEBUG("cannot read %s value from sysfs: %s", - name, strerror(rte_errno)); - return ret; - } - value_str[ret] = '\0'; - errno = 0; - value_ret = strtoul(value_str, NULL, 0); - if (errno) { - rte_errno = errno; - DEBUG("invalid %s value `%s': %s", name, value_str, - strerror(rte_errno)); - return -rte_errno; - } - *value = value_ret; - return 0; -} - -/** - * Set unsigned long sysfs property. - * - * @param priv - * Pointer to private structure. - * @param[in] name - * Entry name relative to sysfs path. - * @param value - * Value to set. - * - * @return - * 0 on success, negative errno value otherwise and rte_errno is set. - */ -static int -mlx4_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) -{ - int ret; - MKSTR(value_str, "%lu", value); - - ret = mlx4_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); - if (ret < 0) { - DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", - name, value_str, value, strerror(rte_errno)); - return ret; - } - return 0; -} - /** * Perform ifreq ioctl() on associated Ethernet device. 
* @@ -361,12 +201,12 @@ mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]) int mlx4_mtu_get(struct priv *priv, uint16_t *mtu) { - unsigned long ulong_mtu = 0; - int ret = mlx4_get_sysfs_ulong(priv, "mtu", &ulong_mtu); + struct ifreq request; + int ret = mlx4_ifreq(priv, SIOCGIFMTU, &request); if (ret) return ret; - *mtu = ulong_mtu; + *mtu = request.ifr_mtu; return 0; } @@ -385,20 +225,13 @@ int mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu) { struct priv *priv = dev->data->dev_private; - uint16_t new_mtu; - int ret = mlx4_set_sysfs_ulong(priv, "mtu", mtu); + struct ifreq request = { .ifr_mtu = mtu, }; + int ret = mlx4_ifreq(priv, SIOCSIFMTU, &request); if (ret) return ret; - ret = mlx4_mtu_get(priv, &new_mtu); - if (ret) - return ret; - if (new_mtu == mtu) { - priv->mtu = mtu; - return 0; - } - rte_errno = EINVAL; - return -rte_errno; + priv->mtu = mtu; + return 0; } /** @@ -417,14 +250,14 @@ mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu) static int mlx4_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) { - unsigned long tmp = 0; - int ret = mlx4_get_sysfs_ulong(priv, "flags", &tmp); + struct ifreq request; + int ret = mlx4_ifreq(priv, SIOCGIFFLAGS, &request); if (ret) return ret; - tmp &= keep; - tmp |= (flags & (~keep)); - return mlx4_set_sysfs_ulong(priv, "flags", tmp); + request.ifr_flags &= keep; + request.ifr_flags |= flags & ~keep; + return mlx4_ifreq(priv, SIOCSIFFLAGS, &request); } /** @@ -701,11 +534,14 @@ mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) * Pointer to Ethernet device structure. * @param mac_addr * MAC address to register. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. */ -void +int mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr) { - mlx4_mac_addr_add(dev, mac_addr, 0, 0); + return mlx4_mac_addr_add(dev, mac_addr, 0, 0); } /** @@ -723,7 +559,6 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) unsigned int max; char ifname[IF_NAMESIZE]; - info->pci_dev = RTE_ETH_DEV_TO_PCI(dev); /* FIXME: we should ask the device for these values. */ info->min_rx_bufsize = 32; info->max_rx_pktlen = 65536; @@ -752,6 +587,7 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) ETH_LINK_SPEED_20G | ETH_LINK_SPEED_40G | ETH_LINK_SPEED_56G; + info->flow_type_rss_offloads = mlx4_conv_rss_types(priv, 0, 1); } /** @@ -878,7 +714,7 @@ mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete) } link_speed = ethtool_cmd_speed(&edata); if (link_speed == -1) - dev_link.link_speed = 0; + dev_link.link_speed = ETH_SPEED_NUM_NONE; else dev_link.link_speed = link_speed; dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c index 2d55bfe0..b40e7e5c 100644 --- a/drivers/net/mlx4/mlx4_flow.c +++ b/drivers/net/mlx4/mlx4_flow.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox + * Copyright 2017 Mellanox Technologies, Ltd */ /** @@ -76,69 +76,94 @@ struct mlx4_drop { }; /** - * Convert DPDK RSS hash fields to their Verbs equivalent. + * Convert supported RSS hash field types between DPDK and Verbs formats. * - * This function returns the supported (default) set when @p rss_hf has - * special value (uint64_t)-1. + * This function returns the supported (default) set when @p types has + * special value 0. * * @param priv * Pointer to private structure. 
- * @param rss_hf - * Hash fields in DPDK format (see struct rte_eth_rss_conf). + * @param types + * Depending on @p verbs_to_dpdk, hash types in either DPDK (see struct + * rte_eth_rss_conf) or Verbs format. + * @param verbs_to_dpdk + * A zero value converts @p types from DPDK to Verbs, a nonzero value + * performs the reverse operation. * * @return - * A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1 - * otherwise and rte_errno is set. + * Converted RSS hash fields on success, (uint64_t)-1 otherwise and + * rte_errno is set. */ uint64_t -mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf) +mlx4_conv_rss_types(struct priv *priv, uint64_t types, int verbs_to_dpdk) { - enum { IPV4, IPV6, TCP, UDP, }; - const uint64_t in[] = { - [IPV4] = (ETH_RSS_IPV4 | - ETH_RSS_FRAG_IPV4 | - ETH_RSS_NONFRAG_IPV4_TCP | - ETH_RSS_NONFRAG_IPV4_UDP | - ETH_RSS_NONFRAG_IPV4_OTHER), - [IPV6] = (ETH_RSS_IPV6 | - ETH_RSS_FRAG_IPV6 | - ETH_RSS_NONFRAG_IPV6_TCP | - ETH_RSS_NONFRAG_IPV6_UDP | - ETH_RSS_NONFRAG_IPV6_OTHER | - ETH_RSS_IPV6_EX | - ETH_RSS_IPV6_TCP_EX | - ETH_RSS_IPV6_UDP_EX), - [TCP] = (ETH_RSS_NONFRAG_IPV4_TCP | - ETH_RSS_NONFRAG_IPV6_TCP | - ETH_RSS_IPV6_TCP_EX), - [UDP] = (ETH_RSS_NONFRAG_IPV4_UDP | - ETH_RSS_NONFRAG_IPV6_UDP | - ETH_RSS_IPV6_UDP_EX), + enum { + INNER, + IPV4, IPV4_1, IPV4_2, IPV6, IPV6_1, IPV6_2, IPV6_3, + TCP, UDP, + IPV4_TCP, IPV4_UDP, IPV6_TCP, IPV6_TCP_1, IPV6_UDP, IPV6_UDP_1, }; - const uint64_t out[RTE_DIM(in)] = { - [IPV4] = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4, - [IPV6] = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6, - [TCP] = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP, - [UDP] = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP, + enum { + VERBS_IPV4 = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4, + VERBS_IPV6 = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6, + VERBS_TCP = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP, + VERBS_UDP = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP, }; + static const uint64_t dpdk[] = { + [INNER] = 0, + [IPV4] = ETH_RSS_IPV4, + [IPV4_1] = ETH_RSS_FRAG_IPV4, + [IPV4_2] = ETH_RSS_NONFRAG_IPV4_OTHER, + [IPV6] = ETH_RSS_IPV6, + [IPV6_1] = ETH_RSS_FRAG_IPV6, + [IPV6_2] = ETH_RSS_NONFRAG_IPV6_OTHER, + [IPV6_3] = ETH_RSS_IPV6_EX, + [TCP] = 0, + [UDP] = 0, + [IPV4_TCP] = ETH_RSS_NONFRAG_IPV4_TCP, + [IPV4_UDP] = ETH_RSS_NONFRAG_IPV4_UDP, + [IPV6_TCP] = ETH_RSS_NONFRAG_IPV6_TCP, + [IPV6_TCP_1] = ETH_RSS_IPV6_TCP_EX, + [IPV6_UDP] = ETH_RSS_NONFRAG_IPV6_UDP, + [IPV6_UDP_1] = ETH_RSS_IPV6_UDP_EX, + }; + static const uint64_t verbs[RTE_DIM(dpdk)] = { + [INNER] = IBV_RX_HASH_INNER, + [IPV4] = VERBS_IPV4, + [IPV4_1] = VERBS_IPV4, + [IPV4_2] = VERBS_IPV4, + [IPV6] = VERBS_IPV6, + [IPV6_1] = VERBS_IPV6, + [IPV6_2] = VERBS_IPV6, + [IPV6_3] = VERBS_IPV6, + [TCP] = VERBS_TCP, + [UDP] = VERBS_UDP, + [IPV4_TCP] = VERBS_IPV4 | VERBS_TCP, + [IPV4_UDP] = VERBS_IPV4 | VERBS_UDP, + [IPV6_TCP] = VERBS_IPV6 | VERBS_TCP, + [IPV6_TCP_1] = VERBS_IPV6 | VERBS_TCP, + [IPV6_UDP] = VERBS_IPV6 | VERBS_UDP, + [IPV6_UDP_1] = VERBS_IPV6 | VERBS_UDP, + }; + const uint64_t *in = verbs_to_dpdk ? verbs : dpdk; + const uint64_t *out = verbs_to_dpdk ? 
dpdk : verbs; uint64_t seen = 0; uint64_t conv = 0; unsigned int i; - for (i = 0; i != RTE_DIM(in); ++i) - if (rss_hf & in[i]) { - seen |= rss_hf & in[i]; + if (!types) { + if (!verbs_to_dpdk) + return priv->hw_rss_sup; + types = priv->hw_rss_sup; + } + for (i = 0; i != RTE_DIM(dpdk); ++i) + if (in[i] && (types & in[i]) == in[i]) { + seen |= types & in[i]; conv |= out[i]; } - if ((conv & priv->hw_rss_sup) == conv) { - if (rss_hf == (uint64_t)-1) { - /* Include inner RSS by default if supported. */ - conv |= priv->hw_rss_sup & IBV_RX_HASH_INNER; - return conv; - } - if (!(rss_hf & ~seen)) - return conv; - } + if ((verbs_to_dpdk || (conv & priv->hw_rss_sup) == conv) && + !(types & ~seen)) + return conv; rte_errno = ENOTSUP; return (uint64_t)-1; } @@ -362,6 +387,9 @@ error: * Additional mlx4-specific constraints on supported fields: * * - No support for partial masks. + * - Due to HW/FW limitation, flow rule priority is not taken into account + * when matching UDP destination ports, doing so is therefore only supported + * at the highest priority level (0). * * @param[in, out] flow * Flow rule handle to update. @@ -393,6 +421,11 @@ mlx4_flow_merge_udp(struct rte_flow *flow, msg = "mlx4 does not support matching partial UDP fields"; goto error; } + if (mask && mask->hdr.dst_port && flow->priority) { + msg = "combining UDP destination port matching with a nonzero" + " priority level is not supported"; + goto error; + } if (!flow->ibv_attr) return 0; ++flow->ibv_attr->num_of_specs; @@ -637,6 +670,7 @@ mlx4_flow_prepare(struct priv *priv, struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) }; struct rte_flow *flow = &temp; const char *msg = NULL; + int overlap; if (attr->group) return rte_flow_error_set @@ -651,12 +685,18 @@ mlx4_flow_prepare(struct priv *priv, return rte_flow_error_set (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, "egress is not supported"); + if (attr->transfer) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, + NULL, "transfer is not supported"); if (!attr->ingress) return rte_flow_error_set (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, NULL, "only ingress is supported"); fill: + overlap = 0; proc = mlx4_flow_proc_item_list; + flow->priority = attr->priority; /* Go over pattern. */ for (item = pattern; item->type; ++item) { const struct mlx4_flow_proc_item *next = NULL; @@ -702,14 +742,24 @@ fill: } /* Go over actions list. */ for (action = actions; action->type; ++action) { + /* This one may appear anywhere multiple times. */ + if (action->type == RTE_FLOW_ACTION_TYPE_VOID) + continue; + /* Fate-deciding actions may appear exactly once. */ + if (overlap) { + msg = "cannot combine several fate-deciding actions," + " choose between DROP, QUEUE or RSS"; + goto exit_action_not_supported; + } + overlap = 1; switch (action->type) { const struct rte_flow_action_queue *queue; const struct rte_flow_action_rss *rss; - const struct rte_eth_rss_conf *rss_conf; + const uint8_t *rss_key; + uint32_t rss_key_len; + uint64_t fields; unsigned int i; - case RTE_FLOW_ACTION_TYPE_VOID: - continue; case RTE_FLOW_ACTION_TYPE_DROP: flow->drop = 1; break; @@ -736,54 +786,68 @@ fill: break; rss = action->conf; /* Default RSS configuration if none is provided. */ - rss_conf =
- rss->rss_conf : - &(struct rte_eth_rss_conf){ - .rss_key = mlx4_rss_hash_key_default, - .rss_key_len = MLX4_RSS_HASH_KEY_SIZE, - .rss_hf = -1, - }; + if (rss->key_len) { + rss_key = rss->key; + rss_key_len = rss->key_len; + } else { + rss_key = mlx4_rss_hash_key_default; + rss_key_len = MLX4_RSS_HASH_KEY_SIZE; + } /* Sanity checks. */ - for (i = 0; i < rss->num; ++i) + for (i = 0; i < rss->queue_num; ++i) if (rss->queue[i] >= priv->dev->data->nb_rx_queues) break; - if (i != rss->num) { + if (i != rss->queue_num) { msg = "queue index target beyond number of" " configured Rx queues"; goto exit_action_not_supported; } - if (!rte_is_power_of_2(rss->num)) { + if (!rte_is_power_of_2(rss->queue_num)) { msg = "for RSS, mlx4 requires the number of" " queues to be a power of two"; goto exit_action_not_supported; } - if (rss_conf->rss_key_len != - sizeof(flow->rss->key)) { + if (rss_key_len != sizeof(flow->rss->key)) { msg = "mlx4 supports exactly one RSS hash key" " length: " MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE); goto exit_action_not_supported; } - for (i = 1; i < rss->num; ++i) + for (i = 1; i < rss->queue_num; ++i) if (rss->queue[i] - rss->queue[i - 1] != 1) break; - if (i != rss->num) { + if (i != rss->queue_num) { msg = "mlx4 requires RSS contexts to use" " consecutive queue indices only"; goto exit_action_not_supported; } - if (rss->queue[0] % rss->num) { + if (rss->queue[0] % rss->queue_num) { msg = "mlx4 requires the first queue of a RSS" " context to be aligned on a multiple" " of the context size"; goto exit_action_not_supported; } + if (rss->func && + rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) { + msg = "the only supported RSS hash function" + " is Toeplitz"; + goto exit_action_not_supported; + } + if (rss->level) { + msg = "a nonzero RSS encapsulation level is" + " not supported"; + goto exit_action_not_supported; + } + rte_errno = 0; + fields = mlx4_conv_rss_types(priv, rss->types, 0); + if (fields == (uint64_t)-1 && rte_errno) { + msg = "unsupported RSS hash type requested"; + goto exit_action_not_supported; + } flow->rss = mlx4_rss_get - (priv, - mlx4_conv_rss_hf(priv, rss_conf->rss_hf), - rss_conf->rss_key, rss->num, rss->queue); + (priv, fields, rss_key, rss->queue_num, + rss->queue); if (!flow->rss) { msg = "either invalid parameters or not enough" " resources for additional multi-queue" @@ -795,10 +859,9 @@ fill: goto exit_action_not_supported; } } - if (!flow->rss && !flow->drop) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "no valid action"); + /* When fate is unknown, drop traffic. */ + if (!overlap) + flow->drop = 1; /* Validation ends here. */ if (!addr) { if (flow->rss) @@ -820,11 +883,14 @@ fill: }, }; - if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec))) + if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec))) { + if (temp.rss) + mlx4_rss_put(temp.rss); return rte_flow_error_set (error, -rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, "flow rule handle allocation failure"); + } /* Most fields will be updated by second pass. 
*/ *flow = (struct rte_flow){ .ibv_attr = temp.ibv_attr, @@ -1264,14 +1330,20 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error) */ uint32_t queues = rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1; - alignas(struct rte_flow_action_rss) uint8_t rss_conf_data - [offsetof(struct rte_flow_action_rss, queue) + - sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues]; - struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data; + uint16_t queue[queues]; + struct rte_flow_action_rss action_rss = { + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .level = 0, + .types = 0, + .key_len = MLX4_RSS_HASH_KEY_SIZE, + .queue_num = queues, + .key = mlx4_rss_hash_key_default, + .queue = queue, + }; struct rte_flow_action actions[] = { { .type = RTE_FLOW_ACTION_TYPE_RSS, - .conf = rss_conf, + .conf = &action_rss, }, { .type = RTE_FLOW_ACTION_TYPE_END, @@ -1293,12 +1365,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error) if (!queues) goto error; /* Prepare default RSS configuration. */ - *rss_conf = (struct rte_flow_action_rss){ - .rss_conf = NULL, /* Rely on default fallback settings. */ - .num = queues, - }; for (i = 0; i != queues; ++i) - rss_conf->queue[i] = i; + queue[i] = i; /* * Set up VLAN item if filtering is enabled and at least one VLAN * filter is configured. @@ -1357,7 +1425,7 @@ next_vlan: if (j != sizeof(mac->addr_bytes)) continue; if (flow->rss->queues != queues || - memcmp(flow->rss->queue_id, rss_conf->queue, + memcmp(flow->rss->queue_id, action_rss.queue, queues * sizeof(flow->rss->queue_id[0]))) continue; break; @@ -1397,7 +1465,7 @@ next_vlan: if (flow && flow->internal) { assert(flow->rss); if (flow->rss->queues != queues || - memcmp(flow->rss->queue_id, rss_conf->queue, + memcmp(flow->rss->queue_id, action_rss.queue, queues * sizeof(flow->rss->queue_id[0]))) flow = NULL; } diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h index 00188a65..2917ebe9 100644 --- a/drivers/net/mlx4/mlx4_flow.h +++ b/drivers/net/mlx4/mlx4_flow.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox + * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX4_FLOW_H_ @@ -42,12 +42,14 @@ struct rte_flow { uint32_t promisc:1; /**< This rule matches everything. */ uint32_t allmulti:1; /**< This rule matches all multicast traffic. */ uint32_t drop:1; /**< This rule drops packets. */ + uint32_t priority; /**< Flow rule priority. */ struct mlx4_rss *rss; /**< Rx target. */ }; /* mlx4_flow.c */ -uint64_t mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf); +uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t types, + int verbs_to_dpdk); int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error); void mlx4_flow_clean(struct priv *priv); int mlx4_filter_ctrl(struct rte_eth_dev *dev, diff --git a/drivers/net/mlx4/mlx4_glue.c b/drivers/net/mlx4/mlx4_glue.c index 3b79d320..67b3bfac 100644 --- a/drivers/net/mlx4/mlx4_glue.c +++ b/drivers/net/mlx4/mlx4_glue.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2018 6WIND S.A. - * Copyright 2018 Mellanox + * Copyright 2018 Mellanox Technologies, Ltd */ #include diff --git a/drivers/net/mlx4/mlx4_glue.h b/drivers/net/mlx4/mlx4_glue.h index 368f906b..668ca867 100644 --- a/drivers/net/mlx4/mlx4_glue.h +++ b/drivers/net/mlx4/mlx4_glue.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2018 6WIND S.A. 
- * Copyright 2018 Mellanox + * Copyright 2018 Mellanox Technologies, Ltd */ #ifndef MLX4_GLUE_H_ diff --git a/drivers/net/mlx4/mlx4_intr.c b/drivers/net/mlx4/mlx4_intr.c index 2141992e..eeb982a0 100644 --- a/drivers/net/mlx4/mlx4_intr.c +++ b/drivers/net/mlx4/mlx4_intr.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox + * Copyright 2017 Mellanox Technologies, Ltd */ /** diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c index 9a1e4de3..d23d3c61 100644 --- a/drivers/net/mlx4/mlx4_mr.c +++ b/drivers/net/mlx4/mlx4_mr.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox + * Copyright 2017 Mellanox Technologies, Ltd */ /** @@ -30,237 +30,1152 @@ #include #include #include -#include +#include #include "mlx4_glue.h" +#include "mlx4_mr.h" #include "mlx4_rxtx.h" #include "mlx4_utils.h" -struct mlx4_check_mempool_data { +struct mr_find_contig_memsegs_data { + uintptr_t addr; + uintptr_t start; + uintptr_t end; + const struct rte_memseg_list *msl; +}; + +struct mr_update_mp_data { + struct rte_eth_dev *dev; + struct mlx4_mr_ctrl *mr_ctrl; int ret; - char *start; - char *end; }; /** - * Called by mlx4_check_mempool() when iterating the memory chunks. - * - * @param[in] mp - * Pointer to memory pool (unused). - * @param[in, out] data - * Pointer to shared buffer with mlx4_check_mempool(). - * @param[in] memhdr - * Pointer to mempool chunk header. - * @param mem_idx - * Mempool element index (unused). + * Expand B-tree table to a given size. Can't be called while holding + * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc(). + * + * @param bt + * Pointer to B-tree structure. + * @param n + * Number of entries for expansion. + * + * @return + * 0 on success, -1 on failure. */ -static void -mlx4_check_mempool_cb(struct rte_mempool *mp, void *opaque, - struct rte_mempool_memhdr *memhdr, - unsigned int mem_idx) +static int +mr_btree_expand(struct mlx4_mr_btree *bt, int n) +{ + void *mem; + int ret = 0; + + if (n <= bt->size) + return ret; + /* + * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is + * used inside if there's no room to expand. Because this is quite a + * rare case and part of a very slow path, it is acceptable. + * Initially cache_bh[] will be given practically enough space and once + * it is expanded, expansion wouldn't be needed again ever. + */ + mem = rte_realloc(bt->table, n * sizeof(struct mlx4_mr_cache), 0); + if (mem == NULL) { + /* Not an error, B-tree search will be skipped. */ + WARN("failed to expand MR B-tree (%p) table", (void *)bt); + ret = -1; + } else { + DEBUG("expanded MR B-tree table (size=%u)", n); + bt->table = mem; + bt->size = n; + } + return ret; +} + +/** + * Look up the LKey from the given B-tree lookup table, store the last index, + * and return the searched LKey. + * + * @param bt + * Pointer to B-tree structure. + * @param[out] idx + * Pointer to index. Even on search failure, returns index where it stops + * searching so that index can be used when inserting a new entry. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mr_btree_lookup(struct mlx4_mr_btree *bt, uint16_t *idx, uintptr_t addr) +{ + struct mlx4_mr_cache *lkp_tbl; + uint16_t n; + uint16_t base = 0; + + assert(bt != NULL); + lkp_tbl = *bt->table; + n = bt->len; + /* First entry must be NULL for comparison.
*/ + assert(bt->len > 0 || (lkp_tbl[0].start == 0 && + lkp_tbl[0].lkey == UINT32_MAX)); + /* Binary search. */ + do { + register uint16_t delta = n >> 1; + + if (addr < lkp_tbl[base + delta].start) { + n = delta; + } else { + base += delta; + n -= delta; + } + } while (n > 1); + assert(addr >= lkp_tbl[base].start); + *idx = base; + if (addr < lkp_tbl[base].end) + return lkp_tbl[base].lkey; + /* Not found. */ + return UINT32_MAX; +} + +/** + * Insert an entry to B-tree lookup table. + * + * @param bt + * Pointer to B-tree structure. + * @param entry + * Pointer to new entry to insert. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_btree_insert(struct mlx4_mr_btree *bt, struct mlx4_mr_cache *entry) { - struct mlx4_check_mempool_data *data = opaque; + struct mlx4_mr_cache *lkp_tbl; + uint16_t idx = 0; + size_t shift; - (void)mp; - (void)mem_idx; - /* It already failed, skip the next chunks. */ - if (data->ret != 0) + assert(bt != NULL); + assert(bt->len <= bt->size); + assert(bt->len > 0); + lkp_tbl = *bt->table; + /* Find out the slot for insertion. */ + if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) { + DEBUG("abort insertion to B-tree(%p): already exist at" + " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + /* Already exist, return. */ + return 0; + } + /* If table is full, return error. */ + if (unlikely(bt->len == bt->size)) { + bt->overflow = 1; + return -1; + } + /* Insert entry. */ + ++idx; + shift = (bt->len - idx) * sizeof(struct mlx4_mr_cache); + if (shift) + memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift); + lkp_tbl[idx] = *entry; + bt->len++; + DEBUG("inserted B-tree(%p)[%u]," + " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + return 0; +} + +/** + * Initialize B-tree and allocate memory for lookup table. + * + * @param bt + * Pointer to B-tree structure. + * @param n + * Number of entries to allocate. + * @param socket + * NUMA socket on which memory must be allocated. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket) +{ + if (bt == NULL) { + rte_errno = EINVAL; + return -rte_errno; + } + memset(bt, 0, sizeof(*bt)); + bt->table = rte_calloc_socket("B-tree table", + n, sizeof(struct mlx4_mr_cache), + 0, socket); + if (bt->table == NULL) { + rte_errno = ENOMEM; + ERROR("failed to allocate memory for btree cache on socket %d", + socket); + return -rte_errno; + } + bt->size = n; + /* First entry must be NULL for binary search. */ + (*bt->table)[bt->len++] = (struct mlx4_mr_cache) { + .lkey = UINT32_MAX, + }; + DEBUG("initialized B-tree %p with table %p", + (void *)bt, (void *)bt->table); + return 0; +} + +/** + * Free B-tree resources. + * + * @param bt + * Pointer to B-tree structure. + */ +void +mlx4_mr_btree_free(struct mlx4_mr_btree *bt) +{ + if (bt == NULL) return; - /* It is the first chunk. */ - if (data->start == NULL && data->end == NULL) { - data->start = memhdr->addr; - data->end = data->start + memhdr->len; + DEBUG("freeing B-tree %p with table %p", (void *)bt, (void *)bt->table); + rte_free(bt->table); + memset(bt, 0, sizeof(*bt)); +} + +#ifndef NDEBUG +/** + * Dump all the entries in a B-tree + * + * @param bt + * Pointer to B-tree structure. 
+ */ +void +mlx4_mr_btree_dump(struct mlx4_mr_btree *bt) +{ + int idx; + struct mlx4_mr_cache *lkp_tbl; + + if (bt == NULL) return; + lkp_tbl = *bt->table; + for (idx = 0; idx < bt->len; ++idx) { + struct mlx4_mr_cache *entry = &lkp_tbl[idx]; + + DEBUG("B-tree(%p)[%u]," + " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); } +} +#endif + +/** + * Find a virtually contiguous memory chunk in a given MR. + * + * @param mr + * Pointer to MR structure. + * @param[out] entry + * Pointer to returning MR cache entry. If not found, this will not be + * updated. + * @param base_idx + * Start index of the memseg bitmap. + * + * @return + * Next index to go on lookup. + */ +static int +mr_find_next_chunk(struct mlx4_mr *mr, struct mlx4_mr_cache *entry, + int base_idx) +{ + uintptr_t start = 0; + uintptr_t end = 0; + uint32_t idx = 0; + + for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) { + if (rte_bitmap_get(mr->ms_bmp, idx)) { + const struct rte_memseg_list *msl; + const struct rte_memseg *ms; + + msl = mr->msl; + ms = rte_fbarray_get(&msl->memseg_arr, + mr->ms_base_idx + idx); + assert(msl->page_sz == ms->hugepage_sz); + if (!start) + start = ms->addr_64; + end = ms->addr_64 + ms->hugepage_sz; + } else if (start) { + /* Passed the end of a fragment. */ + break; + } } + if (start) { + /* Found one chunk. */ + entry->start = start; + entry->end = end; + entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey); + } + return idx; } /** + * Insert an MR into the global B-tree cache. It may fail due to low-on-memory. + * Then, this entry will have to be searched by mr_lookup_dev_list() in + * mlx4_mr_create() on miss. + * + * @param dev + * Pointer to Ethernet device. + * @param mr + * Pointer to MR to insert. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx4_mr *mr) +{ + struct priv *priv = dev->data->dev_private; + unsigned int n; + + DEBUG("port %u inserting MR(%p) to global cache", + dev->data->port_id, (void *)mr); + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx4_mr_cache entry = { 0, }; + + /* Find a contiguous chunk and advance the index. */ + n = mr_find_next_chunk(mr, &entry, n); + if (!entry.end) + break; + if (mr_btree_insert(&priv->mr.cache, &entry) < 0) { + /* + * Overflowed, but the global table cannot be expanded + * because of deadlock. + */ + return -1; + } + } + return 0; +} + +/** + * Look up address in the original global MR list.
* + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry. If no match, this will not be updated. + * @param addr + * Search key. * + * @return + * Found MR on match, NULL otherwise. + */ +static struct mlx4_mr * +mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, + uintptr_t addr) +{ + struct priv *priv = dev->data->dev_private; + struct mlx4_mr *mr; + + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + unsigned int n; + + if (mr->ms_n == 0) + continue; + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx4_mr_cache ret = { 0, }; + + n = mr_find_next_chunk(mr, &ret, n); + if (addr >= ret.start && addr < ret.end) { + /* Found. */ + *entry = ret; + return mr; + } + } + } + return NULL; +} + +/** + * Look up address on device. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry. If no match, this will not be updated. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. + */ +static uint32_t +mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, + uintptr_t addr) +{ + struct priv *priv = dev->data->dev_private; + uint16_t idx; + uint32_t lkey = UINT32_MAX; + struct mlx4_mr *mr; + + /* + * If the global cache has overflowed since it failed to expand the + * B-tree table, it can't have all the existing MRs. Then, the address + * has to be searched by traversing the original MR list instead, which + * is a very slow path. Otherwise, the global cache is all inclusive. + */ + if (!unlikely(priv->mr.cache.overflow)) { + lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (lkey != UINT32_MAX) + *entry = (*priv->mr.cache.table)[idx]; + } else { + /* Falling back to the slowest path. */ + mr = mr_lookup_dev_list(dev, entry, addr); + if (mr != NULL) + lkey = entry->lkey; + } + assert(lkey == UINT32_MAX || (addr >= entry->start && + addr < entry->end)); + return lkey; +} + +/** + * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() + * can raise a memory free event and the callback function will spin on the lock. + * + * @param mr + * Pointer to MR to free.
+ */ +static void +mr_free(struct mlx4_mr *mr) +{ + if (mr == NULL) + return; + DEBUG("freeing MR(%p):", (void *)mr); + if (mr->ibv_mr != NULL) + claim_zero(mlx4_glue->dereg_mr(mr->ibv_mr)); + if (mr->ms_bmp != NULL) + rte_bitmap_free(mr->ms_bmp); + rte_free(mr); +} + +/** + * Release resources of detached MRs having no online entry. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx4_mr_garbage_collect(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx4_mr *mr_next; + struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); + + /* + * An MR can't be freed while holding the lock because rte_free() could + * call the memory free callback function. This would be a deadlock + * situation. + */ + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Detach the whole free list and release it after unlocking. */ + free_list = priv->mr.mr_free_list; + LIST_INIT(&priv->mr.mr_free_list); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Release resources. */ + mr_next = LIST_FIRST(&free_list); + while (mr_next != NULL) { + struct mlx4_mr *mr = mr_next; + + mr_next = LIST_NEXT(mr, mr); + mr_free(mr); + } +} + +/* Called during rte_memseg_contig_walk() by mlx4_mr_create(). */ +static int +mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + struct mr_find_contig_memsegs_data *data = arg; + + if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) + return 0; + /* Found, save it and stop walking. */ + data->start = ms->addr_64; + data->end = ms->addr_64 + len; + data->msl = msl; + return 1; +} + +/** + * Create a new global Memory Region (MR) for a missing virtual address. + * Register the entire virtually contiguous memory chunk around the address. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If creation fails, this will not be updated. + * @param addr + * Target virtual address to register. + * + * @return + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. + */ +static uint32_t +mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, + uintptr_t addr) +{ + struct priv *priv = dev->data->dev_private; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + const struct rte_memseg_list *msl; + const struct rte_memseg *ms; + struct mlx4_mr *mr = NULL; + size_t len; + uint32_t ms_n; + uint32_t bmp_size; + void *bmp_mem; + int ms_idx_shift = -1; + unsigned int n; + struct mr_find_contig_memsegs_data data = { + .addr = addr, + }; + struct mr_find_contig_memsegs_data data_re; + + DEBUG("port %u creating a MR using address (%p)", + dev->data->port_id, (void *)addr); + /* + * Release detached MRs if any. This can't be called while holding + * either memory_hotplug_lock or priv->mr.rwlock. MRs on the free list + * have been detached by the memory free event but couldn't be released + * inside the callback due to deadlock. As a result, releasing resources + * is quite opportunistic.
+ */
+ mlx4_mr_garbage_collect(dev);
+ /*
+ * Find out a contiguous virtual address chunk in use, to which the
+ * given address belongs, in order to register maximum range. In the
+ * best case where mempools are not dynamically recreated and
+ * '--socket-mem' is specified as an EAL option, it is very likely to
+ * have only one MR(LKey) per socket and per hugepage size even
+ * though the system memory is highly fragmented.
+ */
+ if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
+ WARN("port %u unable to find virtually contiguous"
+ " chunk for address (%p)."
+ " rte_memseg_contig_walk() failed.",
+ dev->data->port_id, (void *)addr);
+ rte_errno = ENXIO;
+ goto err_nolock;
 }
- mr = rte_malloc(__func__, sizeof(*mr), 0);
- if (!mr) {
+alloc_resources:
+ /* Addresses must be page-aligned. */
+ assert(rte_is_aligned((void *)data.start, data.msl->page_sz));
+ assert(rte_is_aligned((void *)data.end, data.msl->page_sz));
+ msl = data.msl;
+ ms = rte_mem_virt2memseg((void *)data.start, msl);
+ len = data.end - data.start;
+ assert(msl->page_sz == ms->hugepage_sz);
+ /* Number of memsegs in the range. */
+ ms_n = len / msl->page_sz;
+ DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
+ " page_sz=0x%" PRIx64 ", ms_n=%u",
+ dev->data->port_id, (void *)addr,
+ data.start, data.end, msl->page_sz, ms_n);
+ /* Size of memory for bitmap. */
+ bmp_size = rte_bitmap_get_memory_footprint(ms_n);
+ mr = rte_zmalloc_socket(NULL,
+ RTE_ALIGN_CEIL(sizeof(*mr),
+ RTE_CACHE_LINE_SIZE) +
+ bmp_size,
+ RTE_CACHE_LINE_SIZE, msl->socket_id);
+ if (mr == NULL) {
+ WARN("port %u unable to allocate memory for a new MR of"
+ " address (%p).",
+ dev->data->port_id, (void *)addr);
 rte_errno = ENOMEM;
- goto release;
+ goto err_nolock;
 }
- *mr = (struct mlx4_mr){
- .start = start,
- .end = end,
- .refcnt = 1,
- .priv = priv,
- .mr = mlx4_glue->reg_mr(priv->pd, (void *)start, end - start,
- IBV_ACCESS_LOCAL_WRITE),
- .mp = mp,
- };
- if (mr->mr) {
- mr->lkey = mr->mr->lkey;
- LIST_INSERT_HEAD(&priv->mr, mr, next);
- } else {
- rte_free(mr);
- mr = NULL;
- rte_errno = errno ? errno : EINVAL;
+ mr->msl = msl;
+ /*
+ * Save the index of the first memseg and initialize memseg bitmap. To
+ * see if a memseg of ms_idx in the memseg-list is still valid, check:
+ * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx)
+ */
+ mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
+ mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
+ if (mr->ms_bmp == NULL) {
+ WARN("port %u unable to initialize bitmap for a new MR of"
+ " address (%p).",
+ dev->data->port_id, (void *)addr);
+ rte_errno = EINVAL;
+ goto err_nolock;
+ }
+ /*
+ * Should recheck whether the extended contiguous chunk is still valid.
+ * Because memory_hotplug_lock can't be held if there are any
+ * memory-related calls in a critical path, the resource allocation
+ * above can't be locked. If the memory has been changed at this point,
+ * try again with just a single page. If not, go on with the big chunk
+ * atomically from here.
+ */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ data_re = data;
+ if (len > msl->page_sz &&
+ !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
+ WARN("port %u unable to find virtually contiguous"
+ " chunk for address (%p)."
+ " rte_memseg_contig_walk() failed.", + dev->data->port_id, (void *)addr); + rte_errno = ENXIO; + goto err_memlock; + } + if (data.start != data_re.start || data.end != data_re.end) { + /* + * The extended contiguous chunk has been changed. Try again + * with single memseg instead. + */ + data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); + data.end = data.start + msl->page_sz; + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + mr_free(mr); + goto alloc_resources; } -release: - rte_spinlock_unlock(&priv->mr_lock); - return mr; + assert(data.msl == data_re.msl); + rte_rwlock_write_lock(&priv->mr.rwlock); + /* + * Check the address is really missing. If other thread already created + * one or it is not found due to overflow, abort and return. + */ + if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) { + /* + * Insert to the global cache table. It may fail due to + * low-on-memory. Then, this entry will have to be searched + * here again. + */ + mr_btree_insert(&priv->mr.cache, entry); + DEBUG("port %u found MR for %p on final lookup, abort", + dev->data->port_id, (void *)addr); + rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + /* + * Must be unlocked before calling rte_free() because + * mlx4_mr_mem_event_free_cb() can be called inside. + */ + mr_free(mr); + return entry->lkey; + } + /* + * Trim start and end addresses for verbs MR. Set bits for registering + * memsegs but exclude already registered ones. Bitmap can be + * fragmented. + */ + for (n = 0; n < ms_n; ++n) { + uintptr_t start; + struct mlx4_mr_cache ret = { 0, }; + + start = data_re.start + n * msl->page_sz; + /* Exclude memsegs already registered by other MRs. */ + if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) { + /* + * Start from the first unregistered memseg in the + * extended range. + */ + if (ms_idx_shift == -1) { + mr->ms_base_idx += n; + data.start = start; + ms_idx_shift = n; + } + data.end = start + msl->page_sz; + rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); + ++mr->ms_n; + } + } + len = data.end - data.start; + mr->ms_bmp_n = len / msl->page_sz; + assert(ms_idx_shift + mr->ms_bmp_n <= ms_n); + /* + * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be + * called with holding the memory lock because it doesn't use + * mlx4_alloc_buf_extern() which eventually calls rte_malloc_socket() + * through mlx4_alloc_verbs_buf(). + */ + mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)data.start, len, + IBV_ACCESS_LOCAL_WRITE); + if (mr->ibv_mr == NULL) { + WARN("port %u fail to create a verbs MR for address (%p)", + dev->data->port_id, (void *)addr); + rte_errno = EINVAL; + goto err_mrlock; + } + assert((uintptr_t)mr->ibv_mr->addr == data.start); + assert(mr->ibv_mr->length == len); + LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + DEBUG("port %u MR CREATED (%p) for %p:\n" + " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", + dev->data->port_id, (void *)mr, (void *)addr, + data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); + /* Insert to the global cache table. */ + mr_insert_dev_cache(dev, mr); + /* Fill in output data. */ + mr_lookup_dev(dev, entry, addr); + /* Lookup can't fail. 
*/ + assert(entry->lkey != UINT32_MAX); + rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return entry->lkey; +err_mrlock: + rte_rwlock_write_unlock(&priv->mr.rwlock); +err_memlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +err_nolock: + /* + * In case of error, as this can be called in a datapath, a warning + * message per an error is preferable instead. Must be unlocked before + * calling rte_free() because mlx4_mr_mem_event_free_cb() can be called + * inside. + */ + mr_free(mr); + return UINT32_MAX; } /** - * Release a memory region. + * Rebuild the global B-tree cache of device from the original MR list. * - * This function decrements its reference count and destroys it after - * reaching 0. + * @param dev + * Pointer to Ethernet device. + */ +static void +mr_rebuild_dev_cache(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx4_mr *mr; + + DEBUG("port %u rebuild dev cache[]", dev->data->port_id); + /* Flush cache to rebuild. */ + priv->mr.cache.len = 1; + priv->mr.cache.overflow = 0; + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) + if (mr_insert_dev_cache(dev, mr) < 0) + return; +} + +/** + * Callback for memory free event. Iterate freed memsegs and check whether it + * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a + * result, the MR would be fragmented. If it becomes empty, the MR will be freed + * later by mlx4_mr_garbage_collect(). * - * Note to avoid race conditions given this function may be used from the - * data plane, it's extremely important that each user holds its own - * reference. + * The global cache must be rebuilt if there's any change and this event has to + * be propagated to dataplane threads to flush the local caches. * - * @param mr - * Memory region to release. + * @param dev + * Pointer to Ethernet device. + * @param addr + * Address of freed memory. + * @param len + * Size of freed memory. + */ +static void +mlx4_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len) +{ + struct priv *priv = dev->data->dev_private; + const struct rte_memseg_list *msl; + struct mlx4_mr *mr; + int ms_n; + int i; + int rebuild = 0; + + DEBUG("port %u free callback: addr=%p, len=%zu", + dev->data->port_id, addr, len); + msl = rte_mem_virt2memseg_list(addr); + /* addr and len must be page-aligned. */ + assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz)); + assert(len == RTE_ALIGN(len, msl->page_sz)); + ms_n = len / msl->page_sz; + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Clear bits of freed memsegs from MR. */ + for (i = 0; i < ms_n; ++i) { + const struct rte_memseg *ms; + struct mlx4_mr_cache entry; + uintptr_t start; + int ms_idx; + uint32_t pos; + + /* Find MR having this memseg. 
*/ + start = (uintptr_t)addr + i * msl->page_sz; + mr = mr_lookup_dev_list(dev, &entry, start); + if (mr == NULL) + continue; + ms = rte_mem_virt2memseg((void *)start, msl); + assert(ms != NULL); + assert(msl->page_sz == ms->hugepage_sz); + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + pos = ms_idx - mr->ms_base_idx; + assert(rte_bitmap_get(mr->ms_bmp, pos)); + assert(pos < mr->ms_bmp_n); + DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p", + dev->data->port_id, (void *)mr, pos, (void *)start); + rte_bitmap_clear(mr->ms_bmp, pos); + if (--mr->ms_n == 0) { + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + DEBUG("port %u remove MR(%p) from list", + dev->data->port_id, (void *)mr); + } + /* + * MR is fragmented or will be freed. the global cache must be + * rebuilt. + */ + rebuild = 1; + } + if (rebuild) { + mr_rebuild_dev_cache(dev); + /* + * Flush local caches by propagating invalidation across cores. + * rte_smp_wmb() is enough to synchronize this event. If one of + * freed memsegs is seen by other core, that means the memseg + * has been allocated by allocator, which will come after this + * free call. Therefore, this store instruction (incrementing + * generation below) will be guaranteed to be seen by other core + * before the core sees the newly allocated memory. + */ + ++priv->mr.dev_gen; + DEBUG("broadcasting local cache flush, gen=%d", + priv->mr.dev_gen); + rte_smp_wmb(); + } + rte_rwlock_write_unlock(&priv->mr.rwlock); +#ifndef NDEBUG + if (rebuild) + mlx4_mr_dump_dev(dev); +#endif +} + +/** + * Callback for memory event. + * + * @param event_type + * Memory event type. + * @param addr + * Address of memory. + * @param len + * Size of memory. */ void -mlx4_mr_put(struct mlx4_mr *mr) +mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg __rte_unused) { - struct priv *priv = mr->priv; - - rte_spinlock_lock(&priv->mr_lock); - assert(mr->refcnt); - if (--mr->refcnt) - goto release; - LIST_REMOVE(mr, next); - claim_zero(mlx4_glue->dereg_mr(mr->mr)); - rte_free(mr); -release: - rte_spinlock_unlock(&priv->mr_lock); + struct priv *priv; + + switch (event_type) { + case RTE_MEM_EVENT_FREE: + rte_rwlock_read_lock(&mlx4_mem_event_rwlock); + /* Iterate all the existing mlx4 devices. */ + LIST_FOREACH(priv, &mlx4_mem_event_cb_list, mem_event_cb) + mlx4_mr_mem_event_free_cb(priv->dev, addr, len); + rte_rwlock_read_unlock(&mlx4_mem_event_rwlock); + break; + case RTE_MEM_EVENT_ALLOC: + default: + break; + } } /** - * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[]. - * If mp2mr[] is full, remove an entry first. + * Look up address in the global MR cache table. If not found, create a new MR. + * Insert the found/created entry to local bottom-half cache table. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this is not written. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mlx4_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + struct mlx4_mr_cache *entry, uintptr_t addr) +{ + struct priv *priv = dev->data->dev_private; + struct mlx4_mr_btree *bt = &mr_ctrl->cache_bh; + uint16_t idx; + uint32_t lkey; + + /* If local cache table is full, try to double it. 
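The consumer side of this generation number lives in the datapath inline helpers added to mlx4_rxtx.h further down in this patch. A minimal sketch of the Tx-side check, assuming the mlx4_mr_ctrl fields declared below in mlx4_mr.h:

static __rte_always_inline uint32_t
mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
{
    struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
    uint32_t lkey;

    /* A free event bumped dev_gen: drop stale per-queue entries first. */
    if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
        mlx4_mr_flush_local_cache(mr_ctrl);
    /* Linear search on the top-half (MRU) cache array. */
    lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru,
                                MLX4_MR_CACHE_N, addr);
    if (likely(lkey != UINT32_MAX))
        return lkey;
    /* Fall back to the bottom-half: B-tree, then the global cache. */
    return mlx4_tx_addr2mr_bh(txq, addr);
}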
*/ + if (unlikely(bt->len == bt->size)) + mr_btree_expand(bt, bt->size << 1); + /* Look up in the global cache. */ + rte_rwlock_read_lock(&priv->mr.rwlock); + lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (lkey != UINT32_MAX) { + /* Found. */ + *entry = (*priv->mr.cache.table)[idx]; + rte_rwlock_read_unlock(&priv->mr.rwlock); + /* + * Update local cache. Even if it fails, return the found entry + * to update top-half cache. Next time, this entry will be found + * in the global cache. + */ + mr_btree_insert(bt, entry); + return lkey; + } + rte_rwlock_read_unlock(&priv->mr.rwlock); + /* First time to see the address? Create a new MR. */ + lkey = mlx4_mr_create(dev, entry, addr); + /* + * Update the local cache if successfully created a new global MR. Even + * if failed to create one, there's no action to take in this datapath + * code. As returning LKey is invalid, this will eventually make HW + * fail. + */ + if (lkey != UINT32_MAX) + mr_btree_insert(bt, entry); + return lkey; +} + +/** + * Bottom-half of LKey search on datapath. Firstly search in cache_bh[] and if + * misses, search in the global MR cache table and update the new entry to + * per-queue local caches. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mlx4_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + uintptr_t addr) +{ + uint32_t lkey; + uint16_t bh_idx = 0; + /* Victim in top-half cache to replace with new entry. */ + struct mlx4_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head]; + + /* Binary-search MR translation table. */ + lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); + /* Update top-half cache. */ + if (likely(lkey != UINT32_MAX)) { + *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; + } else { + /* + * If missed in local lookup table, search in the global cache + * and local cache_bh[] will be updated inside if possible. + * Top-half cache entry will also be updated. + */ + lkey = mlx4_mr_lookup_dev(dev, mr_ctrl, repl, addr); + if (unlikely(lkey == UINT32_MAX)) + return UINT32_MAX; + } + /* Update the most recently used entry. */ + mr_ctrl->mru = mr_ctrl->head; + /* Point to the next victim, the oldest. */ + mr_ctrl->head = (mr_ctrl->head + 1) % MLX4_MR_CACHE_N; + return lkey; +} + +/** + * Bottom-half of LKey search on Rx. + * + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr) +{ + struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + struct priv *priv = rxq->priv; + + DEBUG("Rx queue %u: miss on top-half, mru=%u, head=%u, addr=%p", + rxq->stats.idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr); + return mlx4_mr_addr2mr_bh(priv->dev, mr_ctrl, addr); +} + +/** + * Bottom-half of LKey search on Tx. * * @param txq * Pointer to Tx queue structure. - * @param[in] mp - * Memory pool for which a memory region lkey must be added. - * @param[in] i - * Index in memory pool (MP) where to add memory region (MR). + * @param addr + * Search key. * * @return - * Added mr->lkey on success, (uint32_t)-1 on failure. + * Searched LKey on success, UINT32_MAX on no match. 
*/ uint32_t -mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i) +mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr) { + struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct priv *priv = txq->priv; + + DEBUG("Tx queue %u: miss on top-half, mru=%u, head=%u, addr=%p", + txq->stats.idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr); + return mlx4_mr_addr2mr_bh(priv->dev, mr_ctrl, addr); +} + +/** + * Flush all of the local cache entries. + * + * @param mr_ctrl + * Pointer to per-queue MR control structure. + */ +void +mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl) +{ + /* Reset the most-recently-used index. */ + mr_ctrl->mru = 0; + /* Reset the linear search array. */ + mr_ctrl->head = 0; + memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); + /* Reset the B-tree table. */ + mr_ctrl->cache_bh.len = 1; + mr_ctrl->cache_bh.overflow = 0; + /* Update the generation number. */ + mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr; + DEBUG("mr_ctrl(%p): flushed, cur_gen=%d", + (void *)mr_ctrl, mr_ctrl->cur_gen); +} + +/* Called during rte_mempool_mem_iter() by mlx4_mr_update_mp(). */ +static void +mlx4_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + uint32_t lkey; + + /* Stop iteration if failed in the previous walk. */ + if (data->ret < 0) + return; + /* Register address of the chunk and update local caches. */ + lkey = mlx4_mr_addr2mr_bh(data->dev, data->mr_ctrl, + (uintptr_t)memhdr->addr); + if (lkey == UINT32_MAX) + data->ret = -1; +} + +/** + * Register entire memory chunks in a Mempool. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. + */ +int +mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) +{ + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx4_mr_update_mp_cb, &data); + return data.ret; +} + +#ifndef NDEBUG +/** + * Dump all the created MRs and the global cache entries. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx4_mr_dump_dev(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; struct mlx4_mr *mr; + int mr_n = 0; + int chunk_n = 0; + + rte_rwlock_read_lock(&priv->mr.rwlock); + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + unsigned int n; + + DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", + dev->data->port_id, mr_n++, + rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_n, mr->ms_bmp_n); + if (mr->ms_n == 0) + continue; + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx4_mr_cache ret = { 0, }; - /* Add a new entry, register MR first. */ - DEBUG("%p: discovered new memory pool \"%s\" (%p)", - (void *)txq, mp->name, (void *)mp); - mr = mlx4_mr_get(txq->priv, mp); - if (unlikely(mr == NULL)) { - DEBUG("%p: unable to configure MR, mlx4_mr_get() failed", - (void *)txq); - return (uint32_t)-1; + n = mr_find_next_chunk(mr, &ret, n); + if (!ret.end) + break; + DEBUG(" chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", + chunk_n++, ret.start, ret.end); + } } - if (unlikely(i == RTE_DIM(txq->mp2mr))) { - /* Table is full, remove oldest entry. 
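As a usage note, the typical call site for mlx4_mr_update_mp() is queue setup, as done for Rx by mlx4_rxq_attach() later in this patch; a minimal sketch with a hypothetical helper name:

static int
rxq_register_mp(struct rte_eth_dev *dev, struct rxq *rxq)
{
    /*
     * Pre-register every chunk of the Rx mempool up front so the
     * datapath never takes the MR-creation slow path for these mbufs.
     * On failure, rte_errno is expected to be set by the failing
     * bottom-half lookup.
     */
    if (mlx4_mr_update_mp(dev, &rxq->mr_ctrl, rxq->mp) < 0)
        return -1;
    return 0;
}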
*/ - DEBUG("%p: MR <-> MP table full, dropping oldest entry.", - (void *)txq); - --i; - mlx4_mr_put(txq->mp2mr[0].mr); - memmove(&txq->mp2mr[0], &txq->mp2mr[1], - (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0]))); + DEBUG("port %u dumping global cache", dev->data->port_id); + mlx4_mr_btree_dump(&priv->mr.cache); + rte_rwlock_read_unlock(&priv->mr.rwlock); +} +#endif + +/** + * Release all the created MRs and resources. Remove device from memory callback + * list. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx4_mr_release(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx4_mr *mr_next = LIST_FIRST(&priv->mr.mr_list); + + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx4_mem_event_rwlock); + LIST_REMOVE(priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx4_mem_event_rwlock); +#ifndef NDEBUG + mlx4_mr_dump_dev(dev); +#endif + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Detach from MR list and move to free list. */ + while (mr_next != NULL) { + struct mlx4_mr *mr = mr_next; + + mr_next = LIST_NEXT(mr, mr); + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); } - /* Store the new entry. */ - txq->mp2mr[i].mp = mp; - txq->mp2mr[i].mr = mr; - txq->mp2mr[i].lkey = mr->lkey; - DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, - (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey); - return txq->mp2mr[i].lkey; + LIST_INIT(&priv->mr.mr_list); + /* Free global cache. */ + mlx4_mr_btree_free(&priv->mr.cache); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Free all remaining MRs. */ + mlx4_mr_garbage_collect(dev); } diff --git a/drivers/net/mlx4/mlx4_mr.h b/drivers/net/mlx4/mlx4_mr.h new file mode 100644 index 00000000..37a365a8 --- /dev/null +++ b/drivers/net/mlx4/mlx4_mr.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX4_MR_H_ +#define RTE_PMD_MLX4_MR_H_ + +#include +#include +#include + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include +#include +#include +#include + +/* Size of per-queue MR cache array for linear search. */ +#define MLX4_MR_CACHE_N 8 + +/* Size of MR cache table for binary search. */ +#define MLX4_MR_BTREE_CACHE_N 256 + +/* Memory Region object. */ +struct mlx4_mr { + LIST_ENTRY(mlx4_mr) mr; /**< Pointer to the prev/next entry. */ + struct ibv_mr *ibv_mr; /* Verbs Memory Region. */ + const struct rte_memseg_list *msl; + int ms_base_idx; /* Start index of msl->memseg_arr[]. */ + int ms_n; /* Number of memsegs in use. */ + uint32_t ms_bmp_n; /* Number of bits in memsegs bit-mask. */ + struct rte_bitmap *ms_bmp; /* Bit-mask of memsegs belonged to MR. */ +}; + +/* Cache entry for Memory Region. */ +struct mlx4_mr_cache { + uintptr_t start; /* Start address of MR. */ + uintptr_t end; /* End address of MR. */ + uint32_t lkey; /* rte_cpu_to_be_32(ibv_mr->lkey). */ +} __rte_packed; + +/* MR Cache table for Binary search. */ +struct mlx4_mr_btree { + uint16_t len; /* Number of entries. */ + uint16_t size; /* Total number of entries. */ + int overflow; /* Mark failure of table expansion. */ + struct mlx4_mr_cache (*table)[]; +} __rte_packed; + +/* Per-queue MR control descriptor. */ +struct mlx4_mr_ctrl { + uint32_t *dev_gen_ptr; /* Generation number of device to poll. 
*/ + uint32_t cur_gen; /* Generation number saved to flush caches. */ + uint16_t mru; /* Index of last hit entry in top-half cache. */ + uint16_t head; /* Index of the oldest entry in top-half cache. */ + struct mlx4_mr_cache cache[MLX4_MR_CACHE_N]; /* Cache for top-half. */ + struct mlx4_mr_btree cache_bh; /* Cache for bottom-half. */ +} __rte_packed; + +extern struct mlx4_dev_list mlx4_mem_event_cb_list; +extern rte_rwlock_t mlx4_mem_event_rwlock; + +/* First entry must be NULL for comparison. */ +#define mlx4_mr_btree_len(bt) ((bt)->len - 1) + +int mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket); +void mlx4_mr_btree_free(struct mlx4_mr_btree *bt); +void mlx4_mr_btree_dump(struct mlx4_mr_btree *bt); +void mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg); +int mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, + struct rte_mempool *mp); +void mlx4_mr_dump_dev(struct rte_eth_dev *dev); +void mlx4_mr_release(struct rte_eth_dev *dev); + +/** + * Look up LKey from given lookup table by linear search. Firstly look up the + * last-hit entry. If miss, the entire array is searched. If found, update the + * last-hit index and return LKey. + * + * @param lkp_tbl + * Pointer to lookup table. + * @param[in,out] cached_idx + * Pointer to last-hit index. + * @param n + * Size of lookup table. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx4_mr_lookup_cache(struct mlx4_mr_cache *lkp_tbl, uint16_t *cached_idx, + uint16_t n, uintptr_t addr) +{ + uint16_t idx; + + if (likely(addr >= lkp_tbl[*cached_idx].start && + addr < lkp_tbl[*cached_idx].end)) + return lkp_tbl[*cached_idx].lkey; + for (idx = 0; idx < n && lkp_tbl[idx].start != 0; ++idx) { + if (addr >= lkp_tbl[idx].start && + addr < lkp_tbl[idx].end) { + /* Found. */ + *cached_idx = idx; + return lkp_tbl[idx].lkey; + } + } + return UINT32_MAX; +} + +#endif /* RTE_PMD_MLX4_MR_H_ */ diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h index 153dda52..aef77ba0 100644 --- a/drivers/net/mlx4/mlx4_prm.h +++ b/drivers/net/mlx4/mlx4_prm.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox + * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef MLX4_PRM_H_ @@ -19,6 +19,7 @@ #ifdef PEDANTIC #pragma GCC diagnostic error "-Wpedantic" #endif +#include "mlx4_autoconf.h" /* ConnectX-3 Tx queue basic block. */ #define MLX4_TXBB_SHIFT 6 @@ -40,6 +41,7 @@ /* Work queue element (WQE) flags. */ #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28) #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27) +#define MLX4_WQE_CTRL_RR (1 << 6) /* CQE checksum flags. */ enum { @@ -51,6 +53,7 @@ enum { }; /* CQE status flags. */ +#define MLX4_CQE_STATUS_IPV6F (1 << 12) #define MLX4_CQE_STATUS_IPV4 (1 << 22) #define MLX4_CQE_STATUS_IPV4F (1 << 23) #define MLX4_CQE_STATUS_IPV6 (1 << 24) @@ -97,6 +100,19 @@ struct mlx4_cq { int arm_sn; /**< Rx event counter. */ }; +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG +/* + * WQE LSO segment structure. + * Defined here as backward compatibility for rdma-core v17 and below. + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18 + * and above. + */ +struct mlx4_wqe_lso_seg { + rte_be32_t mss_hdr_size; + rte_be32_t header[]; +}; +#endif + /** * Retrieve a CQE entry from a CQ. 
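To illustrate how the mlx4_wqe_lso_seg defined above is populated, here is a sketch mirroring the packing done by mlx4_tx_burst_fill_tso_hdr() later in this patch (helper name hypothetical):

static inline void
lso_seg_fill(volatile struct mlx4_wqe_lso_seg *lso,
             uint16_t tso_segsz, uint16_t hdr_size)
{
    /*
     * MSS in the upper 16 bits, TSO header size in the lower 16 bits,
     * stored big-endian as the hardware expects.
     */
    lso->mss_hdr_size = rte_cpu_to_be_32(((uint32_t)tso_segsz << 16) |
                                         hdr_size);
}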
* diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c index 7a036ed8..9737da2e 100644 --- a/drivers/net/mlx4/mlx4_rxq.c +++ b/drivers/net/mlx4/mlx4_rxq.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox + * Copyright 2017 Mellanox Technologies, Ltd */ /** @@ -88,7 +88,7 @@ mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = { */ struct mlx4_rss * mlx4_rss_get(struct priv *priv, uint64_t fields, - uint8_t key[MLX4_RSS_HASH_KEY_SIZE], + const uint8_t key[MLX4_RSS_HASH_KEY_SIZE], uint16_t queues, const uint16_t queue_id[]) { struct mlx4_rss *rss; @@ -336,6 +336,14 @@ mlx4_rss_init(struct priv *priv) unsigned int i; int ret; + if (priv->rss_init) + return 0; + if (priv->dev->data->nb_rx_queues > priv->hw_rss_max_qps) { + ERROR("RSS does not support more than %d queues", + priv->hw_rss_max_qps); + rte_errno = EINVAL; + return -rte_errno; + } /* Prepare range for RSS contexts before creating the first WQ. */ ret = mlx4_glue->dv_set_context_attr (priv->ctx, @@ -418,6 +426,7 @@ wq_num_check: } wq_num_prev = wq_num; } + priv->rss_init = 1; return 0; error: ERROR("cannot initialize common RSS resources (queue %u): %s: %s", @@ -446,6 +455,8 @@ mlx4_rss_deinit(struct priv *priv) { unsigned int i; + if (!priv->rss_init) + return; for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) { struct rxq *rxq = priv->dev->data->rx_queues[i]; @@ -454,6 +465,7 @@ mlx4_rss_deinit(struct priv *priv) mlx4_rxq_detach(rxq); } } + priv->rss_init = 0; } /** @@ -482,6 +494,7 @@ mlx4_rxq_attach(struct rxq *rxq) } struct priv *priv = rxq->priv; + struct rte_eth_dev *dev = priv->dev; const uint32_t elts_n = 1 << rxq->elts_n; const uint32_t sges_n = 1 << rxq->sges_n; struct rte_mbuf *(*elts)[elts_n] = rxq->elts; @@ -491,6 +504,8 @@ mlx4_rxq_attach(struct rxq *rxq) const char *msg; struct ibv_cq *cq = NULL; struct ibv_wq *wq = NULL; + uint32_t create_flags = 0; + uint32_t comp_mask = 0; volatile struct mlx4_wqe_data_seg (*wqes)[]; unsigned int i; int ret; @@ -503,6 +518,11 @@ mlx4_rxq_attach(struct rxq *rxq) msg = "CQ creation failure"; goto error; } + /* By default, FCS (CRC) is stripped by hardware. */ + if (rxq->crc_present) { + create_flags |= IBV_WQ_FLAGS_SCATTER_FCS; + comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + } wq = mlx4_glue->create_wq (priv->ctx, &(struct ibv_wq_init_attr){ @@ -511,6 +531,8 @@ mlx4_rxq_attach(struct rxq *rxq) .max_sge = sges_n, .pd = priv->pd, .cq = cq, + .comp_mask = comp_mask, + .create_flags = create_flags, }); if (!wq) { ret = errno ? errno : EINVAL; @@ -537,6 +559,11 @@ mlx4_rxq_attach(struct rxq *rxq) msg = "failed to obtain device information from WQ/CQ objects"; goto error; } + /* Pre-register Rx mempool. 
*/ + DEBUG("port %u Rx queue %u registering mp %s having %u chunks", + priv->dev->data->port_id, rxq->stats.idx, + rxq->mp->name, rxq->mp->nb_mem_chunks); + mlx4_mr_update_mp(dev, &rxq->mr_ctrl, rxq->mp); wqes = (volatile struct mlx4_wqe_data_seg (*)[]) ((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset); for (i = 0; i != RTE_DIM(*elts); ++i) { @@ -568,7 +595,7 @@ mlx4_rxq_attach(struct rxq *rxq) .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)), .byte_count = rte_cpu_to_be_32(buf->data_len), - .lkey = rte_cpu_to_be_32(rxq->mr->lkey), + .lkey = mlx4_rx_mb2mr(rxq, buf), }; (*elts)[i] = buf; } @@ -597,6 +624,7 @@ error: claim_zero(mlx4_glue->destroy_wq(wq)); if (cq) claim_zero(mlx4_glue->destroy_cq(cq)); + --rxq->usecnt; rte_errno = ret; ERROR("error while attaching Rx queue %p: %s: %s", (void *)rxq, msg, strerror(ret)); @@ -650,7 +678,9 @@ uint64_t mlx4_get_rx_queue_offloads(struct priv *priv) { uint64_t offloads = DEV_RX_OFFLOAD_SCATTER | - DEV_RX_OFFLOAD_CRC_STRIP; + DEV_RX_OFFLOAD_CRC_STRIP | + DEV_RX_OFFLOAD_KEEP_CRC | + DEV_RX_OFFLOAD_JUMBO_FRAME; if (priv->hw_csum) offloads |= DEV_RX_OFFLOAD_CHECKSUM; @@ -675,26 +705,6 @@ mlx4_get_rx_port_offloads(struct priv *priv) return offloads; } -/** - * Checks if the per-queue offload configuration is valid. - * - * @param priv - * Pointer to private structure. - * @param requested - * Per-queue offloads configuration. - * - * @return - * Nonzero when configuration is valid. - */ -static int -mlx4_check_rx_queue_offloads(struct priv *priv, uint64_t requested) -{ - uint64_t mandatory = priv->dev->data->dev_conf.rxmode.offloads; - uint64_t supported = mlx4_get_rx_port_offloads(priv); - - return !((mandatory ^ requested) & supported); -} - /** * DPDK callback to configure a Rx queue. * @@ -736,20 +746,14 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, }, }; int ret; + uint32_t crc_present; + uint64_t offloads; + + offloads = conf->offloads | dev->data->dev_conf.rxmode.offloads; - (void)conf; /* Thresholds configuration (ignored). */ DEBUG("%p: configuring queue %u for %u descriptors", (void *)dev, idx, desc); - if (!mlx4_check_rx_queue_offloads(priv, conf->offloads)) { - rte_errno = ENOTSUP; - ERROR("%p: Rx queue offloads 0x%" PRIx64 " don't match port " - "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64, - (void *)dev, conf->offloads, - dev->data->dev_conf.rxmode.offloads, - (mlx4_get_rx_port_offloads(priv) | - mlx4_get_rx_queue_offloads(priv))); - return -rte_errno; - } + if (idx >= dev->data->nb_rx_queues) { rte_errno = EOVERFLOW; ERROR("%p: queue index out of range (%u >= %u)", @@ -774,6 +778,23 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, " to the next power of two (%u)", (void *)dev, idx, desc); } + /* By default, FCS (CRC) is stripped by hardware. */ + crc_present = 0; + if (rte_eth_dev_must_keep_crc(offloads)) { + if (priv->hw_fcs_strip) { + crc_present = 1; + } else { + WARN("%p: CRC stripping has been disabled but will still" + " be performed by hardware, make sure MLNX_OFED and" + " firmware are up to date", + (void *)dev); + } + } + DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from" + " incoming frames to hide it", + (void *)dev, + crc_present ? "disabled" : "enabled", + crc_present << 2); /* Allocate and initialize Rx queue. 
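For reference, an application opts into the new DEV_RX_OFFLOAD_KEEP_CRC behavior through the port configuration, which is what makes rte_eth_dev_must_keep_crc() evaluate true above; a minimal sketch (queue counts are illustrative):

#include <rte_ethdev.h>

static int
configure_port_keep_crc(uint16_t port_id)
{
    struct rte_eth_conf conf = {
        .rxmode = {
            /* Ask the PMD to deliver frames with the FCS kept. */
            .offloads = DEV_RX_OFFLOAD_KEEP_CRC,
        },
    };

    /* One Rx and one Tx queue are assumed for brevity. */
    return rte_eth_dev_configure(port_id, 1, 1, &conf);
}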
*/ mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket); if (!rxq) { @@ -790,9 +811,10 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, .elts = elts, /* Toggle Rx checksum offload if hardware supports it. */ .csum = priv->hw_csum && - (conf->offloads & DEV_RX_OFFLOAD_CHECKSUM), + (offloads & DEV_RX_OFFLOAD_CHECKSUM), .csum_l2tun = priv->hw_csum_l2tun && - (conf->offloads & DEV_RX_OFFLOAD_CHECKSUM), + (offloads & DEV_RX_OFFLOAD_CHECKSUM), + .crc_present = crc_present, .l2tun_offload = priv->hw_csum_l2tun, .stats = { .idx = idx, @@ -804,7 +826,7 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= (mb_len - RTE_PKTMBUF_HEADROOM)) { ; - } else if (conf->offloads & DEV_RX_OFFLOAD_SCATTER) { + } else if (offloads & DEV_RX_OFFLOAD_SCATTER) { uint32_t size = RTE_PKTMBUF_HEADROOM + dev->data->dev_conf.rxmode.max_rx_pkt_len; @@ -847,11 +869,9 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, 1 << rxq->sges_n); goto error; } - /* Use the entire Rx mempool as the memory region. */ - rxq->mr = mlx4_mr_get(priv, mp); - if (!rxq->mr) { - ERROR("%p: MR creation failure: %s", - (void *)dev, strerror(rte_errno)); + if (mlx4_mr_btree_init(&rxq->mr_ctrl.cache_bh, + MLX4_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ goto error; } if (dev->data->dev_conf.intr_conf.rxq) { @@ -911,7 +931,6 @@ mlx4_rx_queue_release(void *dpdk_rxq) assert(!rxq->rq_db); if (rxq->channel) claim_zero(mlx4_glue->destroy_comp_channel(rxq->channel)); - if (rxq->mr) - mlx4_mr_put(rxq->mr); + mlx4_mr_btree_free(&rxq->mr_ctrl.cache_bh); rte_free(rxq); } diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c index 8ca8b77c..8c88effc 100644 --- a/drivers/net/mlx4/mlx4_rxtx.c +++ b/drivers/net/mlx4/mlx4_rxtx.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox + * Copyright 2017 Mellanox Technologies, Ltd */ /** @@ -38,10 +38,29 @@ * DWORD (32 byte) of a TXBB. */ struct pv { - volatile struct mlx4_wqe_data_seg *dseg; + union { + volatile struct mlx4_wqe_data_seg *dseg; + volatile uint32_t *dst; + }; uint32_t val; }; +/** A helper structure for TSO packet handling. */ +struct tso_info { + /** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */ + struct pv *pv; + /** Current entry in the pv array. */ + int pv_counter; + /** Total size of the WQE including padding. */ + uint32_t wqe_size; + /** Size of TSO header to prepend to each packet to send. */ + uint16_t tso_header_size; + /** Total size of the TSO segment in the WQE. */ + uint16_t wqe_tso_seg_size; + /** Raw WQE size in units of 16 Bytes and without padding. */ + uint8_t fence_size; +}; + /** A table to translate Rx completion flags to packet type. */ uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = { /* @@ -52,49 +71,58 @@ uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = { * bit[4] - MLX4_CQE_STATUS_TCP * bit[3] - MLX4_CQE_STATUS_IPV4OPT * bit[2] - MLX4_CQE_STATUS_IPV6 - * bit[1] - MLX4_CQE_STATUS_IPV4F + * bit[1] - MLX4_CQE_STATUS_IPF * bit[0] - MLX4_CQE_STATUS_IPV4 * giving a total of up to 256 entries. 
*/ + /* L2 */ [0x00] = RTE_PTYPE_L2_ETHER, + /* L3 */ [0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_NONFRAG, [0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG, [0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_FRAG, - [0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, - [0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT, + [0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG, + [0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, + [0x08] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_NONFRAG, + [0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_NONFRAG, [0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_FRAG, + [0x0b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_FRAG, + /* TCP */ [0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP, - [0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_L4_TCP, [0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP, + [0x16] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, [0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_TCP, [0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_TCP, - [0x1a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | - RTE_PTYPE_L4_TCP, + /* UDP */ [0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP, - [0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_L4_UDP, [0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP, + [0x26] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, [0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_UDP, [0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_UDP, - [0x2a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | - RTE_PTYPE_L4_UDP, /* Tunneled - L3 IPV6 */ [0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, [0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG, [0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_FRAG, @@ -102,65 +130,58 @@ uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = { RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_FRAG, [0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG, + [0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, [0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT, + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_NONFRAG, [0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT, + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_NONFRAG, [0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG, + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_FRAG, + [0x8b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_FRAG, /* Tunneled - L3 IPV6, TCP */ [0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | 
RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP, - [0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_TCP, - [0x93] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_TCP, [0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP, + [0x96] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, [0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | - RTE_PTYPE_INNER_L4_TCP, + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP, [0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | - RTE_PTYPE_INNER_L4_TCP, - [0x9a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_TCP, + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP, /* Tunneled - L3 IPV6, UDP */ - [0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_UDP, - [0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + [0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_FRAG | RTE_PTYPE_INNER_L4_UDP, - [0xa3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_UDP, - [0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + [0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_UDP, - [0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + [0xa6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP, - [0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + [0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP, - [0xaa] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_UDP, /* Tunneled - L3 IPV4 */ [0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, [0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG, [0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_FRAG, @@ -168,65 +189,54 @@ uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = { RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_FRAG, [0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG, + [0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, [0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT, + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_NONFRAG, [0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - 
RTE_PTYPE_INNER_L3_IPV4_EXT, + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_NONFRAG, [0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG, + [0xcb] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_FRAG, /* Tunneled - L3 IPV4, TCP */ - [0xd0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_TCP, [0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP, - [0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_TCP, - [0xd3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_TCP, [0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP, + [0xd6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, [0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP, [0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP, - [0xda] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_TCP, /* Tunneled - L3 IPV4, UDP */ - [0xe0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_UDP, [0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_UDP, - [0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_UDP, - [0xe3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L4_FRAG | - RTE_PTYPE_INNER_L4_UDP, [0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_UDP, + [0xe6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, [0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP, + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_UDP, [0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP, - [0xea] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | - RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP, }; @@ -263,7 +273,7 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, volatile uint32_t *start, } while (start != (volatile uint32_t *)sq->eob); start = (volatile uint32_t *)sq->buf; /* Flip invalid stamping ownership. */ - stamp ^= RTE_BE32(0x1 << MLX4_SQ_OWNER_BIT); + stamp ^= RTE_BE32(1u << MLX4_SQ_OWNER_BIT); sq->stamp = stamp; if (start == end) return size; @@ -343,24 +353,6 @@ mlx4_txq_complete(struct txq *txq, const unsigned int elts_m, txq->elts_tail = elts_tail; } -/** - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which - * the cloned mbuf is allocated is returned instead. - * - * @param buf - * Pointer to mbuf. 
- *
- * @return
- * Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-mlx4_txq_mb2mp(struct rte_mbuf *buf)
-{
- if (unlikely(RTE_MBUF_INDIRECT(buf)))
- return rte_mbuf_from_indirect(buf)->pool;
- return buf->pool;
-}
-
 /**
  * Write Tx data segment to the SQ.
  *
@@ -378,7 +370,7 @@ mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
 uint32_t lkey, uintptr_t addr, rte_be32_t byte_count)
 {
 dseg->addr = rte_cpu_to_be_64(addr);
- dseg->lkey = rte_cpu_to_be_32(lkey);
+ dseg->lkey = lkey;
 #if RTE_CACHE_LINE_SIZE < 64
 /*
 * Need a barrier here before writing the byte_count
@@ -394,6 +386,342 @@ mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
 dseg->byte_count = byte_count;
 }
 
+/**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ * Pointer to the first packet mbuf.
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param tinfo
+ * Pointer to a structure to fill the info with.
+ *
+ * @return
+ * 0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+ struct txq *txq,
+ struct tso_info *tinfo)
+{
+ struct mlx4_sq *sq = &txq->msq;
+ const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+ (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+ tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
+ if (tunneled)
+ tinfo->tso_header_size +=
+ buf->outer_l2_len + buf->outer_l3_len;
+ if (unlikely(buf->tso_segsz == 0 ||
+ tinfo->tso_header_size == 0 ||
+ tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
+ tinfo->tso_header_size > buf->data_len))
+ return -EINVAL;
+ /*
+ * Calculate the WQE TSO segment size
+ * Note:
+ * 1. An LSO segment must be padded such that the subsequent data
+ * segment is 16-byte aligned.
+ * 2. The start address of the TSO segment is always 16-byte aligned.
+ */
+ tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+ tinfo->tso_header_size,
+ sizeof(struct mlx4_wqe_data_seg));
+ tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+ tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+ buf->nb_segs;
+ tinfo->wqe_size =
+ RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+ MLX4_TXBB_SIZE);
+ /* Validate WQE size and WQE space in the send queue. */
+ if (sq->remain_size < tinfo->wqe_size ||
+ tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+ return -ENOMEM;
+ /* Init pv. */
+ tinfo->pv = (struct pv *)txq->bounce_buf;
+ tinfo->pv_counter = 0;
+ return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit.
+ *
+ * @param buf
+ * Pointer to the first packet mbuf.
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param tinfo
+ * Pointer to TSO info to use.
+ * @param dseg
+ * Pointer to the first data segment in the TSO WQE.
+ * @param ctrl
+ * Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ * Pointer to the next WQE control segment on success, NULL otherwise.
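A worked example of the sizing above, with illustrative values: a plain TCPv4 packet, l2_len = 14, l3_len = 20, l4_len = 20, nb_segs = 1, and the mlx4 constants (16-byte control segment, 4-byte LSO descriptor, MLX4_SEG_SHIFT == 4, MLX4_TXBB_SIZE == 64):

/* tso_header_size  = 14 + 20 + 20 = 54 bytes. */
/* wqe_tso_seg_size = RTE_ALIGN(4 + 54, 16) = 64 bytes. */
/* fence_size       = ((16 + 64) >> 4) + 1 = 6 units of 16 bytes. */
/* wqe_size         = RTE_ALIGN(6 << 4, 64) = RTE_ALIGN(96, 64) = 128. */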
+ */ +static inline volatile struct mlx4_wqe_ctrl_seg * +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf, + struct txq *txq, + struct tso_info *tinfo, + volatile struct mlx4_wqe_data_seg *dseg, + volatile struct mlx4_wqe_ctrl_seg *ctrl) +{ + uint32_t lkey; + int nb_segs = buf->nb_segs; + int nb_segs_txbb; + struct mlx4_sq *sq = &txq->msq; + struct rte_mbuf *sbuf = buf; + struct pv *pv = tinfo->pv; + int *pv_counter = &tinfo->pv_counter; + volatile struct mlx4_wqe_ctrl_seg *ctrl_next = + (volatile struct mlx4_wqe_ctrl_seg *) + ((volatile uint8_t *)ctrl + tinfo->wqe_size); + uint16_t data_len = sbuf->data_len - tinfo->tso_header_size; + uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t, + tinfo->tso_header_size); + + do { + /* how many dseg entries do we have in the current TXBB ? */ + nb_segs_txbb = (MLX4_TXBB_SIZE - + ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >> + MLX4_SEG_SHIFT; + switch (nb_segs_txbb) { +#ifndef NDEBUG + default: + /* Should never happen. */ + rte_panic("%p: Invalid number of SGEs(%d) for a TXBB", + (void *)txq, nb_segs_txbb); + /* rte_panic never returns. */ + break; +#endif /* NDEBUG */ + case 4: + /* Memory region key for this memory pool. */ + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) + goto err; + dseg->addr = rte_cpu_to_be_64(data_addr); + dseg->lkey = lkey; + /* + * This data segment starts at the beginning of a new + * TXBB, so we need to postpone its byte_count writing + * for later. + */ + pv[*pv_counter].dseg = dseg; + /* + * Zero length segment is treated as inline segment + * with zero data. + */ + pv[(*pv_counter)++].val = + rte_cpu_to_be_32(data_len ? + data_len : + 0x80000000); + if (--nb_segs == 0) + return ctrl_next; + /* Prepare next buf info */ + sbuf = sbuf->next; + dseg++; + data_len = sbuf->data_len; + data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + /* fallthrough */ + case 3: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) + goto err; + mlx4_fill_tx_data_seg(dseg, lkey, data_addr, + rte_cpu_to_be_32(data_len ? + data_len : + 0x80000000)); + if (--nb_segs == 0) + return ctrl_next; + /* Prepare next buf info */ + sbuf = sbuf->next; + dseg++; + data_len = sbuf->data_len; + data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + /* fallthrough */ + case 2: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) + goto err; + mlx4_fill_tx_data_seg(dseg, lkey, data_addr, + rte_cpu_to_be_32(data_len ? + data_len : + 0x80000000)); + if (--nb_segs == 0) + return ctrl_next; + /* Prepare next buf info */ + sbuf = sbuf->next; + dseg++; + data_len = sbuf->data_len; + data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + /* fallthrough */ + case 1: + lkey = mlx4_tx_mb2mr(txq, sbuf); + if (unlikely(lkey == (uint32_t)-1)) + goto err; + mlx4_fill_tx_data_seg(dseg, lkey, data_addr, + rte_cpu_to_be_32(data_len ? + data_len : + 0x80000000)); + if (--nb_segs == 0) + return ctrl_next; + /* Prepare next buf info */ + sbuf = sbuf->next; + dseg++; + data_len = sbuf->data_len; + data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + /* fallthrough */ + } + /* Wrap dseg if it points at the end of the queue. */ + if ((volatile uint8_t *)dseg >= sq->eob) + dseg = (volatile struct mlx4_wqe_data_seg *) + ((volatile uint8_t *)dseg - sq->size); + } while (true); +err: + return NULL; +} + +/** + * Fill the packet's l2, l3 and l4 headers to the WQE. + * + * This will be used as the header for each TSO segment that is transmitted. + * + * @param buf + * Pointer to the first packet mbuf. 
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param tinfo
+ * Pointer to TSO info to use.
+ * @param ctrl
+ * Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ * Pointer to the first data segment of the WQE on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+ struct txq *txq,
+ struct tso_info *tinfo,
+ volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+ volatile struct mlx4_wqe_lso_seg *tseg =
+ (volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+ struct mlx4_sq *sq = &txq->msq;
+ struct pv *pv = tinfo->pv;
+ int *pv_counter = &tinfo->pv_counter;
+ int remain_size = tinfo->tso_header_size;
+ char *from = rte_pktmbuf_mtod(buf, char *);
+ uint16_t txbb_avail_space;
+ /* Union to overcome volatile constraints when copying TSO header. */
+ union {
+ volatile uint8_t *vto;
+ uint8_t *to;
+ } thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+ /*
+ * TSO data always starts at offset 20 from the beginning of the TXBB
+ * (16-byte ctrl + 4-byte TSO desc). Since each TXBB is 64-byte aligned
+ * we can write the first 44 TSO header bytes without worrying about
+ * TxQ wrapping or overwriting the first TXBB 32-bit word.
+ */
+ txbb_avail_space = MLX4_TXBB_SIZE -
+ (sizeof(struct mlx4_wqe_ctrl_seg) +
+ sizeof(struct mlx4_wqe_lso_seg));
+ while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
+ /* Copy to end of txbb. */
+ rte_memcpy(thdr.to, from, txbb_avail_space);
+ from += txbb_avail_space;
+ thdr.to += txbb_avail_space;
+ /* New TXBB, check for TxQ wrap. */
+ if (thdr.to >= sq->eob)
+ thdr.vto = sq->buf;
+ /* New TXBB, stash the first 32 bits for later use. */
+ pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+ pv[(*pv_counter)++].val = *(uint32_t *)from;
+ from += sizeof(uint32_t);
+ thdr.to += sizeof(uint32_t);
+ remain_size -= txbb_avail_space + sizeof(uint32_t);
+ /* Avail space in new TXBB is TXBB size - 4 */
+ txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+ }
+ if (remain_size > txbb_avail_space) {
+ rte_memcpy(thdr.to, from, txbb_avail_space);
+ from += txbb_avail_space;
+ thdr.to += txbb_avail_space;
+ remain_size -= txbb_avail_space;
+ /* New TXBB, check for TxQ wrap. */
+ if (thdr.to >= sq->eob)
+ thdr.vto = sq->buf;
+ pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+ rte_memcpy(&pv[*pv_counter].val, from, remain_size);
+ (*pv_counter)++;
+ } else if (remain_size) {
+ rte_memcpy(thdr.to, from, remain_size);
+ }
+ tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+ tinfo->tso_header_size);
+ /* Calculate data segment location */
+ return (volatile struct mlx4_wqe_data_seg *)
+ ((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ * Pointer to the first packet mbuf.
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param ctrl
+ * Pointer to the WQE control segment.
+ *
+ * @return
+ * Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+ volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+ volatile struct mlx4_wqe_data_seg *dseg;
+ volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+ struct mlx4_sq *sq = &txq->msq;
+ struct tso_info tinfo;
+ struct pv *pv;
+ int pv_counter;
+ int ret;
+
+ ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+ if (unlikely(ret))
+ goto error;
+ dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+ if (unlikely(dseg == NULL))
+ goto error;
+ if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+ dseg = (volatile struct mlx4_wqe_data_seg *)
+ ((uintptr_t)dseg - sq->size);
+ ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+ if (unlikely(ctrl_next == NULL))
+ goto error;
+ /* Write the first DWORD of each TXBB saved earlier. */
+ if (likely(tinfo.pv_counter)) {
+ pv = tinfo.pv;
+ pv_counter = tinfo.pv_counter;
+ /* Need a barrier here before writing the first TXBB word. */
+ rte_io_wmb();
+ do {
+ --pv_counter;
+ *pv[pv_counter].dst = pv[pv_counter].val;
+ } while (pv_counter > 0);
+ }
+ ctrl->fence_size = tinfo.fence_size;
+ sq->remain_size -= tinfo.wqe_size;
+ return ctrl_next;
+error:
+ txq->stats.odropped++;
+ return NULL;
+}
+
 /**
  * Write data segments of multi-segment packet.
  *
@@ -437,7 +765,7 @@ mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
 goto txbb_tail_segs;
 txbb_head_seg:
 /* Memory region key (big endian) for this memory pool. */
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
 if (unlikely(lkey == (uint32_t)-1)) {
 DEBUG("%p: unable to get MP <-> MR association",
 (void *)txq);
@@ -449,7 +777,7 @@ txbb_head_seg:
 dseg = (volatile struct mlx4_wqe_data_seg *)
 sq->buf;
 dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t));
- dseg->lkey = rte_cpu_to_be_32(lkey);
+ dseg->lkey = lkey;
 /*
 * This data segment starts at the beginning of a new
 * TXBB, so we need to postpone its byte_count writing
@@ -469,7 +797,7 @@ txbb_tail_segs:
 /* Jump to default if there are more than two segments remaining. */
 switch (nb_segs) {
 default:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
 if (unlikely(lkey == (uint32_t)-1)) {
 DEBUG("%p: unable to get MP <-> MR association",
 (void *)txq);
@@ -485,7 +813,7 @@ txbb_tail_segs:
 nb_segs--;
 /* fallthrough */
 case 2:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
 if (unlikely(lkey == (uint32_t)-1)) {
 DEBUG("%p: unable to get MP <-> MR association",
 (void *)txq);
@@ -501,7 +829,7 @@ txbb_tail_segs:
 nb_segs--;
 /* fallthrough */
 case 1:
- lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ lkey = mlx4_tx_mb2mr(txq, sbuf);
 if (unlikely(lkey == (uint32_t)-1)) {
 DEBUG("%p: unable to get MP <-> MR association",
 (void *)txq);
@@ -587,6 +915,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 uint16_t flags16[2];
 } srcrb;
 uint32_t lkey;
+ bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 /* Clean up old buffer.
@@ -587,6 +915,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -605,13 +934,22 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO. */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
 				break;
 			}
-			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+			lkey = mlx4_tx_mb2mr(txq, buf);
 			if (unlikely(lkey == (uint32_t)-1)) {
 				/* MR does not exist. */
 				DEBUG("%p: unable to get MP <-> MR association",
@@ -639,7 +977,7 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			ctrl_next = (volatile struct mlx4_wqe_ctrl_seg *)
 				((volatile uint8_t *)ctrl_next - sq->size);
 			/* Flip HW valid ownership. */
-			sq->owner_opcode ^= 0x1 << MLX4_SQ_OWNER_BIT;
+			sq->owner_opcode ^= 1u << MLX4_SQ_OWNER_BIT;
 		}
 		/*
 		 * For raw Ethernet, the SOLICIT flag is used to indicate
@@ -746,11 +1084,13 @@ rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe,
 	 *   bit[4] - MLX4_CQE_STATUS_TCP
 	 *   bit[3] - MLX4_CQE_STATUS_IPV4OPT
 	 *   bit[2] - MLX4_CQE_STATUS_IPV6
-	 *   bit[1] - MLX4_CQE_STATUS_IPV4F
+	 *   bit[1] - MLX4_CQE_STATUS_IPF
 	 *   bit[0] - MLX4_CQE_STATUS_IPV4
 	 * giving a total of up to 256 entries.
 	 */
 	idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22);
+	if (status & MLX4_CQE_STATUS_IPV6)
+		idx |= ((status & MLX4_CQE_STATUS_IPV6F) >> 11);
 	return mlx4_ptype_table[idx];
 }
 
@@ -934,11 +1274,14 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				goto skip;
 			}
 			pkt = seg;
+			assert(len >= (rxq->crc_present << 2));
 			/* Update packet information. */
 			pkt->packet_type =
 				rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload);
 			pkt->ol_flags = PKT_RX_RSS_HASH;
 			pkt->hash.rss = cqe->immed_rss_invalid;
+			if (rxq->crc_present)
+				len -= ETHER_CRC_LEN;
 			pkt->pkt_len = len;
 			if (rxq->csum | rxq->csum_l2tun) {
 				uint32_t flags =
@@ -963,6 +1306,9 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * changes.
 		 */
 		scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+		/* If there's only one MR, no need to replace LKey in WQE. */
+		if (unlikely(mlx4_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+			scat->lkey = mlx4_rx_mb2mr(rxq, rep);
 		if (len > seg->data_len) {
 			len -= seg->data_len;
 			++pkt->nb_segs;
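
rxq_cq_to_pkt_type() shows the usual trick for cheap packet-type
resolution: fold the interesting CQE status bits into a dense index and
look the answer up in a prebuilt table; the added lines fold the IPv6
fragment bit into a spare index bit the same way. A toy version with
made-up bit positions (the ST_* macros and table contents are
illustrative, not the real CQE layout):

    #include <stdint.h>

    /* Pretend bits 22..26 of the status word carry the type flags
     * and bit 12 a fragment flag; pack them into a 6-bit index. */
    #define ST_PTYPE_MASK (0x1fu << 22)
    #define ST_FRAG       (1u << 12)

    static const uint32_t ptype_table[64] = {
        /* index -> packet type id; most entries omitted */
        [1] = 0x0001, /* e.g. plain IPv4 */
    };

    static uint32_t
    status_to_ptype(uint32_t status)
    {
        uint32_t idx = (status & ST_PTYPE_MASK) >> 22; /* bits 0..4 */

        if (status & ST_FRAG)
            idx |= (status & ST_FRAG) >> 7; /* becomes bit 5 */
        return ptype_table[idx];
    }
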
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index c12bd39a..ffa8abfc 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
  */
 
 #ifndef MLX4_RXTX_H_
@@ -25,6 +25,7 @@
 
 #include "mlx4.h"
 #include "mlx4_prm.h"
+#include "mlx4_mr.h"
 
 /** Rx queue counters. */
 struct mlx4_rxq_stats {
@@ -39,7 +40,6 @@ struct mlx4_rxq_stats {
 struct rxq {
 	struct priv *priv; /**< Back pointer to private data. */
 	struct rte_mempool *mp; /**< Memory pool for allocations. */
-	struct mlx4_mr *mr; /**< Memory region. */
 	struct ibv_cq *cq; /**< Completion queue. */
 	struct ibv_wq *wq; /**< Work queue. */
 	struct ibv_comp_channel *channel; /**< Rx completion channel. */
@@ -47,11 +47,13 @@ struct rxq {
 	uint16_t port_id; /**< Port ID for incoming packets. */
 	uint16_t sges_n; /**< Number of segments per packet (log2 value). */
 	uint16_t elts_n; /**< Mbuf queue size (log2 value). */
+	struct mlx4_mr_ctrl mr_ctrl; /**< MR control descriptor. */
 	struct rte_mbuf *(*elts)[]; /**< Rx elements. */
 	volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */
 	volatile uint32_t *rq_db; /**< RQ doorbell record. */
 	uint32_t csum:1; /**< Enable checksum offloading. */
 	uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+	uint32_t crc_present:1; /**< CRC must be subtracted. */
 	uint32_t l2tun_offload:1; /**< L2 tunnel offload is enabled. */
 	struct mlx4_cq mcq;  /**< Info for directly manipulating the CQ. */
 	struct mlx4_rxq_stats stats; /**< Rx queue counters. */
@@ -83,12 +85,12 @@ struct txq_elt {
 	};
 };
 
-/** Rx queue counters. */
+/** Tx queue counters. */
 struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets that failed to transmit. */
 };
 
 /** Tx queue descriptor. */
@@ -100,6 +102,7 @@ struct txq {
 	int elts_comp_cd; /**< Countdown for next completion. */
 	unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
 	unsigned int elts_n; /**< (*elts)[] length. */
+	struct mlx4_mr_ctrl mr_ctrl; /**< MR control descriptor. */
 	struct txq_elt (*elts)[]; /**< Tx elements. */
 	struct mlx4_txq_stats stats; /**< Tx queue counters. */
 	uint32_t max_inline; /**< Max inline send size. */
@@ -108,11 +111,6 @@ struct txq {
 	uint32_t lb:1; /**< Whether packets should be looped back by eSwitch. */
 	uint8_t *bounce_buf;
 	/**< Memory used for storing the first DWORD of data TXBBs. */
-	struct {
-		const struct rte_mempool *mp; /**< Cached memory pool. */
-		struct mlx4_mr *mr; /**< Memory region (for mp). */
-		uint32_t lkey; /**< mr->lkey copy. */
-	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
 	struct priv *priv; /**< Back pointer to private data. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
 	struct ibv_cq *cq; /**< Completion queue. */
@@ -126,7 +124,7 @@ uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
 int mlx4_rss_init(struct priv *priv);
 void mlx4_rss_deinit(struct priv *priv);
 struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
-			      uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+			      const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 			      uint16_t queues, const uint16_t queue_id[]);
 void mlx4_rss_put(struct mlx4_rss *rss);
 int mlx4_rss_attach(struct mlx4_rss *rss);
@@ -160,34 +158,70 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			const struct rte_eth_txconf *conf);
 void mlx4_tx_queue_release(void *dpdk_txq);
 
+/* mlx4_mr.c */
+
+void mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl);
+uint32_t mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr);
+uint32_t mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr);
+
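These declarations split every lookup into two levels: a tiny per-queue
cache scanned linearly on the fast path, and a *_bh() bottom half that
does the expensive binary search and refills the cache on a miss. A
plausible shape for the fast path, assuming each entry holds an address
range plus its LKey (mr_cache_entry and mr_lookup_cache here are
illustrative; the real layout lives in mlx4_mr.h and may differ):

    #include <stdint.h>

    struct mr_cache_entry {
        uintptr_t start; /* inclusive start of the registered range */
        uintptr_t end;   /* exclusive end */
        uint32_t lkey;   /* key to place in the data segment */
    };

    /* Linear scan, most-recently-used slot first; UINT32_MAX on miss. */
    static inline uint32_t
    mr_lookup_cache(struct mr_cache_entry *cache, uint16_t *mru,
                    int n, uintptr_t addr)
    {
        int i = *mru;

        if (cache[i].start <= addr && addr < cache[i].end)
            return cache[i].lkey;
        for (i = 0; i < n; i++) {
            if (cache[i].start <= addr && addr < cache[i].end) {
                *mru = i; /* remember the hit for next time */
                return cache[i].lkey;
            }
        }
        return UINT32_MAX; /* caller falls through to the bottom half */
    }
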
 /**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Call mlx4_txq_add_mr() if MP is not registered yet.
+ * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx
+ * as mempool is pre-configured and static.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ * @param addr
+ *   Address to search.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx4_rx_addr2mr(struct rxq *rxq, uintptr_t addr)
+{
+	struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
+	uint32_t lkey;
+
+	/* Linear search on MR cache array. */
+	lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru,
+				    MLX4_MR_CACHE_N, addr);
+	if (likely(lkey != UINT32_MAX))
+		return lkey;
+	/* Take slower bottom-half (binary search) on miss. */
+	return mlx4_rx_addr2mr_bh(rxq, addr);
+}
+
+#define mlx4_rx_mb2mr(rxq, mb) mlx4_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr))
+
+/**
+ * Query LKey from a packet buffer for Tx. If not found, add the mempool.
 *
 * @param txq
 *   Pointer to Tx queue structure.
- * @param[in] mp
- *   Memory pool for which a memory region lkey must be returned.
+ * @param addr
+ *   Address to search.
 *
 * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
+ *   Searched LKey on success, UINT32_MAX on no match.
 */
-static inline uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+static __rte_always_inline uint32_t
+mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
 {
-	unsigned int i;
-
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			/* MP found MP. */
-			return txq->mp2mr[i].lkey;
-		}
-	}
-	return mlx4_txq_add_mr(txq, mp, i);
+	struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+	uint32_t lkey;
+
+	/* Check the generation number for any change on existing MRs. */
+	if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
+		mlx4_mr_flush_local_cache(mr_ctrl);
+	/* Linear search on MR cache array. */
+	lkey = mlx4_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru,
+				    MLX4_MR_CACHE_N, addr);
+	if (likely(lkey != UINT32_MAX))
+		return lkey;
+	/* Take slower bottom-half (binary search) on miss. */
+	return mlx4_tx_addr2mr_bh(txq, addr);
 }
 
+#define mlx4_tx_mb2mr(txq, mb) mlx4_tx_addr2mr(txq, (uintptr_t)((mb)->buf_addr))
+
 #endif /* MLX4_RXTX_H_ */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 071b2d5d..9aa7440d 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
  */
 
 /**
@@ -63,64 +63,6 @@ mlx4_txq_free_elts(struct txq *txq)
 	txq->elts_tail = txq->elts_head;
 }
 
-struct txq_mp2mr_mbuf_check_data {
-	int ret;
-};
-
-/**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
- *
- * @param[in] mp
- *   The mempool pointer
- * @param[in] arg
- *   Context data (struct mlx4_txq_mp2mr_mbuf_check_data). Contains the
- *   return value.
- * @param[in] obj
- *   Object address.
- * @param index
- *   Object index, unused.
- */
-static void
-mlx4_txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
-			  uint32_t index)
-{
-	struct txq_mp2mr_mbuf_check_data *data = arg;
-	struct rte_mbuf *buf = obj;
-
-	(void)index;
-	/*
-	 * Check whether mbuf structure fits element size and whether mempool
-	 * pointer is valid.
-	 */
-	if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
-		data->ret = -1;
-}
-
-/**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a Tx queue.
- *
- * @param[in] mp
- *   Memory Pool to register.
- * @param *arg
- *   Pointer to Tx queue structure.
- */
-static void
-mlx4_txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
-	struct txq *txq = arg;
-	struct txq_mp2mr_mbuf_check_data data = {
-		.ret = 0,
-	};
-
-	/* Register mempool only if the first element looks like a mbuf. */
-	if (rte_mempool_obj_iter(mp, mlx4_txq_mp2mr_mbuf_check, &data) == 0 ||
-	    data.ret == -1)
-		return;
-	mlx4_txq_mp2mr(txq, mp);
-}
-
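The Tx fast path above stays cheap because invalidation is lazy: any
change to the device's MR set bumps a global generation number, and each
queue flushes its local cache only when it notices the mismatch on its
next lookup. Reduced to its essentials, and ignoring the locking the
driver takes around memory events (field names mirror mlx4_mr_ctrl; the
flush body is stubbed):

    #include <stdint.h>

    struct mr_ctrl {
        uint32_t *dev_gen_ptr; /* device-wide generation counter */
        uint32_t cur_gen;      /* generation this cache reflects */
        uint16_t mru;          /* most-recently-used cache slot */
        /* ... cached entries ... */
    };

    /* Writer side: publish that the MR set changed. */
    static void
    mr_set_changed(uint32_t *dev_gen)
    {
        (*dev_gen)++; /* queues flush lazily on their next lookup */
    }

    /* Reader side: flush local state iff a change was published. */
    static void
    mr_ctrl_refresh(struct mr_ctrl *ctrl)
    {
        if (*ctrl->dev_gen_ptr != ctrl->cur_gen) {
            /* drop cached entries here ... */
            ctrl->mru = 0;
            ctrl->cur_gen = *ctrl->dev_gen_ptr;
        }
    }
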
 /**
  * Retrieves information needed in order to directly access the Tx queue.
  *
@@ -144,9 +86,9 @@ mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
 	uint32_t headroom_size = 2048 + (1 << dqp->sq.wqe_shift);
 	/* Continuous headroom size bytes must always stay freed. */
 	sq->remain_size = sq->size - headroom_size;
-	sq->owner_opcode = MLX4_OPCODE_SEND | (0 << MLX4_SQ_OWNER_BIT);
+	sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT);
 	sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
-				     (0 << MLX4_SQ_OWNER_BIT));
+				     (0u << MLX4_SQ_OWNER_BIT));
 	sq->db = dqp->sdb;
 	sq->doorbell_qpn = dqp->doorbell_qpn;
 	cq->buf = dcq->buf.buf;
@@ -174,31 +116,17 @@ mlx4_get_tx_port_offloads(struct priv *priv)
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }
 
-/**
- * Checks if the per-queue offload configuration is valid.
- *
- * @param priv
- *   Pointer to private structure.
- * @param requested
- *   Per-queue offloads configuration.
- *
- * @return
- *   Nonzero when configuration is valid.
- */
-static int
-mlx4_check_tx_queue_offloads(struct priv *priv, uint64_t requested)
-{
-	uint64_t mandatory = priv->dev->data->dev_conf.txmode.offloads;
-	uint64_t supported = mlx4_get_tx_port_offloads(priv);
-
-	return !((mandatory ^ requested) & supported);
-}
-
 /**
  * DPDK callback to configure a Tx queue.
  *
@@ -246,23 +174,13 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		},
 	};
 	int ret;
+	uint64_t offloads;
+
+	offloads = conf->offloads | dev->data->dev_conf.txmode.offloads;
 
 	DEBUG("%p: configuring queue %u for %u descriptors",
 	      (void *)dev, idx, desc);
-	/*
-	 * Don't verify port offloads for application which
-	 * use the old API.
-	 */
-	if ((conf->txq_flags & ETH_TXQ_FLAGS_IGNORE) &&
-	    !mlx4_check_tx_queue_offloads(priv, conf->offloads)) {
-		rte_errno = ENOTSUP;
-		ERROR("%p: Tx queue offloads 0x%" PRIx64 " don't match port "
-		      "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64,
-		      (void *)dev, conf->offloads,
-		      dev->data->dev_conf.txmode.offloads,
-		      mlx4_get_tx_port_offloads(priv));
-		return -rte_errno;
-	}
+
 	if (idx >= dev->data->nb_tx_queues) {
 		rte_errno = EOVERFLOW;
 		ERROR("%p: queue index out of range (%u >= %u)",
@@ -313,11 +231,11 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		.elts_comp_cd_init =
 			RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
 		.csum = priv->hw_csum &&
-			(conf->offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
+			(offloads & (DEV_TX_OFFLOAD_IPV4_CKSUM |
 					   DEV_TX_OFFLOAD_UDP_CKSUM |
 					   DEV_TX_OFFLOAD_TCP_CKSUM)),
 		.csum_l2tun = priv->hw_csum_l2tun &&
-			      (conf->offloads &
+			      (offloads &
 			       DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM),
 		/* Enable Tx loopback for VF devices. */
 		.lb = !!priv->vf,
@@ -404,8 +322,13 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	/* Save first wqe pointer in the first element. */
 	(&(*txq->elts)[0])->wqe =
 		(volatile struct mlx4_wqe_ctrl_seg *)txq->msq.buf;
-	/* Pre-register known mempools. */
-	rte_mempool_walk(mlx4_txq_mp2mr_iter, txq);
+	if (mlx4_mr_btree_init(&txq->mr_ctrl.cache_bh,
+			       MLX4_MR_BTREE_CACHE_N, socket)) {
+		/* rte_errno is already set. */
+		goto error;
+	}
+	/* Save pointer of global generation number to check memory event. */
+	txq->mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen;
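
With mlx4_check_tx_queue_offloads() gone, the 18.08 convention applies:
the offloads a queue actually gets are the union of the per-queue request
and the port-wide configuration, which the csum/csum_l2tun initializers
then test. In sketch form, with illustrative flag values standing in for
the DEV_TX_OFFLOAD_* constants from rte_ethdev.h:

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative stand-ins for DEV_TX_OFFLOAD_* bit values. */
    #define TX_OFF_IPV4_CKSUM (1ull << 1)
    #define TX_OFF_UDP_CKSUM  (1ull << 2)
    #define TX_OFF_TCP_CKSUM  (1ull << 3)

    static bool
    txq_csum_enabled(uint64_t queue_off, uint64_t port_off, bool hw_csum)
    {
        /* Per-queue offloads are the union of both requests. */
        uint64_t offloads = queue_off | port_off;

        return hw_csum && (offloads & (TX_OFF_IPV4_CKSUM |
                                       TX_OFF_UDP_CKSUM |
                                       TX_OFF_TCP_CKSUM));
    }
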
 	DEBUG("%p: adding Tx queue %p to list", (void *)dev, (void *)txq);
 	dev->data->tx_queues[idx] = txq;
 	return 0;
@@ -446,11 +369,6 @@ mlx4_tx_queue_release(void *dpdk_txq)
 		claim_zero(mlx4_glue->destroy_qp(txq->qp));
 	if (txq->cq)
 		claim_zero(mlx4_glue->destroy_cq(txq->cq));
-	for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) {
-		if (!txq->mp2mr[i].mp)
-			break;
-		assert(txq->mp2mr[i].mr);
-		mlx4_mr_put(txq->mp2mr[i].mr);
-	}
+	mlx4_mr_btree_free(&txq->mr_ctrl.cache_bh);
 	rte_free(txq);
 }
diff --git a/drivers/net/mlx4/mlx4_utils.c b/drivers/net/mlx4/mlx4_utils.c
index d10812ec..a727d703 100644
--- a/drivers/net/mlx4/mlx4_utils.c
+++ b/drivers/net/mlx4/mlx4_utils.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
  */
 
 /**
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index 9fdbacad..86abb3b7 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox
+ * Copyright 2017 Mellanox Technologies, Ltd
  */
 
 #ifndef MLX4_UTILS_H_
-- 
cgit 1.2.3-korg