diff options
Diffstat (limited to 'drivers/net/mlx5')
28 files changed, 7195 insertions, 3676 deletions
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index 3bc9736c..8a5229e6 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -1,33 +1,6 @@ -# BSD LICENSE -# +# SPDX-License-Identifier: BSD-3-Clause # Copyright 2015 6WIND S.A. -# Copyright 2015 Mellanox. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of 6WIND S.A. nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright 2015 Mellanox Technologies, Ltd include $(RTE_SDK)/mk/rte.vars.mk @@ -35,7 +8,7 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_pmd_mlx5.a LIB_GLUE = $(LIB_GLUE_BASE).$(LIB_GLUE_VERSION) LIB_GLUE_BASE = librte_pmd_mlx5_glue.so -LIB_GLUE_VERSION = 18.02.0 +LIB_GLUE_VERSION = 18.05.0 # Sources. SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c @@ -59,6 +32,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y) INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE) @@ -92,6 +66,9 @@ CFLAGS += -Wno-error=cast-qual EXPORT_MAP := rte_pmd_mlx5_version.map LIBABIVER := 1 +# memseg walk is not part of stable API +CFLAGS += -DALLOW_EXPERIMENTAL_API + # DEBUG which is usually provided on the command-line may enable # CONFIG_RTE_LIBRTE_MLX5_DEBUG. ifeq ($(DEBUG),1) @@ -105,10 +82,6 @@ else CFLAGS += -DNDEBUG -UPEDANTIC endif -ifdef CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE -CFLAGS += -DMLX5_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE) -endif - include $(RTE_SDK)/mk/rte.lib.mk # Generate and clean-up mlx5_autoconf.h. @@ -125,9 +98,19 @@ mlx5_autoconf.h.new: FORCE mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh $Q $(RM) -f -- '$@' $Q sh -- '$<' '$@' \ - HAVE_IBV_DEVICE_VXLAN_SUPPORT \ + HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT \ + infiniband/mlx5dv.h \ + enum MLX5DV_CONTEXT_MASK_STRIDING_RQ \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_IBV_DEVICE_TUNNEL_SUPPORT \ + infiniband/mlx5dv.h \ + enum MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_IBV_DEVICE_MPLS_SUPPORT \ infiniband/verbs.h \ - enum IBV_DEVICE_VXLAN_SUPPORT \ + enum IBV_FLOW_SPEC_MPLS \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ HAVE_IBV_WQ_FLAG_RX_END_PADDING \ @@ -135,6 +118,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum IBV_WQ_FLAG_RX_END_PADDING \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_IBV_MLX5_MOD_SWP \ + infiniband/mlx5dv.h \ + type 'struct mlx5dv_sw_parsing_caps' \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_IBV_MLX5_MOD_MPW \ infiniband/mlx5dv.h \ enum MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED \ @@ -181,8 +169,13 @@ ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y) $(LIB): $(LIB_GLUE) +ifeq ($(LINK_USING_CC),1) +GLUE_LDFLAGS := $(call linkerprefix,$(LDFLAGS)) +else +GLUE_LDFLAGS := $(LDFLAGS) +endif $(LIB_GLUE): mlx5_glue.o - $Q $(LD) $(LDFLAGS) $(EXTRA_LDFLAGS) \ + $Q $(LD) $(GLUE_LDFLAGS) $(EXTRA_LDFLAGS) \ -Wl,-h,$(LIB_GLUE) \ -s -shared -o $@ $< -libverbs -lmlx5 diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 6c0985bd..c933e274 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -13,6 +13,7 @@ #include <errno.h> #include <net/if.h> #include <sys/mman.h> +#include <linux/rtnetlink.h> /* Verbs header. */ /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ @@ -33,6 +34,8 @@ #include <rte_config.h> #include <rte_eal_memconfig.h> #include <rte_kvargs.h> +#include <rte_rwlock.h> +#include <rte_spinlock.h> #include "mlx5.h" #include "mlx5_utils.h" @@ -40,10 +43,23 @@ #include "mlx5_autoconf.h" #include "mlx5_defs.h" #include "mlx5_glue.h" +#include "mlx5_mr.h" /* Device parameter to enable RX completion queue compression. */ #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" +/* Device parameter to enable Multi-Packet Rx queue. */ +#define MLX5_RX_MPRQ_EN "mprq_en" + +/* Device parameter to configure log 2 of the number of strides for MPRQ. */ +#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num" + +/* Device parameter to limit the size of memcpy'd packet for MPRQ. */ +#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len" + +/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */ +#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq" + /* Device parameter to configure inline send. */ #define MLX5_TXQ_INLINE "txq_inline" @@ -68,6 +84,12 @@ /* Device parameter to enable hardware Rx vector. */ #define MLX5_RX_VEC_EN "rx_vec_en" +/* Allow L3 VXLAN flow creation. */ +#define MLX5_L3_VXLAN_EN "l3_vxlan_en" + +/* Activate Netlink support in VF mode. */ +#define MLX5_VF_NL_EN "vf_nl_en" + #ifndef HAVE_IBV_MLX5_MOD_MPW #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) @@ -77,6 +99,50 @@ #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) #endif +static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data"; + +/* Shared memory between primary and secondary processes. */ +struct mlx5_shared_data *mlx5_shared_data; + +/* Spinlock for mlx5_shared_data allocation. */ +static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER; + +/** Driver-specific log messages type. */ +int mlx5_logtype; + +/** + * Prepare shared data between primary and secondary process. + */ +static void +mlx5_prepare_shared_data(void) +{ + const struct rte_memzone *mz; + + rte_spinlock_lock(&mlx5_shared_data_lock); + if (mlx5_shared_data == NULL) { + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* Allocate shared memory. */ + mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA, + sizeof(*mlx5_shared_data), + SOCKET_ID_ANY, 0); + } else { + /* Lookup allocated shared memory. */ + mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA); + } + if (mz == NULL) + rte_panic("Cannot allocate mlx5 shared data\n"); + mlx5_shared_data = mz->addr; + /* Initialize shared data. */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + LIST_INIT(&mlx5_shared_data->mem_event_cb_list); + rte_rwlock_init(&mlx5_shared_data->mem_event_rwlock); + } + rte_mem_event_callback_register("MLX5_MEM_EVENT_CB", + mlx5_mr_mem_event_cb, NULL); + } + rte_spinlock_unlock(&mlx5_shared_data_lock); +} + /** * Retrieve integer value from environment variable. * @@ -108,7 +174,7 @@ mlx5_getenv_int(const char *name) * A pointer to the callback data. * * @return - * a pointer to the allocate space. + * Allocated buffer, NULL otherwise and rte_errno is set. */ static void * mlx5_alloc_verbs_buf(size_t size, void *data) @@ -130,7 +196,8 @@ mlx5_alloc_verbs_buf(size_t size, void *data) } assert(data != NULL); ret = rte_malloc_socket(__func__, size, alignment, socket); - DEBUG("Extern alloc size: %lu, align: %lu: %p", size, alignment, ret); + if (!ret && size) + rte_errno = ENOMEM; return ret; } @@ -146,7 +213,6 @@ static void mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) { assert(data != NULL); - DEBUG("Extern free request: %p", ptr); rte_free(ptr); } @@ -165,13 +231,12 @@ mlx5_dev_close(struct rte_eth_dev *dev) unsigned int i; int ret; - priv_lock(priv); - DEBUG("%p: closing device \"%s\"", - (void *)dev, - ((priv->ctx != NULL) ? priv->ctx->device->name : "")); + DRV_LOG(DEBUG, "port %u closing device \"%s\"", + dev->data->port_id, + ((priv->ctx != NULL) ? priv->ctx->device->name : "")); /* In case mlx5_dev_stop() has not been called. */ - priv_dev_interrupt_handler_uninstall(priv, dev); - priv_dev_traffic_disable(priv, dev); + mlx5_dev_interrupt_handler_uninstall(dev); + mlx5_traffic_disable(dev); /* Prevent crashes when queues are still in use. */ dev->rx_pkt_burst = removed_rx_burst; dev->tx_pkt_burst = removed_tx_burst; @@ -179,7 +244,7 @@ mlx5_dev_close(struct rte_eth_dev *dev) /* XXX race condition if mlx5_rx_burst() is still running. */ usleep(1000); for (i = 0; (i != priv->rxqs_n); ++i) - mlx5_priv_rxq_release(priv, i); + mlx5_rxq_release(dev, i); priv->rxqs_n = 0; priv->rxqs = NULL; } @@ -187,10 +252,13 @@ mlx5_dev_close(struct rte_eth_dev *dev) /* XXX race condition if mlx5_tx_burst() is still running. */ usleep(1000); for (i = 0; (i != priv->txqs_n); ++i) - mlx5_priv_txq_release(priv, i); + mlx5_txq_release(dev, i); priv->txqs_n = 0; priv->txqs = NULL; } + mlx5_flow_delete_drop_queue(dev); + mlx5_mprq_free_mp(dev); + mlx5_mr_release(dev); if (priv->pd != NULL) { assert(priv->ctx != NULL); claim_zero(mlx5_glue->dealloc_pd(priv->pd)); @@ -202,32 +270,39 @@ mlx5_dev_close(struct rte_eth_dev *dev) if (priv->reta_idx != NULL) rte_free(priv->reta_idx); if (priv->primary_socket) - priv_socket_uninit(priv); - ret = mlx5_priv_hrxq_ibv_verify(priv); - if (ret) - WARN("%p: some Hash Rx queue still remain", (void *)priv); - ret = mlx5_priv_ind_table_ibv_verify(priv); + mlx5_socket_uninit(dev); + if (priv->config.vf) + mlx5_nl_mac_addr_flush(dev); + if (priv->nl_socket >= 0) + close(priv->nl_socket); + ret = mlx5_hrxq_ibv_verify(dev); if (ret) - WARN("%p: some Indirection table still remain", (void *)priv); - ret = mlx5_priv_rxq_ibv_verify(priv); + DRV_LOG(WARNING, "port %u some hash Rx queue still remain", + dev->data->port_id); + ret = mlx5_ind_table_ibv_verify(dev); if (ret) - WARN("%p: some Verbs Rx queue still remain", (void *)priv); - ret = mlx5_priv_rxq_verify(priv); + DRV_LOG(WARNING, "port %u some indirection table still remain", + dev->data->port_id); + ret = mlx5_rxq_ibv_verify(dev); if (ret) - WARN("%p: some Rx Queues still remain", (void *)priv); - ret = mlx5_priv_txq_ibv_verify(priv); + DRV_LOG(WARNING, "port %u some Verbs Rx queue still remain", + dev->data->port_id); + ret = mlx5_rxq_verify(dev); if (ret) - WARN("%p: some Verbs Tx queue still remain", (void *)priv); - ret = mlx5_priv_txq_verify(priv); + DRV_LOG(WARNING, "port %u some Rx queues still remain", + dev->data->port_id); + ret = mlx5_txq_ibv_verify(dev); if (ret) - WARN("%p: some Tx Queues still remain", (void *)priv); - ret = priv_flow_verify(priv); + DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain", + dev->data->port_id); + ret = mlx5_txq_verify(dev); if (ret) - WARN("%p: some flows still remain", (void *)priv); - ret = priv_mr_verify(priv); + DRV_LOG(WARNING, "port %u some Tx queues still remain", + dev->data->port_id); + ret = mlx5_flow_verify(dev); if (ret) - WARN("%p: some Memory Region still remain", (void *)priv); - priv_unlock(priv); + DRV_LOG(WARNING, "port %u some flows still remain", + dev->data->port_id); memset(priv, 0, sizeof(*priv)); } @@ -260,6 +335,7 @@ const struct eth_dev_ops mlx5_dev_ops = { .mac_addr_remove = mlx5_mac_addr_remove, .mac_addr_add = mlx5_mac_addr_add, .mac_addr_set = mlx5_mac_addr_set, + .set_mc_addr_list = mlx5_set_mc_addr_list, .mtu_set = mlx5_dev_set_mtu, .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, .vlan_offload_set = mlx5_vlan_offload_set, @@ -312,6 +388,7 @@ const struct eth_dev_ops mlx5_dev_ops_isolate = { .mac_addr_remove = mlx5_mac_addr_remove, .mac_addr_add = mlx5_mac_addr_add, .mac_addr_set = mlx5_mac_addr_set, + .set_mc_addr_list = mlx5_set_mc_addr_list, .mtu_set = mlx5_dev_set_mtu, .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, .vlan_offload_set = mlx5_vlan_offload_set, @@ -367,7 +444,7 @@ mlx5_dev_idx(struct rte_pci_addr *pci_addr) * User data. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_args_check(const char *key, const char *val, void *opaque) @@ -378,11 +455,20 @@ mlx5_args_check(const char *key, const char *val, void *opaque) errno = 0; tmp = strtoul(val, NULL, 0); if (errno) { - WARN("%s: \"%s\" is not a valid integer", key, val); - return errno; + rte_errno = errno; + DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val); + return -rte_errno; } if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { config->cqe_comp = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) { + config->mprq.enabled = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) { + config->mprq.stride_num_n = tmp; + } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) { + config->mprq.max_memcpy_len = tmp; + } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) { + config->mprq.min_rxqs_num = tmp; } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { config->txq_inline = tmp; } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { @@ -397,9 +483,14 @@ mlx5_args_check(const char *key, const char *val, void *opaque) config->tx_vec_en = !!tmp; } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { config->rx_vec_en = !!tmp; + } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) { + config->l3_vxlan_en = !!tmp; + } else if (strcmp(MLX5_VF_NL_EN, key) == 0) { + config->vf_nl_en = !!tmp; } else { - WARN("%s: unknown parameter", key); - return -EINVAL; + DRV_LOG(WARNING, "%s: unknown parameter", key); + rte_errno = EINVAL; + return -rte_errno; } return 0; } @@ -413,13 +504,17 @@ mlx5_args_check(const char *key, const char *val, void *opaque) * Device arguments structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) { const char **params = (const char *[]){ MLX5_RXQ_CQE_COMP_EN, + MLX5_RX_MPRQ_EN, + MLX5_RX_MPRQ_LOG_STRIDE_NUM, + MLX5_RX_MPRQ_MAX_MEMCPY_LEN, + MLX5_RXQS_MIN_MPRQ, MLX5_TXQ_INLINE, MLX5_TXQS_MIN_INLINE, MLX5_TXQ_MPW_EN, @@ -427,6 +522,8 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) MLX5_TXQ_MAX_INLINE_LEN, MLX5_TX_VEC_EN, MLX5_RX_VEC_EN, + MLX5_L3_VXLAN_EN, + MLX5_VF_NL_EN, NULL, }; struct rte_kvargs *kvlist; @@ -444,9 +541,10 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) if (rte_kvargs_count(kvlist, params[i])) { ret = rte_kvargs_process(kvlist, params[i], mlx5_args_check, config); - if (ret != 0) { + if (ret) { + rte_errno = EINVAL; rte_kvargs_free(kvlist); - return ret; + return -rte_errno; } } } @@ -465,50 +563,60 @@ static struct rte_pci_driver mlx5_driver; */ static void *uar_base; +static int +find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + void **addr = arg; + + if (*addr == NULL) + *addr = ms->addr; + else + *addr = RTE_MIN(*addr, ms->addr); + + return 0; +} + /** * Reserve UAR address space for primary process. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_uar_init_primary(struct priv *priv) +mlx5_uar_init_primary(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; void *addr = (void *)0; - int i; - const struct rte_mem_config *mcfg; - int ret; if (uar_base) { /* UAR address space mapped. */ priv->uar_base = uar_base; return 0; } /* find out lower bound of hugepage segments */ - mcfg = rte_eal_get_configuration()->mem_config; - for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) { - if (addr) - addr = RTE_MIN(addr, mcfg->memseg[i].addr); - else - addr = mcfg->memseg[i].addr; - } + rte_memseg_walk(find_lower_va_bound, &addr); + /* keep distance to hugepages to minimize potential conflicts. */ addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE); /* anonymous mmap, no real memory consumption. */ addr = mmap(addr, MLX5_UAR_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { - ERROR("Failed to reserve UAR address space, please adjust " - "MLX5_UAR_SIZE or try --base-virtaddr"); - ret = ENOMEM; - return ret; + DRV_LOG(ERR, + "port %u failed to reserve UAR address space, please" + " adjust MLX5_UAR_SIZE or try --base-virtaddr", + dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; } /* Accept either same addr or a new addr returned from mmap if target * range occupied. */ - INFO("Reserved UAR address space: %p", addr); + DRV_LOG(INFO, "port %u reserved UAR address space: %p", + dev->data->port_id, addr); priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */ uar_base = addr; /* process local, don't reserve again. */ return 0; @@ -518,17 +626,17 @@ priv_uar_init_primary(struct priv *priv) * Reserve UAR address space for secondary process, align with * primary process. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_uar_init_secondary(struct priv *priv) +mlx5_uar_init_secondary(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; void *addr; - int ret; assert(priv->uar_base); if (uar_base) { /* already reserved. */ @@ -539,20 +647,23 @@ priv_uar_init_secondary(struct priv *priv) addr = mmap(priv->uar_base, MLX5_UAR_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { - ERROR("UAR mmap failed: %p size: %llu", - priv->uar_base, MLX5_UAR_SIZE); - ret = ENXIO; - return ret; + DRV_LOG(ERR, "port %u UAR mmap failed: %p size: %llu", + dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE); + rte_errno = ENXIO; + return -rte_errno; } if (priv->uar_base != addr) { - ERROR("UAR address %p size %llu occupied, please adjust " - "MLX5_UAR_OFFSET or try EAL parameter --base-virtaddr", - priv->uar_base, MLX5_UAR_SIZE); - ret = ENXIO; - return ret; + DRV_LOG(ERR, + "port %u UAR address %p size %llu occupied, please" + " adjust MLX5_UAR_OFFSET or try EAL parameter" + " --base-virtaddr", + dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE); + rte_errno = ENXIO; + return -rte_errno; } uar_base = addr; /* process local, don't reserve again */ - INFO("Reserved UAR address space: %p", addr); + DRV_LOG(INFO, "port %u reserved UAR address space: %p", + dev->data->port_id, addr); return 0; } @@ -568,45 +679,57 @@ priv_uar_init_secondary(struct priv *priv) * PCI device information. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev) { - struct ibv_device **list; + struct ibv_device **list = NULL; struct ibv_device *ibv_dev; int err = 0; struct ibv_context *attr_ctx = NULL; struct ibv_device_attr_ex device_attr; - unsigned int sriov; + unsigned int vf = 0; unsigned int mps; unsigned int cqe_comp; unsigned int tunnel_en = 0; + unsigned int mpls_en = 0; + unsigned int swp = 0; + unsigned int verb_priorities = 0; + unsigned int mprq = 0; + unsigned int mprq_min_stride_size_n = 0; + unsigned int mprq_max_stride_size_n = 0; + unsigned int mprq_min_stride_num_n = 0; + unsigned int mprq_max_stride_num_n = 0; int idx; int i; - struct mlx5dv_context attrs_out; + struct mlx5dv_context attrs_out = {0}; #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT struct ibv_counter_set_description cs_desc; #endif - (void)pci_drv; + /* Prepare shared data between primary and secondary process. */ + mlx5_prepare_shared_data(); assert(pci_drv == &mlx5_driver); /* Get mlx5_dev[] index. */ idx = mlx5_dev_idx(&pci_dev->addr); if (idx == -1) { - ERROR("this driver cannot support any more adapters"); - return -ENOMEM; + DRV_LOG(ERR, "this driver cannot support any more adapters"); + err = ENOMEM; + goto error; } - DEBUG("using driver device index %d", idx); - + DRV_LOG(DEBUG, "using driver device index %d", idx); /* Save PCI address. */ mlx5_dev[idx].pci_addr = pci_dev->addr; list = mlx5_glue->get_device_list(&i); if (list == NULL) { assert(errno); + err = errno; if (errno == ENOSYS) - ERROR("cannot list devices, is ib_uverbs loaded?"); - return -errno; + DRV_LOG(ERR, + "cannot list devices, is ib_uverbs loaded?"); + goto error; } assert(i >= 0); /* @@ -617,7 +740,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) struct rte_pci_addr pci_addr; --i; - DEBUG("checking device \"%s\"", list[i]->name); + DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name); if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr)) continue; if ((pci_dev->addr.domain != pci_addr.domain) || @@ -625,7 +748,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) (pci_dev->addr.devid != pci_addr.devid) || (pci_dev->addr.function != pci_addr.function)) continue; - sriov = ((pci_dev->id.device_id == + DRV_LOG(INFO, "PCI information matches, using device \"%s\"", + list[i]->name); + vf = ((pci_dev->id.device_id == PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || (pci_dev->id.device_id == PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) || @@ -633,70 +758,121 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) || (pci_dev->id.device_id == PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)); - switch (pci_dev->id.device_id) { - case PCI_DEVICE_ID_MELLANOX_CONNECTX4: - tunnel_en = 1; - break; - case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: - case PCI_DEVICE_ID_MELLANOX_CONNECTX5: - case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: - case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: - case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: - tunnel_en = 1; - break; - default: - break; - } - INFO("PCI information matches, using device \"%s\"" - " (SR-IOV: %s)", - list[i]->name, - sriov ? "true" : "false"); attr_ctx = mlx5_glue->open_device(list[i]); - err = errno; + rte_errno = errno; + err = rte_errno; break; } if (attr_ctx == NULL) { - mlx5_glue->free_device_list(list); switch (err) { case 0: - ERROR("cannot access device, is mlx5_ib loaded?"); - return -ENODEV; + DRV_LOG(ERR, + "cannot access device, is mlx5_ib loaded?"); + err = ENODEV; + break; case EINVAL: - ERROR("cannot use device, are drivers up to date?"); - return -EINVAL; + DRV_LOG(ERR, + "cannot use device, are drivers up to date?"); + break; } - assert(err > 0); - return -err; + goto error; } ibv_dev = list[i]; - - DEBUG("device opened"); + DRV_LOG(DEBUG, "device opened"); +#ifdef HAVE_IBV_MLX5_MOD_SWP + attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; +#endif /* * Multi-packet send is supported by ConnectX-4 Lx PF as well * as all ConnectX-5 devices. */ +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; +#endif +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; +#endif mlx5_glue->dv_query_device(attr_ctx, &attrs_out); if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { - DEBUG("Enhanced MPW is supported"); + DRV_LOG(DEBUG, "enhanced MPW is supported"); mps = MLX5_MPW_ENHANCED; } else { - DEBUG("MPW is supported"); + DRV_LOG(DEBUG, "MPW is supported"); mps = MLX5_MPW; } } else { - DEBUG("MPW isn't supported"); + DRV_LOG(DEBUG, "MPW isn't supported"); mps = MLX5_MPW_DISABLED; } +#ifdef HAVE_IBV_MLX5_MOD_SWP + if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP) + swp = attrs_out.sw_parsing_caps.sw_parsing_offloads; + DRV_LOG(DEBUG, "SWP support: %u", swp); +#endif +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { + struct mlx5dv_striding_rq_caps mprq_caps = + attrs_out.striding_rq_caps; + + DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", + mprq_caps.min_single_stride_log_num_of_bytes); + DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", + mprq_caps.max_single_stride_log_num_of_bytes); + DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", + mprq_caps.min_single_wqe_log_num_of_strides); + DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", + mprq_caps.max_single_wqe_log_num_of_strides); + DRV_LOG(DEBUG, "\tsupported_qpts: %d", + mprq_caps.supported_qpts); + DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); + mprq = 1; + mprq_min_stride_size_n = + mprq_caps.min_single_stride_log_num_of_bytes; + mprq_max_stride_size_n = + mprq_caps.max_single_stride_log_num_of_bytes; + mprq_min_stride_num_n = + mprq_caps.min_single_wqe_log_num_of_strides; + mprq_max_stride_num_n = + mprq_caps.max_single_wqe_log_num_of_strides; + } +#endif if (RTE_CACHE_LINE_SIZE == 128 && !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) cqe_comp = 0; else cqe_comp = 1; - if (mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr)) +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { + tunnel_en = ((attrs_out.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && + (attrs_out.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE)); + } + DRV_LOG(DEBUG, "tunnel offloading is %ssupported", + tunnel_en ? "" : "not "); +#else + DRV_LOG(WARNING, + "tunnel offloading disabled due to old OFED/rdma-core version"); +#endif +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + mpls_en = ((attrs_out.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && + (attrs_out.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); + DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", + mpls_en ? "" : "not "); +#else + DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" + " old OFED/rdma-core version or firmware configuration"); +#endif + err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr); + if (err) { + DEBUG("ibv_query_device_ex() failed"); goto error; - INFO("%u port(s) detected", device_attr.orig_attr.phys_port_cnt); - + } + DRV_LOG(INFO, "%u port(s) detected", + device_attr.orig_attr.phys_port_cnt); for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) { char name[RTE_ETH_NAME_MAX_LEN]; int len; @@ -706,21 +882,29 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) struct ibv_port_attr port_attr; struct ibv_pd *pd = NULL; struct priv *priv = NULL; - struct rte_eth_dev *eth_dev; + struct rte_eth_dev *eth_dev = NULL; struct ibv_device_attr_ex device_attr_ex; struct ether_addr mac; - uint16_t num_vfs = 0; - struct ibv_device_attr_ex device_attr; struct mlx5_dev_config config = { .cqe_comp = cqe_comp, .mps = mps, .tunnel_en = tunnel_en, + .mpls_en = mpls_en, .tx_vec_en = 1, .rx_vec_en = 1, .mpw_hdr_dseg = 0, .txq_inline = MLX5_ARG_UNSET, .txqs_inline = MLX5_ARG_UNSET, .inline_max_packet_sz = MLX5_ARG_UNSET, + .vf_nl_en = 1, + .swp = !!swp, + .mprq = { + .enabled = 0, /* Disabled by default. */ + .stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, + mprq_min_stride_num_n), + .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, + .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, + }, }; len = snprintf(name, sizeof(name), PCI_PRI_FMT, @@ -728,94 +912,87 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) pci_dev->addr.devid, pci_dev->addr.function); if (device_attr.orig_attr.phys_port_cnt > 1) snprintf(name + len, sizeof(name), " port %u", i); - mlx5_dev[idx].ports |= test; - if (rte_eal_process_type() == RTE_PROC_SECONDARY) { eth_dev = rte_eth_dev_attach_secondary(name); if (eth_dev == NULL) { - ERROR("can not attach rte ethdev"); - err = ENOMEM; + DRV_LOG(ERR, "can not attach rte ethdev"); + rte_errno = ENOMEM; + err = rte_errno; goto error; } eth_dev->device = &pci_dev->device; eth_dev->dev_ops = &mlx5_dev_sec_ops; - priv = eth_dev->data->dev_private; - err = priv_uar_init_secondary(priv); - if (err < 0) { - err = -err; + err = mlx5_uar_init_secondary(eth_dev); + if (err) { + err = rte_errno; goto error; } /* Receive command fd from primary process */ - err = priv_socket_connect(priv); + err = mlx5_socket_connect(eth_dev); if (err < 0) { - err = -err; + err = rte_errno; goto error; } /* Remap UAR for Tx queues. */ - err = priv_tx_uar_remap(priv, err); - if (err) + err = mlx5_tx_uar_remap(eth_dev, err); + if (err) { + err = rte_errno; goto error; + } /* * Ethdev pointer is still required as input since * the primary device is not accessible from the * secondary process. */ eth_dev->rx_pkt_burst = - priv_select_rx_function(priv, eth_dev); + mlx5_select_rx_function(eth_dev); eth_dev->tx_pkt_burst = - priv_select_tx_function(priv, eth_dev); + mlx5_select_tx_function(eth_dev); + rte_eth_dev_probing_finish(eth_dev); continue; } - - DEBUG("using port %u (%08" PRIx32 ")", port, test); - + DRV_LOG(DEBUG, "using port %u (%08" PRIx32 ")", port, test); ctx = mlx5_glue->open_device(ibv_dev); if (ctx == NULL) { err = ENODEV; goto port_error; } - - mlx5_glue->query_device_ex(ctx, NULL, &device_attr); /* Check port status. */ err = mlx5_glue->query_port(ctx, port, &port_attr); if (err) { - ERROR("port query failed: %s", strerror(err)); + DRV_LOG(ERR, "port query failed: %s", strerror(err)); goto port_error; } - if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { - ERROR("port %d is not configured in Ethernet mode", - port); + DRV_LOG(ERR, + "port %d is not configured in Ethernet mode", + port); err = EINVAL; goto port_error; } - if (port_attr.state != IBV_PORT_ACTIVE) - DEBUG("port %d is not active: \"%s\" (%d)", - port, mlx5_glue->port_state_str(port_attr.state), - port_attr.state); - + DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)", + port, + mlx5_glue->port_state_str(port_attr.state), + port_attr.state); /* Allocate protection domain. */ pd = mlx5_glue->alloc_pd(ctx); if (pd == NULL) { - ERROR("PD allocation failure"); + DRV_LOG(ERR, "PD allocation failure"); err = ENOMEM; goto port_error; } - mlx5_dev[idx].ports |= test; - /* from rte_ethdev.c */ priv = rte_zmalloc("ethdev private structure", sizeof(*priv), RTE_CACHE_LINE_SIZE); if (priv == NULL) { - ERROR("priv allocation failure"); + DRV_LOG(ERR, "priv allocation failure"); err = ENOMEM; goto port_error; } - priv->ctx = ctx; strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path, sizeof(priv->ibdev_path)); @@ -825,34 +1002,27 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) priv->mtu = ETHER_MTU; err = mlx5_args(&config, pci_dev->device.devargs); if (err) { - ERROR("failed to process device arguments: %s", - strerror(err)); + DRV_LOG(ERR, "failed to process device arguments: %s", + strerror(err)); + err = rte_errno; goto port_error; } - if (mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex)) { - ERROR("ibv_query_device_ex() failed"); + err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex); + if (err) { + DRV_LOG(ERR, "ibv_query_device_ex() failed"); goto port_error; } - config.hw_csum = !!(device_attr_ex.device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM); - DEBUG("checksum offloading is %ssupported", - (config.hw_csum ? "" : "not ")); - -#ifdef HAVE_IBV_DEVICE_VXLAN_SUPPORT - config.hw_csum_l2tun = - !!(exp_device_attr.exp_device_cap_flags & - IBV_DEVICE_VXLAN_SUPPORT); -#endif - DEBUG("Rx L2 tunnel checksum offloads are %ssupported", - (config.hw_csum_l2tun ? "" : "not ")); - + DRV_LOG(DEBUG, "checksum offloading is %ssupported", + (config.hw_csum ? "" : "not ")); #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT config.flow_counter_en = !!(device_attr.max_counter_sets); mlx5_glue->describe_counter_set(ctx, 0, &cs_desc); - DEBUG("counter type = %d, num of cs = %ld, attributes = %d", - cs_desc.counter_type, cs_desc.num_of_cs, - cs_desc.attributes); + DRV_LOG(DEBUG, + "counter type = %d, num of cs = %ld, attributes = %d", + cs_desc.counter_type, cs_desc.num_of_cs, + cs_desc.attributes); #endif config.ind_table_max_size = device_attr_ex.rss_caps.max_rwq_indirection_table_size; @@ -861,26 +1031,25 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; - DEBUG("maximum RX indirection table size is %u", - config.ind_table_max_size); + DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", + config.ind_table_max_size); config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps & IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); - DEBUG("VLAN stripping is %ssupported", - (config.hw_vlan_strip ? "" : "not ")); + DRV_LOG(DEBUG, "VLAN stripping is %ssupported", + (config.hw_vlan_strip ? "" : "not ")); config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps & IBV_RAW_PACKET_CAP_SCATTER_FCS); - DEBUG("FCS stripping configuration is %ssupported", - (config.hw_fcs_strip ? "" : "not ")); + DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", + (config.hw_fcs_strip ? "" : "not ")); #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align; #endif - DEBUG("hardware RX end alignment padding is %ssupported", - (config.hw_padding ? "" : "not ")); - - priv_get_num_vfs(priv, &num_vfs); - config.sriov = (num_vfs || sriov); + DRV_LOG(DEBUG, + "hardware Rx end alignment padding is %ssupported", + (config.hw_padding ? "" : "not ")); + config.vf = vf; config.tso = ((device_attr_ex.tso_caps.max_tso > 0) && (device_attr_ex.tso_caps.supported_qpts & (1 << IBV_QPT_RAW_PACKET))); @@ -888,71 +1057,106 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) config.tso_max_payload_sz = device_attr_ex.tso_caps.max_tso; if (config.mps && !mps) { - ERROR("multi-packet send not supported on this device" - " (" MLX5_TXQ_MPW_EN ")"); + DRV_LOG(ERR, + "multi-packet send not supported on this device" + " (" MLX5_TXQ_MPW_EN ")"); err = ENOTSUP; goto port_error; } - INFO("%sMPS is %s", - config.mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", - config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); + DRV_LOG(INFO, "%s MPS is %s", + config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "", + config.mps != MLX5_MPW_DISABLED ? "enabled" : + "disabled"); if (config.cqe_comp && !cqe_comp) { - WARN("Rx CQE compression isn't supported"); + DRV_LOG(WARNING, "Rx CQE compression isn't supported"); config.cqe_comp = 0; } - err = priv_uar_init_primary(priv); - if (err) + config.mprq.enabled = config.mprq.enabled && mprq; + if (config.mprq.enabled) { + if (config.mprq.stride_num_n > mprq_max_stride_num_n || + config.mprq.stride_num_n < mprq_min_stride_num_n) { + config.mprq.stride_num_n = + RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, + mprq_min_stride_num_n); + DRV_LOG(WARNING, + "the number of strides" + " for Multi-Packet RQ is out of range," + " setting default value (%u)", + 1 << config.mprq.stride_num_n); + } + config.mprq.min_stride_size_n = mprq_min_stride_size_n; + config.mprq.max_stride_size_n = mprq_max_stride_size_n; + } + eth_dev = rte_eth_dev_allocate(name); + if (eth_dev == NULL) { + DRV_LOG(ERR, "can not allocate rte ethdev"); + err = ENOMEM; goto port_error; + } + eth_dev->data->dev_private = priv; + priv->dev_data = eth_dev->data; + eth_dev->data->mac_addrs = priv->mac; + eth_dev->device = &pci_dev->device; + rte_eth_copy_pci_info(eth_dev, pci_dev); + eth_dev->device->driver = &mlx5_driver.driver; + err = mlx5_uar_init_primary(eth_dev); + if (err) { + err = rte_errno; + goto port_error; + } /* Configure the first MAC address by default. */ - if (priv_get_mac(priv, &mac.addr_bytes)) { - ERROR("cannot get MAC address, is mlx5_en loaded?" - " (errno: %s)", strerror(errno)); + if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { + DRV_LOG(ERR, + "port %u cannot get MAC address, is mlx5_en" + " loaded? (errno: %s)", + eth_dev->data->port_id, strerror(errno)); err = ENODEV; goto port_error; } - INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", - priv->port, - mac.addr_bytes[0], mac.addr_bytes[1], - mac.addr_bytes[2], mac.addr_bytes[3], - mac.addr_bytes[4], mac.addr_bytes[5]); + DRV_LOG(INFO, + "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", + eth_dev->data->port_id, + mac.addr_bytes[0], mac.addr_bytes[1], + mac.addr_bytes[2], mac.addr_bytes[3], + mac.addr_bytes[4], mac.addr_bytes[5]); #ifndef NDEBUG { char ifname[IF_NAMESIZE]; - if (priv_get_ifname(priv, &ifname) == 0) - DEBUG("port %u ifname is \"%s\"", - priv->port, ifname); + if (mlx5_get_ifname(eth_dev, &ifname) == 0) + DRV_LOG(DEBUG, "port %u ifname is \"%s\"", + eth_dev->data->port_id, ifname); else - DEBUG("port %u ifname is unknown", priv->port); + DRV_LOG(DEBUG, "port %u ifname is unknown", + eth_dev->data->port_id); } #endif /* Get actual MTU if possible. */ - priv_get_mtu(priv, &priv->mtu); - DEBUG("port %u MTU is %u", priv->port, priv->mtu); - - eth_dev = rte_eth_dev_allocate(name); - if (eth_dev == NULL) { - ERROR("can not allocate rte ethdev"); - err = ENOMEM; + err = mlx5_get_mtu(eth_dev, &priv->mtu); + if (err) { + err = rte_errno; goto port_error; } - eth_dev->data->dev_private = priv; - eth_dev->data->mac_addrs = priv->mac; - eth_dev->device = &pci_dev->device; - rte_eth_copy_pci_info(eth_dev, pci_dev); - eth_dev->device->driver = &mlx5_driver.driver; + DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, + priv->mtu); /* * Initialize burst functions to prevent crashes before link-up. */ eth_dev->rx_pkt_burst = removed_rx_burst; eth_dev->tx_pkt_burst = removed_tx_burst; - priv->dev = eth_dev; eth_dev->dev_ops = &mlx5_dev_ops; /* Register MAC address. */ claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); + priv->nl_socket = -1; + priv->nl_sn = 0; + if (vf && config.vf_nl_en) { + priv->nl_socket = mlx5_nl_init(RTMGRP_LINK); + if (priv->nl_socket < 0) + priv->nl_socket = -1; + mlx5_nl_mac_addr_sync(eth_dev); + } TAILQ_INIT(&priv->flows); TAILQ_INIT(&priv->ctrl_flows); - /* Hint libmlx5 to use PMD allocator for data plane resources */ struct mlx5dv_ctx_allocators alctr = { .alloc = &mlx5_alloc_verbs_buf, @@ -962,14 +1166,55 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) mlx5_glue->dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS, (void *)((uintptr_t)&alctr)); - /* Bring Ethernet device up. */ - DEBUG("forcing Ethernet interface up"); - priv_set_flags(priv, ~IFF_UP, IFF_UP); + DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", + eth_dev->data->port_id); + mlx5_set_link_up(eth_dev); + /* + * Even though the interrupt handler is not installed yet, + * interrupts will still trigger on the asyn_fd from + * Verbs context returned by ibv_open_device(). + */ + mlx5_link_update(eth_dev, 0); /* Store device configuration on private structure. */ priv->config = config; + /* Create drop queue. */ + err = mlx5_flow_create_drop_queue(eth_dev); + if (err) { + DRV_LOG(ERR, "port %u drop queue allocation failed: %s", + eth_dev->data->port_id, strerror(rte_errno)); + err = rte_errno; + goto port_error; + } + /* Supported Verbs flow priority number detection. */ + if (verb_priorities == 0) + verb_priorities = mlx5_get_max_verbs_prio(eth_dev); + if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) { + DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u", + eth_dev->data->port_id, verb_priorities); + goto port_error; + } + priv->config.max_verbs_prio = verb_priorities; + /* + * Once the device is added to the list of memory event + * callback, its global MR cache table cannot be expanded + * on the fly because of deadlock. If it overflows, lookup + * should be done by searching MR list linearly, which is slow. + */ + err = mlx5_mr_btree_init(&priv->mr.cache, + MLX5_MR_BTREE_CACHE_N * 2, + eth_dev->device->numa_node); + if (err) { + err = rte_errno; + goto port_error; + } + /* Add device to memory callback list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, + priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + rte_eth_dev_probing_finish(eth_dev); continue; - port_error: if (priv) rte_free(priv); @@ -977,29 +1222,31 @@ port_error: claim_zero(mlx5_glue->dealloc_pd(pd)); if (ctx) claim_zero(mlx5_glue->close_device(ctx)); + if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_eth_dev_release_port(eth_dev); break; } - /* * XXX if something went wrong in the loop above, there is a resource * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as * long as the dpdk does not provide a way to deallocate a ethdev and a * way to enumerate the registered ethdevs to free the previous ones. */ - /* no port found, complain */ if (!mlx5_dev[idx].ports) { - err = ENODEV; - goto error; + rte_errno = ENODEV; + err = rte_errno; } - error: if (attr_ctx) claim_zero(mlx5_glue->close_device(attr_ctx)); if (list) mlx5_glue->free_device_list(list); - assert(err >= 0); - return -err; + if (err) { + rte_errno = err; + return -rte_errno; + } + return 0; } static const struct rte_pci_id mlx5_pci_id_map[] = { @@ -1036,6 +1283,10 @@ static const struct rte_pci_id mlx5_pci_id_map[] = { PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) }, { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) + }, + { .vendor_id = 0 } }; @@ -1052,11 +1303,54 @@ static struct rte_pci_driver mlx5_driver = { #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS /** + * Suffix RTE_EAL_PMD_PATH with "-glue". + * + * This function performs a sanity check on RTE_EAL_PMD_PATH before + * suffixing its last component. + * + * @param buf[out] + * Output buffer, should be large enough otherwise NULL is returned. + * @param size + * Size of @p out. + * + * @return + * Pointer to @p buf or @p NULL in case suffix cannot be appended. + */ +static char * +mlx5_glue_path(char *buf, size_t size) +{ + static const char *const bad[] = { "/", ".", "..", NULL }; + const char *path = RTE_EAL_PMD_PATH; + size_t len = strlen(path); + size_t off; + int i; + + while (len && path[len - 1] == '/') + --len; + for (off = len; off && path[off - 1] != '/'; --off) + ; + for (i = 0; bad[i]; ++i) + if (!strncmp(path + off, bad[i], (int)(len - off))) + goto error; + i = snprintf(buf, size, "%.*s-glue", (int)len, path); + if (i == -1 || (size_t)i >= size) + goto error; + return buf; +error: + DRV_LOG(ERR, + "unable to append \"-glue\" to last component of" + " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\")," + " please re-configure DPDK"); + return NULL; +} + +/** * Initialization routine for run-time dependency on rdma-core. */ static int mlx5_glue_init(void) { + char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; const char *path[] = { /* * A basic security check is necessary before trusting @@ -1064,7 +1358,13 @@ mlx5_glue_init(void) */ (geteuid() == getuid() && getegid() == getgid() ? getenv("MLX5_GLUE_PATH") : NULL), - RTE_EAL_PMD_PATH, + /* + * When RTE_EAL_PMD_PATH is set, use its glue-suffixed + * variant, otherwise let dlopen() look up libraries on its + * own. + */ + (*RTE_EAL_PMD_PATH ? + mlx5_glue_path(glue_path, sizeof(glue_path)) : ""), }; unsigned int i = 0; void *handle = NULL; @@ -1095,7 +1395,8 @@ mlx5_glue_init(void) break; if (sizeof(name) != (size_t)ret + 1) continue; - DEBUG("looking for rdma-core glue as \"%s\"", name); + DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"", + name); handle = dlopen(name, RTLD_LAZY); break; } while (1); @@ -1107,7 +1408,7 @@ mlx5_glue_init(void) rte_errno = EINVAL; dlmsg = dlerror(); if (dlmsg) - WARN("cannot load glue library: %s", dlmsg); + DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg); goto glue_error; } sym = dlsym(handle, "mlx5_glue"); @@ -1115,7 +1416,7 @@ mlx5_glue_init(void) rte_errno = EINVAL; dlmsg = dlerror(); if (dlmsg) - ERROR("cannot resolve glue symbol: %s", dlmsg); + DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg); goto glue_error; } mlx5_glue = *sym; @@ -1123,9 +1424,9 @@ mlx5_glue_init(void) glue_error: if (handle) dlclose(handle); - WARN("cannot initialize PMD due to missing run-time" - " dependency on rdma-core libraries (libibverbs," - " libmlx5)"); + DRV_LOG(WARNING, + "cannot initialize PMD due to missing run-time dependency on" + " rdma-core libraries (libibverbs, libmlx5)"); return -rte_errno; } @@ -1138,8 +1439,10 @@ RTE_INIT(rte_mlx5_pmd_init); static void rte_mlx5_pmd_init(void) { - /* Build the static table for ptype conversion. */ + /* Build the static tables for Verbs conversion. */ mlx5_set_ptype_table(); + mlx5_set_cksum_table(); + mlx5_set_swp_types_table(); /* * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use * huge pages. Calling ibv_fork_init() during init allows @@ -1165,8 +1468,9 @@ rte_mlx5_pmd_init(void) } #endif if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) { - ERROR("rdma-core glue \"%s\" mismatch: \"%s\" is required", - mlx5_glue->version, MLX5_GLUE_VERSION); + DRV_LOG(ERR, + "rdma-core glue \"%s\" mismatch: \"%s\" is required", + mlx5_glue->version, MLX5_GLUE_VERSION); return; } mlx5_glue->fork_init(); @@ -1176,3 +1480,11 @@ rte_mlx5_pmd_init(void) RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); + +/** Initialize driver log type. */ +RTE_INIT(vdev_netvsc_init_log) +{ + mlx5_logtype = rte_log_register("pmd.net.mlx5"); + if (mlx5_logtype >= 0) + rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE); +} diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 965c19f2..997b04a3 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_H_ @@ -26,12 +26,13 @@ #include <rte_pci.h> #include <rte_ether.h> #include <rte_ethdev_driver.h> -#include <rte_spinlock.h> +#include <rte_rwlock.h> #include <rte_interrupts.h> #include <rte_errno.h> #include <rte_flow.h> #include "mlx5_utils.h" +#include "mlx5_mr.h" #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" @@ -49,8 +50,19 @@ enum { PCI_DEVICE_ID_MELLANOX_CONNECTX5VF = 0x1018, PCI_DEVICE_ID_MELLANOX_CONNECTX5EX = 0x1019, PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF = 0x101a, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BF = 0xa2d2, }; +LIST_HEAD(mlx5_dev_list, priv); + +/* Shared memory between primary and secondary processes. */ +struct mlx5_shared_data { + struct mlx5_dev_list mem_event_cb_list; + rte_rwlock_t mem_event_rwlock; +}; + +extern struct mlx5_shared_data *mlx5_shared_data; + struct mlx5_xstats_ctrl { /* Number of device stats. */ uint16_t stats_n; @@ -75,19 +87,34 @@ TAILQ_HEAD(mlx5_flows, rte_flow); */ struct mlx5_dev_config { unsigned int hw_csum:1; /* Checksum offload is supported. */ - unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */ unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */ unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */ unsigned int hw_padding:1; /* End alignment padding is supported. */ - unsigned int sriov:1; /* This is a VF or PF with VF devices. */ + unsigned int vf:1; /* This is a VF. */ unsigned int mps:2; /* Multi-packet send supported mode. */ - unsigned int tunnel_en:1; /* Whether tunnel is supported. */ + unsigned int tunnel_en:1; + /* Whether tunnel stateless offloads are supported. */ + unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */ unsigned int flow_counter_en:1; /* Whether flow counter is supported. */ unsigned int cqe_comp:1; /* CQE compression is enabled. */ unsigned int tso:1; /* Whether TSO is supported. */ unsigned int tx_vec_en:1; /* Tx vector is enabled. */ unsigned int rx_vec_en:1; /* Rx vector is enabled. */ unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ + unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */ + unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */ + unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */ + struct { + unsigned int enabled:1; /* Whether MPRQ is enabled. */ + unsigned int stride_num_n; /* Number of strides. */ + unsigned int min_stride_size_n; /* Min size of a stride. */ + unsigned int max_stride_size_n; /* Max size of a stride. */ + unsigned int max_memcpy_len; + /* Maximum packet size to memcpy Rx packets. */ + unsigned int min_rxqs_num; + /* Rx queue count threshold to enable MPRQ. */ + } mprq; /* Configurations for Multi-Packet RQ. */ + unsigned int max_verbs_prio; /* Number of Verb flow priorities. */ unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */ unsigned int ind_table_max_size; /* Maximum indirection table size. */ int txq_inline; /* Maximum packet size for inlining. */ @@ -104,6 +131,9 @@ enum mlx5_verbs_alloc_type { MLX5_VERBS_ALLOC_TYPE_RX_QUEUE, }; +/* 8 Verbs priorities. */ +#define MLX5_VERBS_FLOW_PRIO_8 8 + /** * Verbs allocator needs a context to know in the callback which kind of * resources it is allocating. @@ -113,25 +143,30 @@ struct mlx5_verbs_alloc_ctx { const void *obj; /* Pointer to the DPDK object. */ }; +LIST_HEAD(mlx5_mr_list, mlx5_mr); + struct priv { - struct rte_eth_dev *dev; /* Ethernet device of master process. */ + LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */ + struct rte_eth_dev_data *dev_data; /* Pointer to device data. */ struct ibv_context *ctx; /* Verbs context. */ struct ibv_device_attr_ex device_attr; /* Device properties. */ struct ibv_pd *pd; /* Protection Domain. */ char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */ struct ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */ + BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES); + /* Bit-field of MAC addresses owned by the PMD. */ uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */ unsigned int vlan_filter_n; /* Number of configured VLAN filters. */ /* Device properties. */ uint16_t mtu; /* Configured MTU. */ uint8_t port; /* Physical port number. */ - unsigned int pending_alarm:1; /* An alarm is pending. */ unsigned int isolated:1; /* Whether isolated mode is enabled. */ /* RX/TX queues. */ unsigned int rxqs_n; /* RX queues array size. */ unsigned int txqs_n; /* TX queues array size. */ struct mlx5_rxq_data *(*rxqs)[]; /* RX queues. */ struct mlx5_txq_data *(*txqs)[]; /* TX queues. */ + struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ struct rte_eth_rss_conf rss_conf; /* RSS configuration. */ struct rte_intr_handle intr_handle; /* Interrupt handler. */ unsigned int (*reta_idx)[]; /* RETA index table. */ @@ -139,7 +174,13 @@ struct priv { struct mlx5_hrxq_drop *flow_drop_queue; /* Flow drop queue. */ struct mlx5_flows flows; /* RTE Flow rules. */ struct mlx5_flows ctrl_flows; /* Control flow rules. */ - LIST_HEAD(mr, mlx5_mr) mr; /* Memory region. */ + struct { + uint32_t dev_gen; /* Generation number to flush local caches. */ + rte_rwlock_t rwlock; /* MR Lock. */ + struct mlx5_mr_btree cache; /* Global MR cache table. */ + struct mlx5_mr_list mr_list; /* Registered MR list. */ + struct mlx5_mr_list mr_free_list; /* Freed MR list. */ + } mr; LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */ LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */ LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */ @@ -149,55 +190,18 @@ struct priv { LIST_HEAD(ind_tables, mlx5_ind_table_ibv) ind_tbls; uint32_t link_speed_capa; /* Link speed capabilities. */ struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */ - rte_spinlock_t lock; /* Lock for control functions. */ int primary_socket; /* Unix socket for primary process. */ void *uar_base; /* Reserved address space for UAR mapping */ struct rte_intr_handle intr_handle_socket; /* Interrupt handler. */ struct mlx5_dev_config config; /* Device configuration. */ struct mlx5_verbs_alloc_ctx verbs_alloc_ctx; /* Context for Verbs allocator. */ + int nl_socket; /* Netlink socket. */ + uint32_t nl_sn; /* Netlink message sequence number. */ }; -/** - * Lock private structure to protect it from concurrent access in the - * control path. - * - * @param priv - * Pointer to private structure. - */ -static inline void -priv_lock(struct priv *priv) -{ - rte_spinlock_lock(&priv->lock); -} - -/** - * Try to lock private structure to protect it from concurrent access in the - * control path. - * - * @param priv - * Pointer to private structure. - * - * @return - * 1 if the lock is successfully taken; 0 otherwise. - */ -static inline int -priv_trylock(struct priv *priv) -{ - return rte_spinlock_trylock(&priv->lock); -} - -/** - * Unlock private structure. - * - * @param priv - * Pointer to private structure. - */ -static inline void -priv_unlock(struct priv *priv) -{ - rte_spinlock_unlock(&priv->lock); -} +#define PORT_ID(priv) ((priv)->dev_data->port_id) +#define ETH_DEV(priv) (&rte_eth_devices[PORT_ID(priv)]) /* mlx5.c */ @@ -205,131 +209,148 @@ int mlx5_getenv_int(const char *); /* mlx5_ethdev.c */ -struct priv *mlx5_get_priv(struct rte_eth_dev *dev); -int mlx5_is_secondary(void); -int priv_get_ifname(const struct priv *, char (*)[IF_NAMESIZE]); -int priv_ifreq(const struct priv *, int req, struct ifreq *); -int priv_is_ib_cntr(const char *); -int priv_get_cntr_sysfs(struct priv *, const char *, uint64_t *); -int priv_get_num_vfs(struct priv *, uint16_t *); -int priv_get_mtu(struct priv *, uint16_t *); -int priv_set_flags(struct priv *, unsigned int, unsigned int); -int mlx5_dev_configure(struct rte_eth_dev *); -void mlx5_dev_infos_get(struct rte_eth_dev *, struct rte_eth_dev_info *); +int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]); +int mlx5_ifindex(const struct rte_eth_dev *dev); +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr); +int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu); +int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, + unsigned int flags); +int mlx5_dev_configure(struct rte_eth_dev *dev); +void mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info); const uint32_t *mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev); -int priv_link_update(struct priv *, int); -int priv_force_link_status_change(struct priv *, int); -int mlx5_link_update(struct rte_eth_dev *, int); -int mlx5_dev_set_mtu(struct rte_eth_dev *, uint16_t); -int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *, struct rte_eth_fc_conf *); -int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *, struct rte_eth_fc_conf *); -int mlx5_ibv_device_to_pci_addr(const struct ibv_device *, - struct rte_pci_addr *); -void mlx5_dev_link_status_handler(void *); -void mlx5_dev_interrupt_handler(void *); -void priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *); -void priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *); +int mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete); +int mlx5_force_link_status_change(struct rte_eth_dev *dev, int status); +int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu); +int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +int mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, + struct rte_pci_addr *pci_addr); +void mlx5_dev_link_status_handler(void *arg); +void mlx5_dev_interrupt_handler(void *arg); +void mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev); +void mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev); int mlx5_set_link_down(struct rte_eth_dev *dev); int mlx5_set_link_up(struct rte_eth_dev *dev); int mlx5_is_removed(struct rte_eth_dev *dev); -eth_tx_burst_t priv_select_tx_function(struct priv *, struct rte_eth_dev *); -eth_rx_burst_t priv_select_rx_function(struct priv *, struct rte_eth_dev *); +eth_tx_burst_t mlx5_select_tx_function(struct rte_eth_dev *dev); +eth_rx_burst_t mlx5_select_rx_function(struct rte_eth_dev *dev); /* mlx5_mac.c */ -int priv_get_mac(struct priv *, uint8_t (*)[ETHER_ADDR_LEN]); -void mlx5_mac_addr_remove(struct rte_eth_dev *, uint32_t); -int mlx5_mac_addr_add(struct rte_eth_dev *, struct ether_addr *, uint32_t, - uint32_t); -void mlx5_mac_addr_set(struct rte_eth_dev *, struct ether_addr *); +int mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN]); +void mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index); +int mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index, uint32_t vmdq); +int mlx5_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr); +int mlx5_set_mc_addr_list(struct rte_eth_dev *dev, + struct ether_addr *mc_addr_set, uint32_t nb_mc_addr); /* mlx5_rss.c */ -int mlx5_rss_hash_update(struct rte_eth_dev *, struct rte_eth_rss_conf *); -int mlx5_rss_hash_conf_get(struct rte_eth_dev *, struct rte_eth_rss_conf *); -int priv_rss_reta_index_resize(struct priv *, unsigned int); -int mlx5_dev_rss_reta_query(struct rte_eth_dev *, - struct rte_eth_rss_reta_entry64 *, uint16_t); -int mlx5_dev_rss_reta_update(struct rte_eth_dev *, - struct rte_eth_rss_reta_entry64 *, uint16_t); +int mlx5_rss_hash_update(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +int mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +int mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size); +int mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); +int mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); /* mlx5_rxmode.c */ -void mlx5_promiscuous_enable(struct rte_eth_dev *); -void mlx5_promiscuous_disable(struct rte_eth_dev *); -void mlx5_allmulticast_enable(struct rte_eth_dev *); -void mlx5_allmulticast_disable(struct rte_eth_dev *); +void mlx5_promiscuous_enable(struct rte_eth_dev *dev); +void mlx5_promiscuous_disable(struct rte_eth_dev *dev); +void mlx5_allmulticast_enable(struct rte_eth_dev *dev); +void mlx5_allmulticast_disable(struct rte_eth_dev *dev); /* mlx5_stats.c */ -void priv_xstats_init(struct priv *); -int mlx5_stats_get(struct rte_eth_dev *, struct rte_eth_stats *); -void mlx5_stats_reset(struct rte_eth_dev *); -int mlx5_xstats_get(struct rte_eth_dev *, - struct rte_eth_xstat *, unsigned int); -void mlx5_xstats_reset(struct rte_eth_dev *); -int mlx5_xstats_get_names(struct rte_eth_dev *, - struct rte_eth_xstat_name *, unsigned int); +void mlx5_xstats_init(struct rte_eth_dev *dev); +int mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats); +void mlx5_stats_reset(struct rte_eth_dev *dev); +int mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + unsigned int n); +void mlx5_xstats_reset(struct rte_eth_dev *dev); +int mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_xstat_name *xstats_names, + unsigned int n); /* mlx5_vlan.c */ -int mlx5_vlan_filter_set(struct rte_eth_dev *, uint16_t, int); -int mlx5_vlan_offload_set(struct rte_eth_dev *, int); -void mlx5_vlan_strip_queue_set(struct rte_eth_dev *, uint16_t, int); +int mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on); +void mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on); +int mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask); /* mlx5_trigger.c */ -int mlx5_dev_start(struct rte_eth_dev *); -void mlx5_dev_stop(struct rte_eth_dev *); -int priv_dev_traffic_enable(struct priv *, struct rte_eth_dev *); -int priv_dev_traffic_disable(struct priv *, struct rte_eth_dev *); -int priv_dev_traffic_restart(struct priv *, struct rte_eth_dev *); -int mlx5_traffic_restart(struct rte_eth_dev *); +int mlx5_dev_start(struct rte_eth_dev *dev); +void mlx5_dev_stop(struct rte_eth_dev *dev); +int mlx5_traffic_enable(struct rte_eth_dev *dev); +void mlx5_traffic_disable(struct rte_eth_dev *dev); +int mlx5_traffic_restart(struct rte_eth_dev *dev); /* mlx5_flow.c */ -int mlx5_dev_filter_ctrl(struct rte_eth_dev *, enum rte_filter_type, - enum rte_filter_op, void *); -int mlx5_flow_validate(struct rte_eth_dev *, const struct rte_flow_attr *, - const struct rte_flow_item [], - const struct rte_flow_action [], - struct rte_flow_error *); -struct rte_flow *mlx5_flow_create(struct rte_eth_dev *, - const struct rte_flow_attr *, - const struct rte_flow_item [], - const struct rte_flow_action [], - struct rte_flow_error *); -int mlx5_flow_destroy(struct rte_eth_dev *, struct rte_flow *, - struct rte_flow_error *); -void priv_flow_flush(struct priv *, struct mlx5_flows *); -int mlx5_flow_flush(struct rte_eth_dev *, struct rte_flow_error *); -int mlx5_flow_query(struct rte_eth_dev *, struct rte_flow *, - enum rte_flow_action_type, void *, - struct rte_flow_error *); -int mlx5_flow_isolate(struct rte_eth_dev *, int, struct rte_flow_error *); -int priv_flow_start(struct priv *, struct mlx5_flows *); -void priv_flow_stop(struct priv *, struct mlx5_flows *); -int priv_flow_verify(struct priv *); -int mlx5_ctrl_flow_vlan(struct rte_eth_dev *, struct rte_flow_item_eth *, - struct rte_flow_item_eth *, struct rte_flow_item_vlan *, - struct rte_flow_item_vlan *); -int mlx5_ctrl_flow(struct rte_eth_dev *, struct rte_flow_item_eth *, - struct rte_flow_item_eth *); -int priv_flow_create_drop_queue(struct priv *); -void priv_flow_delete_drop_queue(struct priv *); +unsigned int mlx5_get_max_verbs_prio(struct rte_eth_dev *dev); +int mlx5_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +struct rte_flow *mlx5_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +int mlx5_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error); +void mlx5_flow_list_flush(struct rte_eth_dev *dev, struct mlx5_flows *list); +int mlx5_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error); +int mlx5_flow_query(struct rte_eth_dev *dev, struct rte_flow *flow, + const struct rte_flow_action *action, void *data, + struct rte_flow_error *error); +int mlx5_flow_isolate(struct rte_eth_dev *dev, int enable, + struct rte_flow_error *error); +int mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg); +int mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list); +void mlx5_flow_stop(struct rte_eth_dev *dev, struct mlx5_flows *list); +int mlx5_flow_verify(struct rte_eth_dev *dev); +int mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask, + struct rte_flow_item_vlan *vlan_spec, + struct rte_flow_item_vlan *vlan_mask); +int mlx5_ctrl_flow(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask); +int mlx5_flow_create_drop_queue(struct rte_eth_dev *dev); +void mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev); /* mlx5_socket.c */ -int priv_socket_init(struct priv *priv); -int priv_socket_uninit(struct priv *priv); -void priv_socket_handle(struct priv *priv); -int priv_socket_connect(struct priv *priv); - -/* mlx5_mr.c */ - -struct mlx5_mr *priv_mr_new(struct priv *, struct rte_mempool *); -struct mlx5_mr *priv_mr_get(struct priv *, struct rte_mempool *); -int priv_mr_release(struct priv *, struct mlx5_mr *); -int priv_mr_verify(struct priv *); +int mlx5_socket_init(struct rte_eth_dev *priv); +void mlx5_socket_uninit(struct rte_eth_dev *priv); +void mlx5_socket_handle(struct rte_eth_dev *priv); +int mlx5_socket_connect(struct rte_eth_dev *priv); + +/* mlx5_nl.c */ + +int mlx5_nl_init(uint32_t nlgroups); +int mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index); +int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index); +void mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev); +void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev); +int mlx5_nl_promisc(struct rte_eth_dev *dev, int enable); +int mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable); #endif /* RTE_PMD_MLX5_H_ */ diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index c3334ca3..51124cdc 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_DEFS_H_ @@ -13,8 +13,13 @@ /* Reported driver name. */ #define MLX5_DRIVER_NAME "net_mlx5" +/* Maximum number of simultaneous unicast MAC addresses. */ +#define MLX5_MAX_UC_MAC_ADDRESSES 128 +/* Maximum number of simultaneous Multicast MAC addresses. */ +#define MLX5_MAX_MC_MAC_ADDRESSES 128 /* Maximum number of simultaneous MAC addresses. */ -#define MLX5_MAX_MAC_ADDRESSES 128 +#define MLX5_MAX_MAC_ADDRESSES \ + (MLX5_MAX_UC_MAC_ADDRESSES + MLX5_MAX_MC_MAC_ADDRESSES) /* Maximum number of simultaneous VLAN filters. */ #define MLX5_MAX_VLAN_IDS 128 @@ -32,16 +37,11 @@ */ #define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3) -/* - * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP - * from which buffers are to be transmitted will have to be mapped by this - * driver to their own Memory Region (MR). This is a slow operation. - * - * This value is always 1 for RX queues. - */ -#ifndef MLX5_PMD_TX_MP_CACHE -#define MLX5_PMD_TX_MP_CACHE 8 -#endif +/* Size of per-queue MR cache array for linear search. */ +#define MLX5_MR_CACHE_N 8 + +/* Size of MR cache table for binary search. */ +#define MLX5_MR_BTREE_CACHE_N 256 /* * If defined, only use software counters. The PMD will never ask the hardware @@ -58,7 +58,7 @@ #define MLX5_MAX_XSTATS 32 /* Maximum Packet headers size (L2+L3+L4) for TSO. */ -#define MLX5_MAX_TSO_HEADER 128 +#define MLX5_MAX_TSO_HEADER 192 /* Default minimum number of Tx queues for vectorized Tx. */ #define MLX5_VPMD_MIN_TXQS 4 @@ -82,8 +82,8 @@ /* Supported RSS */ #define MLX5_RSS_HF_MASK (~(ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP)) -/* Maximum number of attempts to query link status before giving up. */ -#define MLX5_MAX_LINK_QUERY_ATTEMPTS 5 +/* Timeout in seconds to get a valid link status. */ +#define MLX5_LINK_STATUS_TIMEOUT 10 /* Reserved address space for UAR mapping. */ #define MLX5_UAR_SIZE (1ULL << 32) @@ -95,4 +95,22 @@ */ #define MLX5_UAR_OFFSET (1ULL << 32) +/* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */ +#define MLX5_MPRQ_STRIDE_NUM_N 4U + +/* Two-byte shift is disabled for Multi-Packet RQ. */ +#define MLX5_MPRQ_TWO_BYTE_SHIFT 0 + +/* + * Minimum size of packet to be memcpy'd instead of being attached as an + * external buffer. + */ +#define MLX5_MPRQ_MEMCPY_DEFAULT_LEN 128 + +/* Minimum number Rx queues to enable Multi-Packet RQ. */ +#define MLX5_MPRQ_MIN_RXQS 12 + +/* Cache size of mempool for Multi-Packet RQ. */ +#define MLX5_MPRQ_MP_CACHE_SZ 32 + #endif /* RTE_PMD_MLX5_DEFS_H_ */ diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 66650769..90488af3 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -1,12 +1,13 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #define _GNU_SOURCE #include <stddef.h> #include <assert.h> +#include <inttypes.h> #include <unistd.h> #include <stdint.h> #include <stdio.h> @@ -17,14 +18,13 @@ #include <net/if.h> #include <sys/ioctl.h> #include <sys/socket.h> -#include <sys/utsname.h> #include <netinet/in.h> #include <linux/ethtool.h> #include <linux/sockios.h> -#include <linux/version.h> #include <fcntl.h> #include <stdalign.h> #include <sys/un.h> +#include <time.h> #include <rte_atomic.h> #include <rte_ethdev_driver.h> @@ -32,8 +32,9 @@ #include <rte_mbuf.h> #include <rte_common.h> #include <rte_interrupts.h> -#include <rte_alarm.h> #include <rte_malloc.h> +#include <rte_string_fns.h> +#include <rte_rwlock.h> #include "mlx5.h" #include "mlx5_glue.h" @@ -94,17 +95,18 @@ struct ethtool_link_settings { /** * Get interface name from private structure. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * @param[out] ifname * Interface name output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) +mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) { + struct priv *priv = dev->data->dev_private; DIR *dir; struct dirent *dent; unsigned int dev_type = 0; @@ -115,8 +117,10 @@ priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) MKSTR(path, "%s/device/net", priv->ibdev_path); dir = opendir(path); - if (dir == NULL) - return -1; + if (dir == NULL) { + rte_errno = errno; + return -rte_errno; + } } while ((dent = readdir(dir)) != NULL) { char *name = dent->d_name; @@ -163,358 +167,161 @@ try_dev_id: goto try_dev_id; dev_port_prev = dev_port; if (dev_port == (priv->port - 1u)) - snprintf(match, sizeof(match), "%s", name); + strlcpy(match, name, sizeof(match)); } closedir(dir); - if (match[0] == '\0') - return -1; + if (match[0] == '\0') { + rte_errno = ENOENT; + return -rte_errno; + } strncpy(*ifname, match, sizeof(*ifname)); return 0; } /** - * Check if the counter is located on ib counters file. + * Get the interface index from device name. * - * @param[in] cntr - * Counter name. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 1 if counter is located on ib counters file , 0 otherwise. + * Interface index on success, a negative errno value otherwise and + * rte_errno is set. */ int -priv_is_ib_cntr(const char *cntr) -{ - if (!strcmp(cntr, "out_of_buffer")) - return 1; - return 0; -} - -/** - * Read from sysfs entry. - * - * @param[in] priv - * Pointer to private structure. - * @param[in] entry - * Entry name relative to sysfs path. - * @param[out] buf - * Data output buffer. - * @param size - * Buffer size. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_sysfs_read(const struct priv *priv, const char *entry, - char *buf, size_t size) -{ - char ifname[IF_NAMESIZE]; - FILE *file; - int ret; - int err; - - if (priv_get_ifname(priv, &ifname)) - return -1; - - if (priv_is_ib_cntr(entry)) { - MKSTR(path, "%s/ports/1/hw_counters/%s", - priv->ibdev_path, entry); - file = fopen(path, "rb"); - } else { - MKSTR(path, "%s/device/net/%s/%s", - priv->ibdev_path, ifname, entry); - file = fopen(path, "rb"); - } - if (file == NULL) - return -1; - ret = fread(buf, 1, size, file); - err = errno; - if (((size_t)ret < size) && (ferror(file))) - ret = -1; - else - ret = size; - fclose(file); - errno = err; - return ret; -} - -/** - * Write to sysfs entry. - * - * @param[in] priv - * Pointer to private structure. - * @param[in] entry - * Entry name relative to sysfs path. - * @param[in] buf - * Data buffer. - * @param size - * Buffer size. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_sysfs_write(const struct priv *priv, const char *entry, - char *buf, size_t size) +mlx5_ifindex(const struct rte_eth_dev *dev) { char ifname[IF_NAMESIZE]; - FILE *file; int ret; - int err; - - if (priv_get_ifname(priv, &ifname)) - return -1; - MKSTR(path, "%s/device/net/%s/%s", priv->ibdev_path, ifname, entry); - - file = fopen(path, "wb"); - if (file == NULL) - return -1; - ret = fwrite(buf, 1, size, file); - err = errno; - if (((size_t)ret < size) || (ferror(file))) - ret = -1; - else - ret = size; - fclose(file); - errno = err; - return ret; -} - -/** - * Get unsigned long sysfs property. - * - * @param priv - * Pointer to private structure. - * @param[in] name - * Entry name relative to sysfs path. - * @param[out] value - * Value output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) -{ - int ret; - unsigned long value_ret; - char value_str[32]; - - ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); - if (ret == -1) { - DEBUG("cannot read %s value from sysfs: %s", - name, strerror(errno)); - return -1; - } - value_str[ret] = '\0'; - errno = 0; - value_ret = strtoul(value_str, NULL, 0); - if (errno) { - DEBUG("invalid %s value `%s': %s", name, value_str, - strerror(errno)); - return -1; - } - *value = value_ret; - return 0; -} - -/** - * Set unsigned long sysfs property. - * - * @param priv - * Pointer to private structure. - * @param[in] name - * Entry name relative to sysfs path. - * @param value - * Value to set. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) -{ - int ret; - MKSTR(value_str, "%lu", value); - - ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); + ret = mlx5_get_ifname(dev, &ifname); + if (ret) + return ret; + ret = if_nametoindex(ifname); if (ret == -1) { - DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", - name, value_str, value, strerror(errno)); - return -1; + rte_errno = errno; + return -rte_errno; } - return 0; + return ret; } /** * Perform ifreq ioctl() on associated Ethernet device. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * @param req * Request number to pass to ioctl(). * @param[out] ifr * Interface request structure output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) { int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); - int ret = -1; + int ret = 0; - if (sock == -1) - return ret; - if (priv_get_ifname(priv, &ifr->ifr_name) == 0) - ret = ioctl(sock, req, ifr); + if (sock == -1) { + rte_errno = errno; + return -rte_errno; + } + ret = mlx5_get_ifname(dev, &ifr->ifr_name); + if (ret) + goto error; + ret = ioctl(sock, req, ifr); + if (ret == -1) { + rte_errno = errno; + goto error; + } close(sock); - return ret; -} - -/** - * Return the number of active VFs for the current device. - * - * @param[in] priv - * Pointer to private structure. - * @param[out] num_vfs - * Number of active VFs. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -int -priv_get_num_vfs(struct priv *priv, uint16_t *num_vfs) -{ - /* The sysfs entry name depends on the operating system. */ - const char **name = (const char *[]){ - "device/sriov_numvfs", - "device/mlx5_num_vfs", - NULL, - }; - int ret; - - do { - unsigned long ulong_num_vfs; - - ret = priv_get_sysfs_ulong(priv, *name, &ulong_num_vfs); - if (!ret) - *num_vfs = ulong_num_vfs; - } while (*(++name) && ret); - return ret; + return 0; +error: + close(sock); + return -rte_errno; } /** * Get device MTU. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[out] mtu * MTU value output buffer. * * @return - * 0 on success, -1 on failure and errno is set. - */ -int -priv_get_mtu(struct priv *priv, uint16_t *mtu) -{ - unsigned long ulong_mtu; - - if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) - return -1; - *mtu = ulong_mtu; - return 0; -} - -/** - * Read device counter from sysfs. - * - * @param priv - * Pointer to private structure. - * @param name - * Counter name. - * @param[out] cntr - * Counter output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_get_cntr_sysfs(struct priv *priv, const char *name, uint64_t *cntr) +mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) { - unsigned long ulong_ctr; + struct ifreq request; + int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); - if (priv_get_sysfs_ulong(priv, name, &ulong_ctr) == -1) - return -1; - *cntr = ulong_ctr; + if (ret) + return ret; + *mtu = request.ifr_mtu; return 0; } /** * Set device MTU. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param mtu * MTU value to set. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_set_mtu(struct priv *priv, uint16_t mtu) +mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) { - uint16_t new_mtu; + struct ifreq request = { .ifr_mtu = mtu, }; - if (priv_set_sysfs_ulong(priv, "mtu", mtu) || - priv_get_mtu(priv, &new_mtu)) - return -1; - if (new_mtu == mtu) - return 0; - errno = EINVAL; - return -1; + return mlx5_ifreq(dev, SIOCSIFMTU, &request); } /** * Set device flags. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param keep * Bitmask for flags that must remain untouched. * @param flags * Bitmask for flags to modify. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) +mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) { - unsigned long tmp; + struct ifreq request; + int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); - if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) - return -1; - tmp &= keep; - tmp |= (flags & (~keep)); - return priv_set_sysfs_ulong(priv, "flags", tmp); + if (ret) + return ret; + request.ifr_flags &= keep; + request.ifr_flags |= flags & ~keep; + return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); } /** - * Ethernet device configuration. - * - * Prepare the driver for a given number of TX and RX queues. + * DPDK callback for Ethernet device configuration. * * @param dev * Pointer to Ethernet device structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -dev_configure(struct rte_eth_dev *dev) +int +mlx5_dev_configure(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; unsigned int rxqs_n = dev->data->nb_rx_queues; @@ -524,37 +331,24 @@ dev_configure(struct rte_eth_dev *dev) unsigned int reta_idx_n; const uint8_t use_app_rss_key = !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; - uint64_t supp_tx_offloads = mlx5_priv_get_tx_port_offloads(priv); - uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; - uint64_t supp_rx_offloads = - (mlx5_priv_get_rx_port_offloads(priv) | - mlx5_priv_get_rx_queue_offloads(priv)); - uint64_t rx_offloads = dev->data->dev_conf.rxmode.offloads; - - if ((tx_offloads & supp_tx_offloads) != tx_offloads) { - ERROR("Some Tx offloads are not supported " - "requested 0x%" PRIx64 " supported 0x%" PRIx64, - tx_offloads, supp_tx_offloads); - return ENOTSUP; - } - if ((rx_offloads & supp_rx_offloads) != rx_offloads) { - ERROR("Some Rx offloads are not supported " - "requested 0x%" PRIx64 " supported 0x%" PRIx64, - rx_offloads, supp_rx_offloads); - return ENOTSUP; - } + int ret = 0; + if (use_app_rss_key && (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != rss_hash_default_key_len)) { - /* MLX5 RSS only support 40bytes key. */ - return EINVAL; + DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long", + dev->data->port_id, rss_hash_default_key_len); + rte_errno = EINVAL; + return -rte_errno; } priv->rss_conf.rss_key = rte_realloc(priv->rss_conf.rss_key, rss_hash_default_key_len, 0); if (!priv->rss_conf.rss_key) { - ERROR("cannot allocate RSS hash key memory (%u)", rxqs_n); - return ENOMEM; + DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", + dev->data->port_id, rxqs_n); + rte_errno = ENOMEM; + return -rte_errno; } memcpy(priv->rss_conf.rss_key, use_app_rss_key ? @@ -566,18 +360,20 @@ dev_configure(struct rte_eth_dev *dev) priv->rxqs = (void *)dev->data->rx_queues; priv->txqs = (void *)dev->data->tx_queues; if (txqs_n != priv->txqs_n) { - INFO("%p: TX queues number update: %u -> %u", - (void *)dev, priv->txqs_n, txqs_n); + DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", + dev->data->port_id, priv->txqs_n, txqs_n); priv->txqs_n = txqs_n; } if (rxqs_n > priv->config.ind_table_max_size) { - ERROR("cannot handle this many RX queues (%u)", rxqs_n); - return EINVAL; + DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", + dev->data->port_id, rxqs_n); + rte_errno = EINVAL; + return -rte_errno; } if (rxqs_n == priv->rxqs_n) return 0; - INFO("%p: RX queues number update: %u -> %u", - (void *)dev, priv->rxqs_n, rxqs_n); + DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", + dev->data->port_id, priv->rxqs_n, rxqs_n); priv->rxqs_n = rxqs_n; /* If the requested number of RX queues is not a power of two, use the * maximum indirection table size for better balancing. @@ -585,8 +381,9 @@ dev_configure(struct rte_eth_dev *dev) reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? priv->config.ind_table_max_size : rxqs_n)); - if (priv_rss_reta_index_resize(priv, reta_idx_n)) - return ENOMEM; + ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); + if (ret) + return ret; /* When the number of RX queues is not a power of two, the remaining * table entries are padded with reused WQs and hashes are not spread * uniformly. */ @@ -599,25 +396,42 @@ dev_configure(struct rte_eth_dev *dev) } /** - * DPDK callback for Ethernet device configuration. + * Sets default tuning parameters. * * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success, negative errno value on failure. + * Pointer to Ethernet device. + * @param[out] info + * Info structure output buffer. */ -int -mlx5_dev_configure(struct rte_eth_dev *dev) +static void +mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) { struct priv *priv = dev->data->dev_private; - int ret; - priv_lock(priv); - ret = dev_configure(dev); - assert(ret >= 0); - priv_unlock(priv); - return -ret; + /* Minimum CPU utilization. */ + info->default_rxportconf.ring_size = 256; + info->default_txportconf.ring_size = 256; + info->default_rxportconf.burst_size = 64; + info->default_txportconf.burst_size = 64; + if (priv->link_speed_capa & ETH_LINK_SPEED_100G) { + info->default_rxportconf.nb_queues = 16; + info->default_txportconf.nb_queues = 16; + if (dev->data->nb_rx_queues > 2 || + dev->data->nb_tx_queues > 2) { + /* Max Throughput. */ + info->default_rxportconf.ring_size = 2048; + info->default_txportconf.ring_size = 2048; + } + } else { + info->default_rxportconf.nb_queues = 8; + info->default_txportconf.nb_queues = 8; + if (dev->data->nb_rx_queues > 2 || + dev->data->nb_tx_queues > 2) { + /* Max Throughput. */ + info->default_rxportconf.ring_size = 4096; + info->default_txportconf.ring_size = 4096; + } + } } /** @@ -636,9 +450,6 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) unsigned int max; char ifname[IF_NAMESIZE]; - info->pci_dev = RTE_ETH_DEV_TO_PCI(dev); - - priv_lock(priv); /* FIXME: we should ask the device for these values. */ info->min_rx_bufsize = 32; info->max_rx_pktlen = 65536; @@ -653,22 +464,30 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) max = 65535; info->max_rx_queues = max; info->max_tx_queues = max; - info->max_mac_addrs = RTE_DIM(priv->mac); - info->rx_queue_offload_capa = - mlx5_priv_get_rx_queue_offloads(priv); - info->rx_offload_capa = (mlx5_priv_get_rx_port_offloads(priv) | + info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; + info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); + info->rx_offload_capa = (mlx5_get_rx_port_offloads() | info->rx_queue_offload_capa); - info->tx_offload_capa = mlx5_priv_get_tx_port_offloads(priv); - if (priv_get_ifname(priv, &ifname) == 0) + info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); + if (mlx5_get_ifname(dev, &ifname) == 0) info->if_index = if_nametoindex(ifname); info->reta_size = priv->reta_idx_n ? priv->reta_idx_n : config->ind_table_max_size; - info->hash_key_size = priv->rss_conf.rss_key_len; + info->hash_key_size = rss_hash_default_key_len; info->speed_capa = priv->link_speed_capa; info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; - priv_unlock(priv); + mlx5_set_default_params(dev, info); } +/** + * Get supported packet types. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * A pointer to the supported Packet types array. + */ const uint32_t * mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) { @@ -691,6 +510,7 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) }; if (dev->rx_pkt_burst == mlx5_rx_burst || + dev->rx_pkt_burst == mlx5_rx_burst_mprq || dev->rx_pkt_burst == mlx5_rx_burst_vec) return ptypes; return NULL; @@ -701,11 +521,15 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) * * @param dev * Pointer to Ethernet device structure. - * @param wait_to_complete - * Wait for request completion (ignored). + * @param[out] link + * Storage for current link status. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) +mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, + struct rte_eth_link *link) { struct priv *priv = dev->data->dev_private; struct ethtool_cmd edata = { @@ -714,26 +538,28 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) struct ifreq ifr; struct rte_eth_link dev_link; int link_speed = 0; + int ret; - /* priv_lock() is not taken to allow concurrent calls. */ - - (void)wait_to_complete; - if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { - WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } memset(&dev_link, 0, sizeof(dev_link)); dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING)); ifr.ifr_data = (void *)&edata; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", - strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } link_speed = ethtool_cmd_speed(&edata); if (link_speed == -1) - dev_link.link_speed = 0; + dev_link.link_speed = ETH_SPEED_NUM_NONE; else dev_link.link_speed = link_speed; priv->link_speed_capa = 0; @@ -753,13 +579,13 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & ETH_LINK_SPEED_FIXED); - if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { - /* Link status changed. */ - dev->data->dev_link = dev_link; - return 0; + if ((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status)) { + rte_errno = EAGAIN; + return -rte_errno; } - /* Link status is still the same. */ - return -1; + *link = dev_link; + return 0; } /** @@ -767,31 +593,41 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) * * @param dev * Pointer to Ethernet device structure. - * @param wait_to_complete - * Wait for request completion (ignored). + * @param[out] link + * Storage for current link status. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) +mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, + struct rte_eth_link *link) + { struct priv *priv = dev->data->dev_private; struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; struct ifreq ifr; struct rte_eth_link dev_link; uint64_t sc; + int ret; - (void)wait_to_complete; - if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { - WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } memset(&dev_link, 0, sizeof(dev_link)); dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING)); ifr.ifr_data = (void *)&gcmd; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", - strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(DEBUG, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" + " failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; @@ -802,10 +638,13 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) *ecmd = gcmd; ifr.ifr_data = (void *)ecmd; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", - strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(DEBUG, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" + " failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } dev_link.link_speed = ecmd->speed; sc = ecmd->link_mode_masks[0] | @@ -849,121 +688,13 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & ETH_LINK_SPEED_FIXED); - if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { - /* Link status changed. */ - dev->data->dev_link = dev_link; - return 0; - } - /* Link status is still the same. */ - return -1; -} - -/** - * Enable receiving and transmitting traffic. - * - * @param priv - * Pointer to private structure. - */ -static void -priv_link_start(struct priv *priv) -{ - struct rte_eth_dev *dev = priv->dev; - int err; - - dev->tx_pkt_burst = priv_select_tx_function(priv, dev); - dev->rx_pkt_burst = priv_select_rx_function(priv, dev); - err = priv_dev_traffic_enable(priv, dev); - if (err) - ERROR("%p: error occurred while configuring control flows: %s", - (void *)priv, strerror(err)); - err = priv_flow_start(priv, &priv->flows); - if (err) - ERROR("%p: error occurred while configuring flows: %s", - (void *)priv, strerror(err)); -} - -/** - * Disable receiving and transmitting traffic. - * - * @param priv - * Pointer to private structure. - */ -static void -priv_link_stop(struct priv *priv) -{ - struct rte_eth_dev *dev = priv->dev; - - priv_flow_stop(priv, &priv->flows); - priv_dev_traffic_disable(priv, dev); - dev->rx_pkt_burst = removed_rx_burst; - dev->tx_pkt_burst = removed_tx_burst; -} - -/** - * Retrieve physical link information and update rx/tx_pkt_burst callbacks - * accordingly. - * - * @param priv - * Pointer to private structure. - * @param wait_to_complete - * Wait for request completion (ignored). - */ -int -priv_link_update(struct priv *priv, int wait_to_complete) -{ - struct rte_eth_dev *dev = priv->dev; - struct utsname utsname; - int ver[3]; - int ret; - struct rte_eth_link dev_link = dev->data->dev_link; - - if (uname(&utsname) == -1 || - sscanf(utsname.release, "%d.%d.%d", - &ver[0], &ver[1], &ver[2]) != 3 || - KERNEL_VERSION(ver[0], ver[1], ver[2]) < KERNEL_VERSION(4, 9, 0)) - ret = mlx5_link_update_unlocked_gset(dev, wait_to_complete); - else - ret = mlx5_link_update_unlocked_gs(dev, wait_to_complete); - /* If lsc interrupt is disabled, should always be ready for traffic. */ - if (!dev->data->dev_conf.intr_conf.lsc) { - priv_link_start(priv); - return ret; + if ((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status)) { + rte_errno = EAGAIN; + return -rte_errno; } - /* Re-select burst callbacks only if link status has been changed. */ - if (!ret && dev_link.link_status != dev->data->dev_link.link_status) { - if (dev->data->dev_link.link_status == ETH_LINK_UP) - priv_link_start(priv); - else - priv_link_stop(priv); - } - return ret; -} - -/** - * Querying the link status till it changes to the desired state. - * Number of query attempts is bounded by MLX5_MAX_LINK_QUERY_ATTEMPTS. - * - * @param priv - * Pointer to private structure. - * @param status - * Link desired status. - * - * @return - * 0 on success, negative errno value on failure. - */ -int -priv_force_link_status_change(struct priv *priv, int status) -{ - int try = 0; - - while (try < MLX5_MAX_LINK_QUERY_ATTEMPTS) { - priv_link_update(priv, 0); - if (priv->dev->data->dev_link.link_status == status) - return 0; - try++; - sleep(1); - } - return -EAGAIN; + *link = dev_link; + return 0; } /** @@ -972,17 +703,42 @@ priv_force_link_status_change(struct priv *priv, int status) * @param dev * Pointer to Ethernet device structure. * @param wait_to_complete - * Wait for request completion (ignored). + * Wait for request completion. + * + * @return + * 0 if link status was not updated, positive if it was, a negative errno + * value otherwise and rte_errno is set. */ int mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) { - struct priv *priv = dev->data->dev_private; int ret; + struct rte_eth_link dev_link; + time_t start_time = time(NULL); - priv_lock(priv); - ret = priv_link_update(priv, wait_to_complete); - priv_unlock(priv); + do { + ret = mlx5_link_update_unlocked_gs(dev, &dev_link); + if (ret) + ret = mlx5_link_update_unlocked_gset(dev, &dev_link); + if (ret == 0) + break; + /* Handle wait to complete situation. */ + if (wait_to_complete && ret == -EAGAIN) { + if (abs((int)difftime(time(NULL), start_time)) < + MLX5_LINK_STATUS_TIMEOUT) { + usleep(0); + continue; + } else { + rte_errno = EBUSY; + return -rte_errno; + } + } else if (ret < 0) { + return ret; + } + } while (wait_to_complete); + ret = !!memcmp(&dev->data->dev_link, &dev_link, + sizeof(struct rte_eth_link)); + dev->data->dev_link = dev_link; return ret; } @@ -995,39 +751,33 @@ mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) * New MTU. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) { struct priv *priv = dev->data->dev_private; - uint16_t kern_mtu; - int ret = 0; + uint16_t kern_mtu = 0; + int ret; - priv_lock(priv); - ret = priv_get_mtu(priv, &kern_mtu); + ret = mlx5_get_mtu(dev, &kern_mtu); if (ret) - goto out; + return ret; /* Set kernel interface MTU first. */ - ret = priv_set_mtu(priv, mtu); + ret = mlx5_set_mtu(dev, mtu); if (ret) - goto out; - ret = priv_get_mtu(priv, &kern_mtu); + return ret; + ret = mlx5_get_mtu(dev, &kern_mtu); if (ret) - goto out; + return ret; if (kern_mtu == mtu) { priv->mtu = mtu; - DEBUG("adapter port %u MTU set to %u", priv->port, mtu); + DRV_LOG(DEBUG, "port %u adapter MTU set to %u", + dev->data->port_id, mtu); + return 0; } - priv_unlock(priv); - return 0; -out: - ret = errno; - WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, - strerror(ret)); - priv_unlock(priv); - assert(ret >= 0); - return -ret; + rte_errno = EAGAIN; + return -rte_errno; } /** @@ -1039,12 +789,11 @@ out: * Flow control output buffer. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) { - struct priv *priv = dev->data->dev_private; struct ifreq ifr; struct ethtool_pauseparam ethpause = { .cmd = ETHTOOL_GPAUSEPARAM @@ -1052,15 +801,14 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) int ret; ifr.ifr_data = (void *)ðpause; - priv_lock(priv); - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - ret = errno; - WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" - " failed: %s", - strerror(ret)); - goto out; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" + " %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } - fc_conf->autoneg = ethpause.autoneg; if (ethpause.rx_pause && ethpause.tx_pause) fc_conf->mode = RTE_FC_FULL; @@ -1070,12 +818,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) fc_conf->mode = RTE_FC_TX_PAUSE; else fc_conf->mode = RTE_FC_NONE; - ret = 0; - -out: - priv_unlock(priv); - assert(ret >= 0); - return -ret; + return 0; } /** @@ -1087,12 +830,11 @@ out: * Flow control parameters. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) { - struct priv *priv = dev->data->dev_private; struct ifreq ifr; struct ethtool_pauseparam ethpause = { .cmd = ETHTOOL_SPAUSEPARAM @@ -1112,21 +854,15 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) ethpause.tx_pause = 1; else ethpause.tx_pause = 0; - - priv_lock(priv); - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - ret = errno; - WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" - " failed: %s", - strerror(ret)); - goto out; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" + " failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } - ret = 0; - -out: - priv_unlock(priv); - assert(ret >= 0); - return -ret; + return 0; } /** @@ -1138,7 +874,7 @@ out: * PCI bus address output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, @@ -1149,8 +885,10 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, MKSTR(path, "%s/device/uevent", device->ibdev_path); file = fopen(path, "rb"); - if (file == NULL) - return -1; + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } while (fgets(line, sizeof(line), file) == line) { size_t len = strlen(line); int ret; @@ -1180,46 +918,10 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, } /** - * Update the link status. - * - * @param priv - * Pointer to private structure. - * - * @return - * Zero if the callback process can be called immediately. - */ -static int -priv_link_status_update(struct priv *priv) -{ - struct rte_eth_link *link = &priv->dev->data->dev_link; - - priv_link_update(priv, 0); - if (((link->link_speed == 0) && link->link_status) || - ((link->link_speed != 0) && !link->link_status)) { - /* - * Inconsistent status. Event likely occurred before the - * kernel netdevice exposes the new status. - */ - if (!priv->pending_alarm) { - priv->pending_alarm = 1; - rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, - mlx5_dev_link_status_handler, - priv->dev); - } - return 1; - } else if (unlikely(priv->pending_alarm)) { - /* Link interrupt occurred while alarm is already scheduled. */ - priv->pending_alarm = 0; - rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev); - } - return 0; -} - -/** * Device status handler. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param events * Pointer to event flags holder. * @@ -1227,60 +929,37 @@ priv_link_status_update(struct priv *priv) * Events bitmap of callback process which can be called immediately. */ static uint32_t -priv_dev_status_handler(struct priv *priv) +mlx5_dev_status_handler(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct ibv_async_event event; uint32_t ret = 0; + if (mlx5_link_update(dev, 0) == -EAGAIN) { + usleep(0); + return 0; + } /* Read all message and acknowledge them. */ for (;;) { if (mlx5_glue->get_async_event(priv->ctx, &event)) break; if ((event.event_type == IBV_EVENT_PORT_ACTIVE || event.event_type == IBV_EVENT_PORT_ERR) && - (priv->dev->data->dev_conf.intr_conf.lsc == 1)) + (dev->data->dev_conf.intr_conf.lsc == 1)) ret |= (1 << RTE_ETH_EVENT_INTR_LSC); else if (event.event_type == IBV_EVENT_DEVICE_FATAL && - priv->dev->data->dev_conf.intr_conf.rmv == 1) + dev->data->dev_conf.intr_conf.rmv == 1) ret |= (1 << RTE_ETH_EVENT_INTR_RMV); else - DEBUG("event type %d on port %d not handled", - event.event_type, event.element.port_num); + DRV_LOG(DEBUG, + "port %u event type %d on not handled", + dev->data->port_id, event.event_type); mlx5_glue->ack_async_event(&event); } - if (ret & (1 << RTE_ETH_EVENT_INTR_LSC)) - if (priv_link_status_update(priv)) - ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC); return ret; } /** - * Handle delayed link status event. - * - * @param arg - * Registered argument. - */ -void -mlx5_dev_link_status_handler(void *arg) -{ - struct rte_eth_dev *dev = arg; - struct priv *priv = dev->data->dev_private; - int ret; - - while (!priv_trylock(priv)) { - /* Alarm is being canceled. */ - if (priv->pending_alarm == 0) - return; - rte_pause(); - } - priv->pending_alarm = 0; - ret = priv_link_status_update(priv); - priv_unlock(priv); - if (!ret) - _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); -} - -/** * Handle interrupts from the NIC. * * @param[in] intr_handle @@ -1292,12 +971,9 @@ void mlx5_dev_interrupt_handler(void *cb_arg) { struct rte_eth_dev *dev = cb_arg; - struct priv *priv = dev->data->dev_private; uint32_t events; - priv_lock(priv); - events = priv_dev_status_handler(priv); - priv_unlock(priv); + events = mlx5_dev_status_handler(dev); if (events & (1 << RTE_ETH_EVENT_INTR_LSC)) _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); if (events & (1 << RTE_ETH_EVENT_INTR_RMV)) @@ -1314,24 +990,21 @@ static void mlx5_dev_handler_socket(void *cb_arg) { struct rte_eth_dev *dev = cb_arg; - struct priv *priv = dev->data->dev_private; - priv_lock(priv); - priv_socket_handle(priv); - priv_unlock(priv); + mlx5_socket_handle(dev); } /** * Uninstall interrupt handler. * - * @param priv - * Pointer to private structure. * @param dev - * Pointer to the rte_eth_dev structure. + * Pointer to Ethernet device. */ void -priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) +mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; + if (dev->data->dev_conf.intr_conf.lsc || dev->data->dev_conf.intr_conf.rmv) rte_intr_callback_unregister(&priv->intr_handle, @@ -1339,10 +1012,6 @@ priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) if (priv->primary_socket) rte_intr_callback_unregister(&priv->intr_handle_socket, mlx5_dev_handler_socket, dev); - if (priv->pending_alarm) { - priv->pending_alarm = 0; - rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev); - } priv->intr_handle.fd = 0; priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; priv->intr_handle_socket.fd = 0; @@ -1352,21 +1021,24 @@ priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) /** * Install interrupt handler. * - * @param priv - * Pointer to private structure. * @param dev - * Pointer to the rte_eth_dev structure. + * Pointer to Ethernet device. */ void -priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) +mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev) { - int rc, flags; + struct priv *priv = dev->data->dev_private; + int ret; + int flags; assert(priv->ctx->async_fd > 0); flags = fcntl(priv->ctx->async_fd, F_GETFL); - rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); - if (rc < 0) { - INFO("failed to change file descriptor async event queue"); + ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + DRV_LOG(INFO, + "port %u failed to change file descriptor async event" + " queue", + dev->data->port_id); dev->data->dev_conf.intr_conf.lsc = 0; dev->data->dev_conf.intr_conf.rmv = 0; } @@ -1377,9 +1049,11 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) rte_intr_callback_register(&priv->intr_handle, mlx5_dev_interrupt_handler, dev); } - - rc = priv_socket_init(priv); - if (!rc && priv->primary_socket) { + ret = mlx5_socket_init(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot initialise socket: %s", + dev->data->port_id, strerror(rte_errno)); + else if (priv->primary_socket) { priv->intr_handle_socket.fd = priv->primary_socket; priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT; rte_intr_callback_register(&priv->intr_handle_socket, @@ -1388,41 +1062,18 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) } /** - * Change the link state (UP / DOWN). - * - * @param priv - * Pointer to private data structure. - * @param up - * Nonzero for link up, otherwise link down. - * - * @return - * 0 on success, errno value on failure. - */ -static int -priv_dev_set_link(struct priv *priv, int up) -{ - return priv_set_flags(priv, ~IFF_UP, up ? IFF_UP : ~IFF_UP); -} - -/** * DPDK callback to bring the link DOWN. * * @param dev * Pointer to Ethernet device structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_set_link_down(struct rte_eth_dev *dev) { - struct priv *priv = dev->data->dev_private; - int err; - - priv_lock(priv); - err = priv_dev_set_link(priv, 0); - priv_unlock(priv); - return err; + return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); } /** @@ -1432,63 +1083,68 @@ mlx5_set_link_down(struct rte_eth_dev *dev) * Pointer to Ethernet device structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_set_link_up(struct rte_eth_dev *dev) { - struct priv *priv = dev->data->dev_private; - int err; - - priv_lock(priv); - err = priv_dev_set_link(priv, 1); - priv_unlock(priv); - return err; + return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); } /** * Configure the TX function to use. * - * @param priv - * Pointer to private data structure. * @param dev - * Pointer to rte_eth_dev structure. + * Pointer to private data structure. * * @return * Pointer to selected Tx burst function. */ eth_tx_burst_t -priv_select_tx_function(struct priv *priv, struct rte_eth_dev *dev) +mlx5_select_tx_function(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst; struct mlx5_dev_config *config = &priv->config; uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO | - DEV_TX_OFFLOAD_GRE_TNL_TSO)); + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO)); + int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)); int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT); assert(priv != NULL); /* Select appropriate TX function. */ - if (vlan_insert || tso) + if (vlan_insert || tso || swp) return tx_pkt_burst; if (config->mps == MLX5_MPW_ENHANCED) { - if (priv_check_vec_tx_support(priv, dev) > 0) { - if (priv_check_raw_vec_tx_support(priv, dev) > 0) + if (mlx5_check_vec_tx_support(dev) > 0) { + if (mlx5_check_raw_vec_tx_support(dev) > 0) tx_pkt_burst = mlx5_tx_burst_raw_vec; else tx_pkt_burst = mlx5_tx_burst_vec; - DEBUG("selected Enhanced MPW TX vectorized function"); + DRV_LOG(DEBUG, + "port %u selected enhanced MPW Tx vectorized" + " function", + dev->data->port_id); } else { tx_pkt_burst = mlx5_tx_burst_empw; - DEBUG("selected Enhanced MPW TX function"); + DRV_LOG(DEBUG, + "port %u selected enhanced MPW Tx function", + dev->data->port_id); } } else if (config->mps && (config->txq_inline > 0)) { tx_pkt_burst = mlx5_tx_burst_mpw_inline; - DEBUG("selected MPW inline TX function"); + DRV_LOG(DEBUG, "port %u selected MPW inline Tx function", + dev->data->port_id); } else if (config->mps) { tx_pkt_burst = mlx5_tx_burst_mpw; - DEBUG("selected MPW TX function"); + DRV_LOG(DEBUG, "port %u selected MPW Tx function", + dev->data->port_id); } return tx_pkt_burst; } @@ -1496,23 +1152,24 @@ priv_select_tx_function(struct priv *priv, struct rte_eth_dev *dev) /** * Configure the RX function to use. * - * @param priv - * Pointer to private data structure. * @param dev - * Pointer to rte_eth_dev structure. + * Pointer to private data structure. * * @return * Pointer to selected Rx burst function. */ eth_rx_burst_t -priv_select_rx_function(struct priv *priv, __rte_unused struct rte_eth_dev *dev) +mlx5_select_rx_function(struct rte_eth_dev *dev) { eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; - assert(priv != NULL); - if (priv_check_vec_rx_support(priv) > 0) { + assert(dev != NULL); + if (mlx5_check_vec_rx_support(dev) > 0) { rx_pkt_burst = mlx5_rx_burst_vec; - DEBUG("selected RX vectorized function"); + DRV_LOG(DEBUG, "port %u selected Rx vectorized function", + dev->data->port_id); + } else if (mlx5_mprq_enabled(dev)) { + rx_pkt_burst = mlx5_rx_burst_mprq; } return rx_pkt_burst; } diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index 26002c4b..994be05b 100644 --- a/drivers/net/mlx5/mlx5_flow.c +++ b/drivers/net/mlx5/mlx5_flow.c @@ -1,9 +1,10 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2016 6WIND S.A. - * Copyright 2016 Mellanox. + * Copyright 2016 Mellanox Technologies, Ltd */ #include <sys/queue.h> +#include <stdint.h> #include <string.h> /* Verbs header. */ @@ -16,6 +17,9 @@ #pragma GCC diagnostic error "-Wpedantic" #endif +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_eth_ctrl.h> #include <rte_ethdev_driver.h> #include <rte_flow.h> #include <rte_flow_driver.h> @@ -27,12 +31,13 @@ #include "mlx5_prm.h" #include "mlx5_glue.h" -/* Define minimal priority for control plane flows. */ -#define MLX5_CTRL_FLOW_PRIORITY 4 +/* Flow priority for control plane flows. */ +#define MLX5_CTRL_FLOW_PRIORITY 1 /* Internet Protocol versions. */ #define MLX5_IPV4 4 #define MLX5_IPV6 6 +#define MLX5_GRE 47 #ifndef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT struct ibv_flow_spec_counter_action { @@ -44,40 +49,62 @@ struct ibv_flow_spec_counter_action { extern const struct eth_dev_ops mlx5_dev_ops; extern const struct eth_dev_ops mlx5_dev_ops_isolate; +/** Structure give to the conversion functions. */ +struct mlx5_flow_data { + struct rte_eth_dev *dev; /** Ethernet device. */ + struct mlx5_flow_parse *parser; /** Parser context. */ + struct rte_flow_error *error; /** Error context. */ +}; + static int mlx5_flow_create_eth(const struct rte_flow_item *item, const void *default_mask, - void *data); + struct mlx5_flow_data *data); static int mlx5_flow_create_vlan(const struct rte_flow_item *item, const void *default_mask, - void *data); + struct mlx5_flow_data *data); static int mlx5_flow_create_ipv4(const struct rte_flow_item *item, const void *default_mask, - void *data); + struct mlx5_flow_data *data); static int mlx5_flow_create_ipv6(const struct rte_flow_item *item, const void *default_mask, - void *data); + struct mlx5_flow_data *data); static int mlx5_flow_create_udp(const struct rte_flow_item *item, const void *default_mask, - void *data); + struct mlx5_flow_data *data); static int mlx5_flow_create_tcp(const struct rte_flow_item *item, const void *default_mask, - void *data); + struct mlx5_flow_data *data); static int mlx5_flow_create_vxlan(const struct rte_flow_item *item, const void *default_mask, - void *data); + struct mlx5_flow_data *data); + +static int +mlx5_flow_create_vxlan_gpe(const struct rte_flow_item *item, + const void *default_mask, + struct mlx5_flow_data *data); + +static int +mlx5_flow_create_gre(const struct rte_flow_item *item, + const void *default_mask, + struct mlx5_flow_data *data); + +static int +mlx5_flow_create_mpls(const struct rte_flow_item *item, + const void *default_mask, + struct mlx5_flow_data *data); struct mlx5_flow_parse; @@ -89,7 +116,7 @@ static int mlx5_flow_create_flag_mark(struct mlx5_flow_parse *parser, uint32_t mark_id); static int -mlx5_flow_create_count(struct priv *priv, struct mlx5_flow_parse *parser); +mlx5_flow_create_count(struct rte_eth_dev *dev, struct mlx5_flow_parse *parser); /* Hash RX queue types. */ enum hash_rxq_type { @@ -100,6 +127,7 @@ enum hash_rxq_type { HASH_RXQ_UDPV6, HASH_RXQ_IPV6, HASH_RXQ_ETH, + HASH_RXQ_TUNNEL, }; /* Initialization data for hash RX queue. */ @@ -206,10 +234,10 @@ struct rte_flow { TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */ uint32_t mark:1; /**< Set if the flow is marked. */ uint32_t drop:1; /**< Drop queue. */ - uint16_t queues_n; /**< Number of entries in queue[]. */ + struct rte_flow_action_rss rss_conf; /**< RSS configuration */ uint16_t (*queues)[]; /**< Queues indexes to use. */ - struct rte_eth_rss_conf rss_conf; /**< RSS configuration */ uint8_t rss_key[40]; /**< copy of the RSS key. */ + uint32_t tunnel; /**< Tunnel type of RTE_PTYPE_TUNNEL_XXX. */ struct ibv_counter_set *cs; /**< Holds the counters for the rule. */ struct mlx5_flow_counter_stats counter_stats;/**<The counter stats. */ struct mlx5_flow frxq[RTE_DIM(hash_rxq_init)]; @@ -222,6 +250,33 @@ struct rte_flow { __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \ } +#define IS_TUNNEL(type) ( \ + (type) == RTE_FLOW_ITEM_TYPE_VXLAN || \ + (type) == RTE_FLOW_ITEM_TYPE_VXLAN_GPE || \ + (type) == RTE_FLOW_ITEM_TYPE_GRE || \ + (type) == RTE_FLOW_ITEM_TYPE_MPLS) + +const uint32_t flow_ptype[] = { + [RTE_FLOW_ITEM_TYPE_VXLAN] = RTE_PTYPE_TUNNEL_VXLAN, + [RTE_FLOW_ITEM_TYPE_VXLAN_GPE] = RTE_PTYPE_TUNNEL_VXLAN_GPE, + [RTE_FLOW_ITEM_TYPE_GRE] = RTE_PTYPE_TUNNEL_GRE, + [RTE_FLOW_ITEM_TYPE_MPLS] = RTE_PTYPE_TUNNEL_MPLS_IN_GRE, +}; + +#define PTYPE_IDX(t) ((RTE_PTYPE_TUNNEL_MASK & (t)) >> 12) + +const uint32_t ptype_ext[] = { + [PTYPE_IDX(RTE_PTYPE_TUNNEL_VXLAN)] = RTE_PTYPE_TUNNEL_VXLAN | + RTE_PTYPE_L4_UDP, + [PTYPE_IDX(RTE_PTYPE_TUNNEL_VXLAN_GPE)] = RTE_PTYPE_TUNNEL_VXLAN_GPE | + RTE_PTYPE_L4_UDP, + [PTYPE_IDX(RTE_PTYPE_TUNNEL_GRE)] = RTE_PTYPE_TUNNEL_GRE, + [PTYPE_IDX(RTE_PTYPE_TUNNEL_MPLS_IN_GRE)] = + RTE_PTYPE_TUNNEL_MPLS_IN_GRE, + [PTYPE_IDX(RTE_PTYPE_TUNNEL_MPLS_IN_UDP)] = + RTE_PTYPE_TUNNEL_MPLS_IN_GRE | RTE_PTYPE_L4_UDP, +}; + /** Structure to generate a simple graph of layers supported by the NIC. */ struct mlx5_flow_items { /** List of possible actions for these items. */ @@ -247,11 +302,12 @@ struct mlx5_flow_items { * Internal structure to store the conversion. * * @return - * 0 on success, negative value otherwise. + * 0 on success, a negative errno value otherwise and rte_errno is + * set. */ int (*convert)(const struct rte_flow_item *item, const void *default_mask, - void *data); + struct mlx5_flow_data *data); /** Size in bytes of the destination structure. */ const unsigned int dst_sz; /** List of possible following items. */ @@ -274,7 +330,9 @@ static const enum rte_flow_action_type valid_actions[] = { static const struct mlx5_flow_items mlx5_flow_items[] = { [RTE_FLOW_ITEM_TYPE_END] = { .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH, - RTE_FLOW_ITEM_TYPE_VXLAN), + RTE_FLOW_ITEM_TYPE_VXLAN, + RTE_FLOW_ITEM_TYPE_VXLAN_GPE, + RTE_FLOW_ITEM_TYPE_GRE), }, [RTE_FLOW_ITEM_TYPE_ETH] = { .items = ITEMS(RTE_FLOW_ITEM_TYPE_VLAN, @@ -297,6 +355,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = { .actions = valid_actions, .mask = &(const struct rte_flow_item_vlan){ .tci = -1, + .inner_type = -1, }, .default_mask = &rte_flow_item_vlan_mask, .mask_sz = sizeof(struct rte_flow_item_vlan), @@ -305,7 +364,8 @@ static const struct mlx5_flow_items mlx5_flow_items[] = { }, [RTE_FLOW_ITEM_TYPE_IPV4] = { .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, - RTE_FLOW_ITEM_TYPE_TCP), + RTE_FLOW_ITEM_TYPE_TCP, + RTE_FLOW_ITEM_TYPE_GRE), .actions = valid_actions, .mask = &(const struct rte_flow_item_ipv4){ .hdr = { @@ -322,7 +382,8 @@ static const struct mlx5_flow_items mlx5_flow_items[] = { }, [RTE_FLOW_ITEM_TYPE_IPV6] = { .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, - RTE_FLOW_ITEM_TYPE_TCP), + RTE_FLOW_ITEM_TYPE_TCP, + RTE_FLOW_ITEM_TYPE_GRE), .actions = valid_actions, .mask = &(const struct rte_flow_item_ipv6){ .hdr = { @@ -349,7 +410,9 @@ static const struct mlx5_flow_items mlx5_flow_items[] = { .dst_sz = sizeof(struct ibv_flow_spec_ipv6), }, [RTE_FLOW_ITEM_TYPE_UDP] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_VXLAN), + .items = ITEMS(RTE_FLOW_ITEM_TYPE_VXLAN, + RTE_FLOW_ITEM_TYPE_VXLAN_GPE, + RTE_FLOW_ITEM_TYPE_MPLS), .actions = valid_actions, .mask = &(const struct rte_flow_item_udp){ .hdr = { @@ -375,8 +438,43 @@ static const struct mlx5_flow_items mlx5_flow_items[] = { .convert = mlx5_flow_create_tcp, .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), }, + [RTE_FLOW_ITEM_TYPE_GRE] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH, + RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_IPV6, + RTE_FLOW_ITEM_TYPE_MPLS), + .actions = valid_actions, + .mask = &(const struct rte_flow_item_gre){ + .protocol = -1, + }, + .default_mask = &rte_flow_item_gre_mask, + .mask_sz = sizeof(struct rte_flow_item_gre), + .convert = mlx5_flow_create_gre, +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + .dst_sz = sizeof(struct ibv_flow_spec_gre), +#else + .dst_sz = sizeof(struct ibv_flow_spec_tunnel), +#endif + }, + [RTE_FLOW_ITEM_TYPE_MPLS] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH, + RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_IPV6), + .actions = valid_actions, + .mask = &(const struct rte_flow_item_mpls){ + .label_tc_s = "\xff\xff\xf0", + }, + .default_mask = &rte_flow_item_mpls_mask, + .mask_sz = sizeof(struct rte_flow_item_mpls), + .convert = mlx5_flow_create_mpls, +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + .dst_sz = sizeof(struct ibv_flow_spec_mpls), +#endif + }, [RTE_FLOW_ITEM_TYPE_VXLAN] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH), + .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH, + RTE_FLOW_ITEM_TYPE_IPV4, /* For L3 VXLAN. */ + RTE_FLOW_ITEM_TYPE_IPV6), /* For L3 VXLAN. */ .actions = valid_actions, .mask = &(const struct rte_flow_item_vxlan){ .vni = "\xff\xff\xff", @@ -386,28 +484,43 @@ static const struct mlx5_flow_items mlx5_flow_items[] = { .convert = mlx5_flow_create_vxlan, .dst_sz = sizeof(struct ibv_flow_spec_tunnel), }, + [RTE_FLOW_ITEM_TYPE_VXLAN_GPE] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH, + RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_IPV6), + .actions = valid_actions, + .mask = &(const struct rte_flow_item_vxlan_gpe){ + .vni = "\xff\xff\xff", + }, + .default_mask = &rte_flow_item_vxlan_gpe_mask, + .mask_sz = sizeof(struct rte_flow_item_vxlan_gpe), + .convert = mlx5_flow_create_vxlan_gpe, + .dst_sz = sizeof(struct ibv_flow_spec_tunnel), + }, }; /** Structure to pass to the conversion function. */ struct mlx5_flow_parse { - uint32_t inner; /**< Set once VXLAN is encountered. */ + uint32_t inner; /**< Verbs value, set once tunnel is encountered. */ uint32_t create:1; /**< Whether resources should remain after a validate. */ uint32_t drop:1; /**< Target is a drop queue. */ uint32_t mark:1; /**< Mark is present in the flow. */ uint32_t count:1; /**< Count is present in the flow. */ uint32_t mark_id; /**< Mark identifier. */ + struct rte_flow_action_rss rss_conf; /**< RSS configuration */ uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */ - uint16_t queues_n; /**< Number of entries in queue[]. */ - struct rte_eth_rss_conf rss_conf; /**< RSS configuration */ uint8_t rss_key[40]; /**< copy of the RSS key. */ enum hash_rxq_type layer; /**< Last pattern layer detected. */ + enum hash_rxq_type out_layer; /**< Last outer pattern layer detected. */ + uint32_t tunnel; /**< Tunnel type of RTE_PTYPE_TUNNEL_XXX. */ struct ibv_counter_set *cs; /**< Holds the counter set for the rule */ struct { struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */ unsigned int offset; /**< Current position or total size of the attribute. */ + uint64_t hash_fields; /**< Verbs hash fields. */ } queue[RTE_DIM(hash_rxq_init)]; }; @@ -436,9 +549,17 @@ struct mlx5_fdir { struct rte_flow_item_ipv6 ipv6; } l3; union { + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + } l3_mask; + union { struct rte_flow_item_udp udp; struct rte_flow_item_tcp tcp; } l4; + union { + struct rte_flow_item_udp udp; + struct rte_flow_item_tcp tcp; + } l4_mask; struct rte_flow_action_queue queue; }; @@ -449,7 +570,7 @@ struct ibv_spec_header { }; /** - * Check support for a given item. + * Check item is fully supported by the NIC matching capability. * * @param item[in] * Item specification. @@ -460,121 +581,56 @@ struct ibv_spec_header { * Bit-Mask size in bytes. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_item_validate(const struct rte_flow_item *item, const uint8_t *mask, unsigned int size) { - int ret = 0; - - if (!item->spec && (item->mask || item->last)) - return -1; - if (item->spec && !item->mask) { - unsigned int i; - const uint8_t *spec = item->spec; - - for (i = 0; i < size; ++i) - if ((spec[i] | mask[i]) != mask[i]) - return -1; - } - if (item->last && !item->mask) { - unsigned int i; - const uint8_t *spec = item->last; - - for (i = 0; i < size; ++i) - if ((spec[i] | mask[i]) != mask[i]) - return -1; - } - if (item->mask) { - unsigned int i; - const uint8_t *spec = item->spec; - - for (i = 0; i < size; ++i) - if ((spec[i] | mask[i]) != mask[i]) - return -1; - } - if (item->spec && item->last) { - uint8_t spec[size]; - uint8_t last[size]; - const uint8_t *apply = mask; - unsigned int i; - - if (item->mask) - apply = item->mask; - for (i = 0; i < size; ++i) { - spec[i] = ((const uint8_t *)item->spec)[i] & apply[i]; - last[i] = ((const uint8_t *)item->last)[i] & apply[i]; - } - ret = memcmp(spec, last, size); - } - return ret; -} + unsigned int i; + const uint8_t *spec = item->spec; + const uint8_t *last = item->last; + const uint8_t *m = item->mask ? item->mask : mask; -/** - * Copy the RSS configuration from the user ones, of the rss_conf is null, - * uses the driver one. - * - * @param priv - * Pointer to private structure. - * @param parser - * Internal parser structure. - * @param rss_conf - * User RSS configuration to save. - * - * @return - * 0 on success, errno value on failure. - */ -static int -priv_flow_convert_rss_conf(struct priv *priv, - struct mlx5_flow_parse *parser, - const struct rte_eth_rss_conf *rss_conf) -{ + if (!spec && (item->mask || last)) + goto error; + if (!spec) + return 0; /* - * This function is also called at the beginning of - * priv_flow_convert_actions() to initialize the parser with the - * device default RSS configuration. + * Single-pass check to make sure that: + * - item->mask is supported, no bits are set outside mask. + * - Both masked item->spec and item->last are equal (no range + * supported). */ - (void)priv; - if (rss_conf) { - if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) - return EINVAL; - if (rss_conf->rss_key_len != 40) - return EINVAL; - if (rss_conf->rss_key_len && rss_conf->rss_key) { - parser->rss_conf.rss_key_len = rss_conf->rss_key_len; - memcpy(parser->rss_key, rss_conf->rss_key, - rss_conf->rss_key_len); - parser->rss_conf.rss_key = parser->rss_key; - } - parser->rss_conf.rss_hf = rss_conf->rss_hf; + for (i = 0; i < size; i++) { + if (!m[i]) + continue; + if ((m[i] | mask[i]) != mask[i]) + goto error; + if (last && ((spec[i] & m[i]) != (last[i] & m[i]))) + goto error; } return 0; +error: + rte_errno = ENOTSUP; + return -rte_errno; } /** * Extract attribute to the parser. * - * @param priv - * Pointer to private structure. * @param[in] attr * Flow rule attributes. * @param[out] error * Perform verbose error reporting if not NULL. - * @param[in, out] parser - * Internal parser structure. * * @return * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_convert_attributes(struct priv *priv, - const struct rte_flow_attr *attr, - struct rte_flow_error *error, - struct mlx5_flow_parse *parser) +mlx5_flow_convert_attributes(const struct rte_flow_attr *attr, + struct rte_flow_error *error) { - (void)priv; - (void)parser; if (attr->group) { rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_GROUP, @@ -596,6 +652,13 @@ priv_flow_convert_attributes(struct priv *priv, "egress is not supported"); return -rte_errno; } + if (attr->transfer) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, + NULL, + "transfer is not supported"); + return -rte_errno; + } if (!attr->ingress) { rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, @@ -609,8 +672,8 @@ priv_flow_convert_attributes(struct priv *priv, /** * Extract actions request to the parser. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[in] actions * Associated actions (list terminated by the END action). * @param[out] error @@ -622,83 +685,115 @@ priv_flow_convert_attributes(struct priv *priv, * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_convert_actions(struct priv *priv, +mlx5_flow_convert_actions(struct rte_eth_dev *dev, const struct rte_flow_action actions[], struct rte_flow_error *error, struct mlx5_flow_parse *parser) { - /* - * Add default RSS configuration necessary for Verbs to create QP even - * if no RSS is necessary. - */ - priv_flow_convert_rss_conf(priv, parser, - (const struct rte_eth_rss_conf *) - &priv->rss_conf); + enum { FATE = 1, MARK = 2, COUNT = 4, }; + uint32_t overlap = 0; + struct priv *priv = dev->data->dev_private; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { continue; } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { + if (overlap & FATE) + goto exit_action_overlap; + overlap |= FATE; parser->drop = 1; } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { const struct rte_flow_action_queue *queue = (const struct rte_flow_action_queue *) actions->conf; - uint16_t n; - uint16_t found = 0; + if (overlap & FATE) + goto exit_action_overlap; + overlap |= FATE; if (!queue || (queue->index > (priv->rxqs_n - 1))) goto exit_action_not_supported; - for (n = 0; n < parser->queues_n; ++n) { - if (parser->queues[n] == queue->index) { - found = 1; - break; - } - } - if (parser->queues_n > 1 && !found) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "queue action not in RSS queues"); - return -rte_errno; - } - if (!found) { - parser->queues_n = 1; - parser->queues[0] = queue->index; - } + parser->queues[0] = queue->index; + parser->rss_conf = (struct rte_flow_action_rss){ + .queue_num = 1, + .queue = parser->queues, + }; } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) { const struct rte_flow_action_rss *rss = (const struct rte_flow_action_rss *) actions->conf; + const uint8_t *rss_key; + uint32_t rss_key_len; uint16_t n; - if (!rss || !rss->num) { + if (overlap & FATE) + goto exit_action_overlap; + overlap |= FATE; + if (rss->func && + rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "the only supported RSS hash" + " function is Toeplitz"); + return -rte_errno; + } +#ifndef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (parser->rss_conf.level > 1) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "a nonzero RSS encapsulation" + " level is not supported"); + return -rte_errno; + } +#endif + if (parser->rss_conf.level > 2) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "RSS encapsulation level" + " > 1 is not supported"); + return -rte_errno; + } + if (rss->types & MLX5_RSS_HF_MASK) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "unsupported RSS type" + " requested"); + return -rte_errno; + } + if (rss->key_len) { + rss_key_len = rss->key_len; + rss_key = rss->key; + } else { + rss_key_len = rss_hash_default_key_len; + rss_key = rss_hash_default_key; + } + if (rss_key_len != RTE_DIM(parser->rss_key)) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "RSS hash key must be" + " exactly 40 bytes long"); + return -rte_errno; + } + if (!rss->queue_num) { rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, actions, "no valid queues"); return -rte_errno; } - if (parser->queues_n == 1) { - uint16_t found = 0; - - assert(parser->queues_n); - for (n = 0; n < rss->num; ++n) { - if (parser->queues[0] == - rss->queue[n]) { - found = 1; - break; - } - } - if (!found) { - rte_flow_error_set(error, ENOTSUP, + if (rss->queue_num > RTE_DIM(parser->queues)) { + rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, actions, - "queue action not in RSS" - " queues"); - return -rte_errno; - } + "too many queues for RSS" + " context"); + return -rte_errno; } - for (n = 0; n < rss->num; ++n) { + for (n = 0; n < rss->queue_num; ++n) { if (rss->queue[n] >= priv->rxqs_n) { rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, @@ -708,22 +803,26 @@ priv_flow_convert_actions(struct priv *priv, return -rte_errno; } } - for (n = 0; n < rss->num; ++n) - parser->queues[n] = rss->queue[n]; - parser->queues_n = rss->num; - if (priv_flow_convert_rss_conf(priv, parser, - rss->rss_conf)) { - rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "wrong RSS configuration"); - return -rte_errno; - } + parser->rss_conf = (struct rte_flow_action_rss){ + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .level = rss->level ? rss->level : 1, + .types = rss->types, + .key_len = rss_key_len, + .queue_num = rss->queue_num, + .key = memcpy(parser->rss_key, rss_key, + sizeof(*rss_key) * rss_key_len), + .queue = memcpy(parser->queues, rss->queue, + sizeof(*rss->queue) * + rss->queue_num), + }; } else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) { const struct rte_flow_action_mark *mark = (const struct rte_flow_action_mark *) actions->conf; + if (overlap & MARK) + goto exit_action_overlap; + overlap |= MARK; if (!mark) { rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, @@ -741,17 +840,26 @@ priv_flow_convert_actions(struct priv *priv, parser->mark = 1; parser->mark_id = mark->id; } else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) { + if (overlap & MARK) + goto exit_action_overlap; + overlap |= MARK; parser->mark = 1; } else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT && priv->config.flow_counter_en) { + if (overlap & COUNT) + goto exit_action_overlap; + overlap |= COUNT; parser->count = 1; } else { goto exit_action_not_supported; } } + /* When fate is unknown, drop traffic. */ + if (!(overlap & FATE)) + parser->drop = 1; if (parser->drop && parser->mark) parser->mark = 0; - if (!parser->queues_n && !parser->drop) { + if (!parser->rss_conf.queue_num && !parser->drop) { rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, "no valid action"); return -rte_errno; @@ -761,13 +869,15 @@ exit_action_not_supported: rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, actions, "action not supported"); return -rte_errno; +exit_action_overlap: + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + actions, "overlapping actions are not supported"); + return -rte_errno; } /** * Validate items. * - * @param priv - * Pointer to private structure. * @param[in] items * Pattern specification (list terminated by the END pattern item). * @param[out] error @@ -779,25 +889,28 @@ exit_action_not_supported: * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_convert_items_validate(struct priv *priv, +mlx5_flow_convert_items_validate(struct rte_eth_dev *dev, const struct rte_flow_item items[], struct rte_flow_error *error, struct mlx5_flow_parse *parser) { + struct priv *priv = dev->data->dev_private; const struct mlx5_flow_items *cur_item = mlx5_flow_items; unsigned int i; + unsigned int last_voids = 0; + int ret = 0; - (void)priv; /* Initialise the offsets to start after verbs attribute. */ for (i = 0; i != hash_rxq_init_n; ++i) parser->queue[i].offset = sizeof(struct ibv_flow_attr); for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) { const struct mlx5_flow_items *token = NULL; unsigned int n; - int err; - if (items->type == RTE_FLOW_ITEM_TYPE_VOID) + if (items->type == RTE_FLOW_ITEM_TYPE_VOID) { + last_voids++; continue; + } for (i = 0; cur_item->items && cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END; @@ -807,24 +920,48 @@ priv_flow_convert_items_validate(struct priv *priv, break; } } - if (!token) + if (!token) { + ret = -ENOTSUP; goto exit_item_not_supported; + } cur_item = token; - err = mlx5_flow_item_validate(items, + ret = mlx5_flow_item_validate(items, (const uint8_t *)cur_item->mask, cur_item->mask_sz); - if (err) + if (ret) goto exit_item_not_supported; - if (items->type == RTE_FLOW_ITEM_TYPE_VXLAN) { - if (parser->inner) { + if (IS_TUNNEL(items->type)) { + if (parser->tunnel && + !((items - last_voids - 1)->type == + RTE_FLOW_ITEM_TYPE_GRE && items->type == + RTE_FLOW_ITEM_TYPE_MPLS)) { rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, items, - "cannot recognize multiple" - " VXLAN encapsulations"); + "Cannot recognize multiple" + " tunnel encapsulations."); + return -rte_errno; + } + if (items->type == RTE_FLOW_ITEM_TYPE_MPLS && + !priv->config.mpls_en) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "MPLS not supported or" + " disabled in firmware" + " configuration."); + return -rte_errno; + } + if (!priv->config.tunnel_en && + parser->rss_conf.level > 1) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "RSS on tunnel is not supported"); return -rte_errno; } parser->inner = IBV_FLOW_SPEC_INNER; + parser->tunnel = flow_ptype[items->type]; } if (parser->drop) { parser->queue[HASH_RXQ_ETH].offset += cur_item->dst_sz; @@ -832,6 +969,7 @@ priv_flow_convert_items_validate(struct priv *priv, for (n = 0; n != hash_rxq_init_n; ++n) parser->queue[n].offset += cur_item->dst_sz; } + last_voids = 0; } if (parser->drop) { parser->queue[HASH_RXQ_ETH].offset += @@ -850,102 +988,103 @@ priv_flow_convert_items_validate(struct priv *priv, } return 0; exit_item_not_supported: - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, - items, "item not supported"); - return -rte_errno; + return rte_flow_error_set(error, -ret, RTE_FLOW_ERROR_TYPE_ITEM, + items, "item not supported"); } /** * Allocate memory space to store verbs flow attributes. * - * @param priv - * Pointer to private structure. - * @param[in] priority - * Flow priority. * @param[in] size * Amount of byte to allocate. * @param[out] error * Perform verbose error reporting if not NULL. * * @return - * A verbs flow attribute on success, NULL otherwise. + * A verbs flow attribute on success, NULL otherwise and rte_errno is set. */ -static struct ibv_flow_attr* -priv_flow_convert_allocate(struct priv *priv, - unsigned int priority, - unsigned int size, - struct rte_flow_error *error) +static struct ibv_flow_attr * +mlx5_flow_convert_allocate(unsigned int size, struct rte_flow_error *error) { struct ibv_flow_attr *ibv_attr; - (void)priv; ibv_attr = rte_calloc(__func__, 1, size, 0); if (!ibv_attr) { rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "cannot allocate verbs spec attributes."); + "cannot allocate verbs spec attributes"); return NULL; } - ibv_attr->priority = priority; return ibv_attr; } /** - * Finalise verbs flow attributes. + * Make inner packet matching with an higher priority from the non Inner + * matching. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[in, out] parser * Internal parser structure. + * @param attr + * User flow attribute. */ static void -priv_flow_convert_finalise(struct priv *priv, struct mlx5_flow_parse *parser) +mlx5_flow_update_priority(struct rte_eth_dev *dev, + struct mlx5_flow_parse *parser, + const struct rte_flow_attr *attr) { - const unsigned int ipv4 = - hash_rxq_init[parser->layer].ip_version == MLX5_IPV4; - const enum hash_rxq_type hmin = ipv4 ? HASH_RXQ_TCPV4 : HASH_RXQ_TCPV6; - const enum hash_rxq_type hmax = ipv4 ? HASH_RXQ_IPV4 : HASH_RXQ_IPV6; - const enum hash_rxq_type ohmin = ipv4 ? HASH_RXQ_TCPV6 : HASH_RXQ_TCPV4; - const enum hash_rxq_type ohmax = ipv4 ? HASH_RXQ_IPV6 : HASH_RXQ_IPV4; - const enum hash_rxq_type ip = ipv4 ? HASH_RXQ_IPV4 : HASH_RXQ_IPV6; + struct priv *priv = dev->data->dev_private; unsigned int i; + uint16_t priority; - (void)priv; - if (parser->layer == HASH_RXQ_ETH) { - goto fill; - } else { - /* - * This layer becomes useless as the pattern define under - * layers. - */ - rte_free(parser->queue[HASH_RXQ_ETH].ibv_attr); - parser->queue[HASH_RXQ_ETH].ibv_attr = NULL; + /* 8 priorities >= 16 priorities + * Control flow: 4-7 8-15 + * User normal flow: 1-3 4-7 + * User tunnel flow: 0-2 0-3 + */ + priority = attr->priority * MLX5_VERBS_FLOW_PRIO_8; + if (priv->config.max_verbs_prio == MLX5_VERBS_FLOW_PRIO_8) + priority /= 2; + /* + * Lower non-tunnel flow Verbs priority 1 if only support 8 Verbs + * priorities, lower 4 otherwise. + */ + if (!parser->inner) { + if (priv->config.max_verbs_prio == MLX5_VERBS_FLOW_PRIO_8) + priority += 1; + else + priority += MLX5_VERBS_FLOW_PRIO_8 / 2; + } + if (parser->drop) { + parser->queue[HASH_RXQ_ETH].ibv_attr->priority = priority + + hash_rxq_init[HASH_RXQ_ETH].flow_priority; + return; } - /* Remove opposite kind of layer e.g. IPv6 if the pattern is IPv4. */ - for (i = ohmin; i != (ohmax + 1); ++i) { + for (i = 0; i != hash_rxq_init_n; ++i) { if (!parser->queue[i].ibv_attr) continue; - rte_free(parser->queue[i].ibv_attr); - parser->queue[i].ibv_attr = NULL; + parser->queue[i].ibv_attr->priority = priority + + hash_rxq_init[i].flow_priority; } - /* Remove impossible flow according to the RSS configuration. */ - if (hash_rxq_init[parser->layer].dpdk_rss_hf & - parser->rss_conf.rss_hf) { - /* Remove any other flow. */ - for (i = hmin; i != (hmax + 1); ++i) { - if ((i == parser->layer) || - (!parser->queue[i].ibv_attr)) - continue; - rte_free(parser->queue[i].ibv_attr); - parser->queue[i].ibv_attr = NULL; - } - } else if (!parser->queue[ip].ibv_attr) { - /* no RSS possible with the current configuration. */ - parser->queues_n = 1; +} + +/** + * Finalise verbs flow attributes. + * + * @param[in, out] parser + * Internal parser structure. + */ +static void +mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser) +{ + unsigned int i; + uint32_t inner = parser->inner; + + /* Don't create extra flows for outer RSS. */ + if (parser->tunnel && parser->rss_conf.level < 2) return; - } -fill: /* * Fill missing layers in verbs specifications, or compute the correct * offset to allocate the memory space for the attributes and @@ -956,23 +1095,25 @@ fill: struct ibv_flow_spec_ipv4_ext ipv4; struct ibv_flow_spec_ipv6 ipv6; struct ibv_flow_spec_tcp_udp udp_tcp; + struct ibv_flow_spec_eth eth; } specs; void *dst; uint16_t size; if (i == parser->layer) continue; - if (parser->layer == HASH_RXQ_ETH) { + if (parser->layer == HASH_RXQ_ETH || + parser->layer == HASH_RXQ_TUNNEL) { if (hash_rxq_init[i].ip_version == MLX5_IPV4) { size = sizeof(struct ibv_flow_spec_ipv4_ext); specs.ipv4 = (struct ibv_flow_spec_ipv4_ext){ - .type = IBV_FLOW_SPEC_IPV4_EXT, + .type = inner | IBV_FLOW_SPEC_IPV4_EXT, .size = size, }; } else { size = sizeof(struct ibv_flow_spec_ipv6); specs.ipv6 = (struct ibv_flow_spec_ipv6){ - .type = IBV_FLOW_SPEC_IPV6, + .type = inner | IBV_FLOW_SPEC_IPV6, .size = size, }; } @@ -989,7 +1130,7 @@ fill: (i == HASH_RXQ_UDPV6) || (i == HASH_RXQ_TCPV6)) { size = sizeof(struct ibv_flow_spec_tcp_udp); specs.udp_tcp = (struct ibv_flow_spec_tcp_udp) { - .type = ((i == HASH_RXQ_UDPV4 || + .type = inner | ((i == HASH_RXQ_UDPV4 || i == HASH_RXQ_UDPV6) ? IBV_FLOW_SPEC_UDP : IBV_FLOW_SPEC_TCP), @@ -1008,10 +1149,110 @@ fill: } /** + * Update flows according to pattern and RSS hash fields. + * + * @param[in, out] parser + * Internal parser structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_convert_rss(struct mlx5_flow_parse *parser) +{ + unsigned int i; + enum hash_rxq_type start; + enum hash_rxq_type layer; + int outer = parser->tunnel && parser->rss_conf.level < 2; + uint64_t rss = parser->rss_conf.types; + + layer = outer ? parser->out_layer : parser->layer; + if (layer == HASH_RXQ_TUNNEL) + layer = HASH_RXQ_ETH; + if (outer) { + /* Only one hash type for outer RSS. */ + if (rss && layer == HASH_RXQ_ETH) { + start = HASH_RXQ_TCPV4; + } else if (rss && layer != HASH_RXQ_ETH && + !(rss & hash_rxq_init[layer].dpdk_rss_hf)) { + /* If RSS not match L4 pattern, try L3 RSS. */ + if (layer < HASH_RXQ_IPV4) + layer = HASH_RXQ_IPV4; + else if (layer > HASH_RXQ_IPV4 && layer < HASH_RXQ_IPV6) + layer = HASH_RXQ_IPV6; + start = layer; + } else { + start = layer; + } + /* Scan first valid hash type. */ + for (i = start; rss && i <= layer; ++i) { + if (!parser->queue[i].ibv_attr) + continue; + if (hash_rxq_init[i].dpdk_rss_hf & rss) + break; + } + if (rss && i <= layer) + parser->queue[layer].hash_fields = + hash_rxq_init[i].hash_fields; + /* Trim unused hash types. */ + for (i = 0; i != hash_rxq_init_n; ++i) { + if (parser->queue[i].ibv_attr && i != layer) { + rte_free(parser->queue[i].ibv_attr); + parser->queue[i].ibv_attr = NULL; + } + } + } else { + /* Expand for inner or normal RSS. */ + if (rss && (layer == HASH_RXQ_ETH || layer == HASH_RXQ_IPV4)) + start = HASH_RXQ_TCPV4; + else if (rss && layer == HASH_RXQ_IPV6) + start = HASH_RXQ_TCPV6; + else + start = layer; + /* For L4 pattern, try L3 RSS if no L4 RSS. */ + /* Trim unused hash types. */ + for (i = 0; i != hash_rxq_init_n; ++i) { + if (!parser->queue[i].ibv_attr) + continue; + if (i < start || i > layer) { + rte_free(parser->queue[i].ibv_attr); + parser->queue[i].ibv_attr = NULL; + continue; + } + if (!rss) + continue; + if (hash_rxq_init[i].dpdk_rss_hf & rss) { + parser->queue[i].hash_fields = + hash_rxq_init[i].hash_fields; + } else if (i != layer) { + /* Remove unused RSS expansion. */ + rte_free(parser->queue[i].ibv_attr); + parser->queue[i].ibv_attr = NULL; + } else if (layer < HASH_RXQ_IPV4 && + (hash_rxq_init[HASH_RXQ_IPV4].dpdk_rss_hf & + rss)) { + /* Allow IPv4 RSS on L4 pattern. */ + parser->queue[i].hash_fields = + hash_rxq_init[HASH_RXQ_IPV4] + .hash_fields; + } else if (i > HASH_RXQ_IPV4 && i < HASH_RXQ_IPV6 && + (hash_rxq_init[HASH_RXQ_IPV6].dpdk_rss_hf & + rss)) { + /* Allow IPv4 RSS on L4 pattern. */ + parser->queue[i].hash_fields = + hash_rxq_init[HASH_RXQ_IPV6] + .hash_fields; + } + } + } + return 0; +} + +/** * Validate and convert a flow supported by the NIC. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[in] attr * Flow rule attributes. * @param[in] pattern @@ -1027,7 +1268,7 @@ fill: * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_convert(struct priv *priv, +mlx5_flow_convert(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, const struct rte_flow_item items[], const struct rte_flow_action actions[], @@ -1044,48 +1285,36 @@ priv_flow_convert(struct priv *priv, .layer = HASH_RXQ_ETH, .mark_id = MLX5_FLOW_MARK_DEFAULT, }; - ret = priv_flow_convert_attributes(priv, attr, error, parser); + ret = mlx5_flow_convert_attributes(attr, error); if (ret) return ret; - ret = priv_flow_convert_actions(priv, actions, error, parser); + ret = mlx5_flow_convert_actions(dev, actions, error, parser); if (ret) return ret; - ret = priv_flow_convert_items_validate(priv, items, error, parser); + ret = mlx5_flow_convert_items_validate(dev, items, error, parser); if (ret) return ret; - priv_flow_convert_finalise(priv, parser); + mlx5_flow_convert_finalise(parser); /* * Second step. * Allocate the memory space to store verbs specifications. */ if (parser->drop) { - unsigned int priority = - attr->priority + - hash_rxq_init[HASH_RXQ_ETH].flow_priority; unsigned int offset = parser->queue[HASH_RXQ_ETH].offset; parser->queue[HASH_RXQ_ETH].ibv_attr = - priv_flow_convert_allocate(priv, priority, - offset, error); + mlx5_flow_convert_allocate(offset, error); if (!parser->queue[HASH_RXQ_ETH].ibv_attr) - return ENOMEM; + goto exit_enomem; parser->queue[HASH_RXQ_ETH].offset = sizeof(struct ibv_flow_attr); } else { for (i = 0; i != hash_rxq_init_n; ++i) { - unsigned int priority = - attr->priority + - hash_rxq_init[i].flow_priority; unsigned int offset; - if (!(parser->rss_conf.rss_hf & - hash_rxq_init[i].dpdk_rss_hf) && - (i != HASH_RXQ_ETH)) - continue; offset = parser->queue[i].offset; parser->queue[i].ibv_attr = - priv_flow_convert_allocate(priv, priority, - offset, error); + mlx5_flow_convert_allocate(offset, error); if (!parser->queue[i].ibv_attr) goto exit_enomem; parser->queue[i].offset = sizeof(struct ibv_flow_attr); @@ -1093,7 +1322,15 @@ priv_flow_convert(struct priv *priv, } /* Third step. Conversion parse, fill the specifications. */ parser->inner = 0; + parser->tunnel = 0; + parser->layer = HASH_RXQ_ETH; for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) { + struct mlx5_flow_data data = { + .dev = dev, + .parser = parser, + .error = error, + }; + if (items->type == RTE_FLOW_ITEM_TYPE_VOID) continue; cur_item = &mlx5_flow_items[items->type]; @@ -1101,32 +1338,26 @@ priv_flow_convert(struct priv *priv, (cur_item->default_mask ? cur_item->default_mask : cur_item->mask), - parser); - if (ret) { - rte_flow_error_set(error, ret, - RTE_FLOW_ERROR_TYPE_ITEM, - items, "item not supported"); + &data); + if (ret) goto exit_free; - } } + if (!parser->drop) { + /* RSS check, remove unused hash types. */ + ret = mlx5_flow_convert_rss(parser); + if (ret) + goto exit_free; + /* Complete missing specification. */ + mlx5_flow_convert_finalise(parser); + } + mlx5_flow_update_priority(dev, parser, attr); if (parser->mark) mlx5_flow_create_flag_mark(parser, parser->mark_id); if (parser->count && parser->create) { - mlx5_flow_create_count(priv, parser); + mlx5_flow_create_count(dev, parser); if (!parser->cs) goto exit_count_error; } - /* - * Last step. Complete missing specification to reach the RSS - * configuration. - */ - if (!parser->drop) { - priv_flow_convert_finalise(priv, parser); - } else { - parser->queue[HASH_RXQ_ETH].ibv_attr->priority = - attr->priority + - hash_rxq_init[parser->layer].flow_priority; - } exit_free: /* Only verification is expected, all resources should be released. */ if (!parser->create) { @@ -1145,13 +1376,13 @@ exit_enomem: parser->queue[i].ibv_attr = NULL; } } - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "cannot allocate verbs spec attributes."); - return ret; + rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot allocate verbs spec attributes"); + return -rte_errno; exit_count_error: rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "cannot create counter."); - return rte_errno; + NULL, "cannot create counter"); + return -rte_errno; } /** @@ -1174,17 +1405,11 @@ mlx5_flow_create_copy(struct mlx5_flow_parse *parser, void *src, for (i = 0; i != hash_rxq_init_n; ++i) { if (!parser->queue[i].ibv_attr) continue; - /* Specification must be the same l3 type or none. */ - if (parser->layer == HASH_RXQ_ETH || - (hash_rxq_init[parser->layer].ip_version == - hash_rxq_init[i].ip_version) || - (hash_rxq_init[i].ip_version == 0)) { - dst = (void *)((uintptr_t)parser->queue[i].ibv_attr + - parser->queue[i].offset); - memcpy(dst, src, size); - ++parser->queue[i].ibv_attr->num_of_specs; - parser->queue[i].offset += size; - } + dst = (void *)((uintptr_t)parser->queue[i].ibv_attr + + parser->queue[i].offset); + memcpy(dst, src, size); + ++parser->queue[i].ibv_attr->num_of_specs; + parser->queue[i].offset += size; } } @@ -1197,24 +1422,25 @@ mlx5_flow_create_copy(struct mlx5_flow_parse *parser, void *src, * Default bit-masks to use when item->mask is not provided. * @param data[in, out] * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_create_eth(const struct rte_flow_item *item, const void *default_mask, - void *data) + struct mlx5_flow_data *data) { const struct rte_flow_item_eth *spec = item->spec; const struct rte_flow_item_eth *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; + struct mlx5_flow_parse *parser = data->parser; const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth); struct ibv_flow_spec_eth eth = { .type = parser->inner | IBV_FLOW_SPEC_ETH, .size = eth_size, }; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) - parser->layer = HASH_RXQ_ETH; + parser->layer = HASH_RXQ_ETH; if (spec) { unsigned int i; @@ -1246,17 +1472,21 @@ mlx5_flow_create_eth(const struct rte_flow_item *item, * Default bit-masks to use when item->mask is not provided. * @param data[in, out] * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_create_vlan(const struct rte_flow_item *item, const void *default_mask, - void *data) + struct mlx5_flow_data *data) { const struct rte_flow_item_vlan *spec = item->spec; const struct rte_flow_item_vlan *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; + struct mlx5_flow_parse *parser = data->parser; struct ibv_flow_spec_eth *eth; const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth); + const char *msg = "VLAN cannot be empty"; if (spec) { unsigned int i; @@ -1272,9 +1502,26 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item, eth->val.vlan_tag = spec->tci; eth->mask.vlan_tag = mask->tci; eth->val.vlan_tag &= eth->mask.vlan_tag; + /* + * From verbs perspective an empty VLAN is equivalent + * to a packet without VLAN layer. + */ + if (!eth->mask.vlan_tag) + goto error; + /* Outer TPID cannot be matched. */ + if (eth->mask.ether_type) { + msg = "VLAN TPID matching is not supported"; + goto error; + } + eth->val.ether_type = spec->inner_type; + eth->mask.ether_type = mask->inner_type; + eth->val.ether_type &= eth->mask.ether_type; } + return 0; } - return 0; +error: + return rte_flow_error_set(data->error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); } /** @@ -1286,24 +1533,35 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item, * Default bit-masks to use when item->mask is not provided. * @param data[in, out] * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_create_ipv4(const struct rte_flow_item *item, const void *default_mask, - void *data) + struct mlx5_flow_data *data) { + struct priv *priv = data->dev->data->dev_private; const struct rte_flow_item_ipv4 *spec = item->spec; const struct rte_flow_item_ipv4 *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; + struct mlx5_flow_parse *parser = data->parser; unsigned int ipv4_size = sizeof(struct ibv_flow_spec_ipv4_ext); struct ibv_flow_spec_ipv4_ext ipv4 = { .type = parser->inner | IBV_FLOW_SPEC_IPV4_EXT, .size = ipv4_size, }; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) - parser->layer = HASH_RXQ_IPV4; + if (parser->layer == HASH_RXQ_TUNNEL && + parser->tunnel == ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_VXLAN)] && + !priv->config.l3_vxlan_en) + return rte_flow_error_set(data->error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 VXLAN not enabled by device" + " parameter and/or not configured" + " in firmware"); + parser->layer = HASH_RXQ_IPV4; if (spec) { if (!mask) mask = default_mask; @@ -1338,24 +1596,35 @@ mlx5_flow_create_ipv4(const struct rte_flow_item *item, * Default bit-masks to use when item->mask is not provided. * @param data[in, out] * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_create_ipv6(const struct rte_flow_item *item, const void *default_mask, - void *data) + struct mlx5_flow_data *data) { + struct priv *priv = data->dev->data->dev_private; const struct rte_flow_item_ipv6 *spec = item->spec; const struct rte_flow_item_ipv6 *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; + struct mlx5_flow_parse *parser = data->parser; unsigned int ipv6_size = sizeof(struct ibv_flow_spec_ipv6); struct ibv_flow_spec_ipv6 ipv6 = { .type = parser->inner | IBV_FLOW_SPEC_IPV6, .size = ipv6_size, }; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) - parser->layer = HASH_RXQ_IPV6; + if (parser->layer == HASH_RXQ_TUNNEL && + parser->tunnel == ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_VXLAN)] && + !priv->config.l3_vxlan_en) + return rte_flow_error_set(data->error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 VXLAN not enabled by device" + " parameter and/or not configured" + " in firmware"); + parser->layer = HASH_RXQ_IPV6; if (spec) { unsigned int i; uint32_t vtc_flow_val; @@ -1410,28 +1679,28 @@ mlx5_flow_create_ipv6(const struct rte_flow_item *item, * Default bit-masks to use when item->mask is not provided. * @param data[in, out] * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_create_udp(const struct rte_flow_item *item, const void *default_mask, - void *data) + struct mlx5_flow_data *data) { const struct rte_flow_item_udp *spec = item->spec; const struct rte_flow_item_udp *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; + struct mlx5_flow_parse *parser = data->parser; unsigned int udp_size = sizeof(struct ibv_flow_spec_tcp_udp); struct ibv_flow_spec_tcp_udp udp = { .type = parser->inner | IBV_FLOW_SPEC_UDP, .size = udp_size, }; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) { - if (parser->layer == HASH_RXQ_IPV4) - parser->layer = HASH_RXQ_UDPV4; - else - parser->layer = HASH_RXQ_UDPV6; - } + if (parser->layer == HASH_RXQ_IPV4) + parser->layer = HASH_RXQ_UDPV4; + else + parser->layer = HASH_RXQ_UDPV6; if (spec) { if (!mask) mask = default_mask; @@ -1456,28 +1725,28 @@ mlx5_flow_create_udp(const struct rte_flow_item *item, * Default bit-masks to use when item->mask is not provided. * @param data[in, out] * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_create_tcp(const struct rte_flow_item *item, const void *default_mask, - void *data) + struct mlx5_flow_data *data) { const struct rte_flow_item_tcp *spec = item->spec; const struct rte_flow_item_tcp *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; + struct mlx5_flow_parse *parser = data->parser; unsigned int tcp_size = sizeof(struct ibv_flow_spec_tcp_udp); struct ibv_flow_spec_tcp_udp tcp = { .type = parser->inner | IBV_FLOW_SPEC_TCP, .size = tcp_size, }; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) { - if (parser->layer == HASH_RXQ_IPV4) - parser->layer = HASH_RXQ_TCPV4; - else - parser->layer = HASH_RXQ_TCPV6; - } + if (parser->layer == HASH_RXQ_IPV4) + parser->layer = HASH_RXQ_TCPV4; + else + parser->layer = HASH_RXQ_TCPV6; if (spec) { if (!mask) mask = default_mask; @@ -1502,15 +1771,18 @@ mlx5_flow_create_tcp(const struct rte_flow_item *item, * Default bit-masks to use when item->mask is not provided. * @param data[in, out] * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_create_vxlan(const struct rte_flow_item *item, const void *default_mask, - void *data) + struct mlx5_flow_data *data) { const struct rte_flow_item_vxlan *spec = item->spec; const struct rte_flow_item_vxlan *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; + struct mlx5_flow_parse *parser = data->parser; unsigned int size = sizeof(struct ibv_flow_spec_tunnel); struct ibv_flow_spec_tunnel vxlan = { .type = parser->inner | IBV_FLOW_SPEC_VXLAN_TUNNEL, @@ -1523,6 +1795,9 @@ mlx5_flow_create_vxlan(const struct rte_flow_item *item, id.vni[0] = 0; parser->inner = IBV_FLOW_SPEC_INNER; + parser->tunnel = ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_VXLAN)]; + parser->out_layer = parser->layer; + parser->layer = HASH_RXQ_TUNNEL; if (spec) { if (!mask) mask = default_mask; @@ -1541,19 +1816,260 @@ mlx5_flow_create_vxlan(const struct rte_flow_item *item, * before will also match this rule. * To avoid such situation, VNI 0 is currently refused. */ - if (!vxlan.val.tunnel_id) - return EINVAL; + /* Only allow tunnel w/o tunnel id pattern after proper outer spec. */ + if (parser->out_layer == HASH_RXQ_ETH && !vxlan.val.tunnel_id) + return rte_flow_error_set(data->error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VxLAN vni cannot be 0"); mlx5_flow_create_copy(parser, &vxlan, size); return 0; } /** + * Convert VXLAN-GPE item to Verbs specification. + * + * @param item[in] + * Item specification. + * @param default_mask[in] + * Default bit-masks to use when item->mask is not provided. + * @param data[in, out] + * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_create_vxlan_gpe(const struct rte_flow_item *item, + const void *default_mask, + struct mlx5_flow_data *data) +{ + struct priv *priv = data->dev->data->dev_private; + const struct rte_flow_item_vxlan_gpe *spec = item->spec; + const struct rte_flow_item_vxlan_gpe *mask = item->mask; + struct mlx5_flow_parse *parser = data->parser; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel vxlan = { + .type = parser->inner | IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id; + + if (!priv->config.l3_vxlan_en) + return rte_flow_error_set(data->error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 VXLAN not enabled by device" + " parameter and/or not configured" + " in firmware"); + id.vni[0] = 0; + parser->inner = IBV_FLOW_SPEC_INNER; + parser->tunnel = ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_VXLAN_GPE)]; + parser->out_layer = parser->layer; + parser->layer = HASH_RXQ_TUNNEL; + if (spec) { + if (!mask) + mask = default_mask; + memcpy(&id.vni[1], spec->vni, 3); + vxlan.val.tunnel_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); + vxlan.mask.tunnel_id = id.vlan_id; + if (spec->protocol) + return rte_flow_error_set(data->error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VxLAN-GPE protocol not" + " supported"); + /* Remove unwanted bits from values. */ + vxlan.val.tunnel_id &= vxlan.mask.tunnel_id; + } + /* + * Tunnel id 0 is equivalent as not adding a VXLAN layer, if only this + * layer is defined in the Verbs specification it is interpreted as + * wildcard and all packets will match this rule, if it follows a full + * stack layer (ex: eth / ipv4 / udp), all packets matching the layers + * before will also match this rule. + * To avoid such situation, VNI 0 is currently refused. + */ + /* Only allow tunnel w/o tunnel id pattern after proper outer spec. */ + if (parser->out_layer == HASH_RXQ_ETH && !vxlan.val.tunnel_id) + return rte_flow_error_set(data->error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VxLAN-GPE vni cannot be 0"); + mlx5_flow_create_copy(parser, &vxlan, size); + return 0; +} + +/** + * Convert GRE item to Verbs specification. + * + * @param item[in] + * Item specification. + * @param default_mask[in] + * Default bit-masks to use when item->mask is not provided. + * @param data[in, out] + * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_create_gre(const struct rte_flow_item *item, + const void *default_mask, + struct mlx5_flow_data *data) +{ + struct mlx5_flow_parse *parser = data->parser; +#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT + (void)default_mask; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel tunnel = { + .type = parser->inner | IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; +#else + const struct rte_flow_item_gre *spec = item->spec; + const struct rte_flow_item_gre *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_gre); + struct ibv_flow_spec_gre tunnel = { + .type = parser->inner | IBV_FLOW_SPEC_GRE, + .size = size, + }; +#endif + struct ibv_flow_spec_ipv4_ext *ipv4; + struct ibv_flow_spec_ipv6 *ipv6; + unsigned int i; + + parser->inner = IBV_FLOW_SPEC_INNER; + parser->tunnel = ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_GRE)]; + parser->out_layer = parser->layer; + parser->layer = HASH_RXQ_TUNNEL; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + if (spec) { + if (!mask) + mask = default_mask; + tunnel.val.c_ks_res0_ver = spec->c_rsvd0_ver; + tunnel.val.protocol = spec->protocol; + tunnel.mask.c_ks_res0_ver = mask->c_rsvd0_ver; + tunnel.mask.protocol = mask->protocol; + /* Remove unwanted bits from values. */ + tunnel.val.c_ks_res0_ver &= tunnel.mask.c_ks_res0_ver; + tunnel.val.protocol &= tunnel.mask.protocol; + tunnel.val.key &= tunnel.mask.key; + } +#endif + /* Update encapsulation IP layer protocol. */ + for (i = 0; i != hash_rxq_init_n; ++i) { + if (!parser->queue[i].ibv_attr) + continue; + if (parser->out_layer == HASH_RXQ_IPV4) { + ipv4 = (void *)((uintptr_t)parser->queue[i].ibv_attr + + parser->queue[i].offset - + sizeof(struct ibv_flow_spec_ipv4_ext)); + if (ipv4->mask.proto && ipv4->val.proto != MLX5_GRE) + break; + ipv4->val.proto = MLX5_GRE; + ipv4->mask.proto = 0xff; + } else if (parser->out_layer == HASH_RXQ_IPV6) { + ipv6 = (void *)((uintptr_t)parser->queue[i].ibv_attr + + parser->queue[i].offset - + sizeof(struct ibv_flow_spec_ipv6)); + if (ipv6->mask.next_hdr && + ipv6->val.next_hdr != MLX5_GRE) + break; + ipv6->val.next_hdr = MLX5_GRE; + ipv6->mask.next_hdr = 0xff; + } + } + if (i != hash_rxq_init_n) + return rte_flow_error_set(data->error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "IP protocol of GRE must be 47"); + mlx5_flow_create_copy(parser, &tunnel, size); + return 0; +} + +/** + * Convert MPLS item to Verbs specification. + * MPLS tunnel types currently supported are MPLS-in-GRE and MPLS-in-UDP. + * + * @param item[in] + * Item specification. + * @param default_mask[in] + * Default bit-masks to use when item->mask is not provided. + * @param data[in, out] + * User structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_create_mpls(const struct rte_flow_item *item, + const void *default_mask, + struct mlx5_flow_data *data) +{ +#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT + (void)default_mask; + return rte_flow_error_set(data->error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "MPLS is not supported by driver"); +#else + const struct rte_flow_item_mpls *spec = item->spec; + const struct rte_flow_item_mpls *mask = item->mask; + struct mlx5_flow_parse *parser = data->parser; + unsigned int size = sizeof(struct ibv_flow_spec_mpls); + struct ibv_flow_spec_mpls mpls = { + .type = IBV_FLOW_SPEC_MPLS, + .size = size, + }; + + parser->inner = IBV_FLOW_SPEC_INNER; + if (parser->layer == HASH_RXQ_UDPV4 || + parser->layer == HASH_RXQ_UDPV6) { + parser->tunnel = + ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_MPLS_IN_UDP)]; + parser->out_layer = parser->layer; + } else { + parser->tunnel = + ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_MPLS_IN_GRE)]; + /* parser->out_layer stays as in GRE out_layer. */ + } + parser->layer = HASH_RXQ_TUNNEL; + if (spec) { + if (!mask) + mask = default_mask; + /* + * The verbs label field includes the entire MPLS header: + * bits 0:19 - label value field. + * bits 20:22 - traffic class field. + * bits 23 - bottom of stack bit. + * bits 24:31 - ttl field. + */ + mpls.val.label = *(const uint32_t *)spec; + mpls.mask.label = *(const uint32_t *)mask; + /* Remove unwanted bits from values. */ + mpls.val.label &= mpls.mask.label; + } + mlx5_flow_create_copy(parser, &mpls, size); + return 0; +#endif +} + +/** * Convert mark/flag action to Verbs specification. * * @param parser * Internal parser structure. * @param mark_id * Mark identifier. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_flow_create_flag_mark(struct mlx5_flow_parse *parser, uint32_t mark_id) @@ -1573,19 +2089,20 @@ mlx5_flow_create_flag_mark(struct mlx5_flow_parse *parser, uint32_t mark_id) /** * Convert count action to Verbs specification. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param parser * Pointer to MLX5 flow parser structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_flow_create_count(struct priv *priv __rte_unused, +mlx5_flow_create_count(struct rte_eth_dev *dev __rte_unused, struct mlx5_flow_parse *parser __rte_unused) { #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT + struct priv *priv = dev->data->dev_private; unsigned int size = sizeof(struct ibv_flow_spec_counter_action); struct ibv_counter_set_init_attr init_attr = {0}; struct ibv_flow_spec_counter_action counter = { @@ -1596,8 +2113,10 @@ mlx5_flow_create_count(struct priv *priv __rte_unused, init_attr.counter_set_id = 0; parser->cs = mlx5_glue->create_counter_set(priv->ctx, &init_attr); - if (!parser->cs) - return EINVAL; + if (!parser->cs) { + rte_errno = EINVAL; + return -rte_errno; + } counter.counter_set_handle = parser->cs->handle; mlx5_flow_create_copy(parser, &counter, size); #endif @@ -1607,8 +2126,8 @@ mlx5_flow_create_count(struct priv *priv __rte_unused, /** * Complete flow rule creation with a drop queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param parser * Internal parser structure. * @param flow @@ -1617,17 +2136,17 @@ mlx5_flow_create_count(struct priv *priv __rte_unused, * Perform verbose error reporting if not NULL. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_create_action_queue_drop(struct priv *priv, +mlx5_flow_create_action_queue_drop(struct rte_eth_dev *dev, struct mlx5_flow_parse *parser, struct rte_flow *flow, struct rte_flow_error *error) { + struct priv *priv = dev->data->dev_private; struct ibv_flow_spec_action_drop *drop; unsigned int size = sizeof(struct ibv_flow_spec_action_drop); - int err = 0; assert(priv->pd); assert(priv->ctx); @@ -1644,7 +2163,7 @@ priv_flow_create_action_queue_drop(struct priv *priv, parser->queue[HASH_RXQ_ETH].ibv_attr; if (parser->count) flow->cs = parser->cs; - if (!priv->dev->data->dev_started) + if (!dev->data->dev_started) return 0; parser->queue[HASH_RXQ_ETH].ibv_attr = NULL; flow->frxq[HASH_RXQ_ETH].ibv_flow = @@ -1653,7 +2172,6 @@ priv_flow_create_action_queue_drop(struct priv *priv, if (!flow->frxq[HASH_RXQ_ETH].ibv_flow) { rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, "flow rule creation failure"); - err = ENOMEM; goto error; } return 0; @@ -1673,14 +2191,14 @@ error: flow->cs = NULL; parser->cs = NULL; } - return err; + return -rte_errno; } /** * Create hash Rx queues when RSS is enabled. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param parser * Internal parser structure. * @param flow @@ -1689,10 +2207,10 @@ error: * Perform verbose error reporting if not NULL. * * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_create_action_queue_rss(struct priv *priv, +mlx5_flow_create_action_queue_rss(struct rte_eth_dev *dev, struct mlx5_flow_parse *parser, struct rte_flow *flow, struct rte_flow_error *error) @@ -1700,46 +2218,144 @@ priv_flow_create_action_queue_rss(struct priv *priv, unsigned int i; for (i = 0; i != hash_rxq_init_n; ++i) { - uint64_t hash_fields; - if (!parser->queue[i].ibv_attr) continue; flow->frxq[i].ibv_attr = parser->queue[i].ibv_attr; parser->queue[i].ibv_attr = NULL; - hash_fields = hash_rxq_init[i].hash_fields; - if (!priv->dev->data->dev_started) + flow->frxq[i].hash_fields = parser->queue[i].hash_fields; + if (!dev->data->dev_started) continue; flow->frxq[i].hrxq = - mlx5_priv_hrxq_get(priv, - parser->rss_conf.rss_key, - parser->rss_conf.rss_key_len, - hash_fields, - parser->queues, - parser->queues_n); + mlx5_hrxq_get(dev, + parser->rss_conf.key, + parser->rss_conf.key_len, + flow->frxq[i].hash_fields, + parser->rss_conf.queue, + parser->rss_conf.queue_num, + parser->tunnel, + parser->rss_conf.level); if (flow->frxq[i].hrxq) continue; flow->frxq[i].hrxq = - mlx5_priv_hrxq_new(priv, - parser->rss_conf.rss_key, - parser->rss_conf.rss_key_len, - hash_fields, - parser->queues, - parser->queues_n); + mlx5_hrxq_new(dev, + parser->rss_conf.key, + parser->rss_conf.key_len, + flow->frxq[i].hash_fields, + parser->rss_conf.queue, + parser->rss_conf.queue_num, + parser->tunnel, + parser->rss_conf.level); if (!flow->frxq[i].hrxq) { - rte_flow_error_set(error, ENOMEM, - RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "cannot create hash rxq"); - return ENOMEM; + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, + "cannot create hash rxq"); } } return 0; } /** + * RXQ update after flow rule creation. + * + * @param dev + * Pointer to Ethernet device. + * @param flow + * Pointer to the flow rule. + */ +static void +mlx5_flow_create_update_rxqs(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct priv *priv = dev->data->dev_private; + unsigned int i; + unsigned int j; + + if (!dev->data->dev_started) + return; + for (i = 0; i != flow->rss_conf.queue_num; ++i) { + struct mlx5_rxq_data *rxq_data = (*priv->rxqs) + [(*flow->queues)[i]]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + uint8_t tunnel = PTYPE_IDX(flow->tunnel); + + rxq_data->mark |= flow->mark; + if (!tunnel) + continue; + rxq_ctrl->tunnel_types[tunnel] += 1; + /* Clear tunnel type if more than one tunnel types set. */ + for (j = 0; j != RTE_DIM(rxq_ctrl->tunnel_types); ++j) { + if (j == tunnel) + continue; + if (rxq_ctrl->tunnel_types[j] > 0) { + rxq_data->tunnel = 0; + break; + } + } + if (j == RTE_DIM(rxq_ctrl->tunnel_types)) + rxq_data->tunnel = flow->tunnel; + } +} + +/** + * Dump flow hash RX queue detail. + * + * @param dev + * Pointer to Ethernet device. + * @param flow + * Pointer to the rte_flow. + * @param hrxq_idx + * Hash RX queue index. + */ +static void +mlx5_flow_dump(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused, + unsigned int hrxq_idx __rte_unused) +{ +#ifndef NDEBUG + uintptr_t spec_ptr; + uint16_t j; + char buf[256]; + uint8_t off; + uint64_t extra_hash_fields = 0; + +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (flow->tunnel && flow->rss_conf.level > 1) + extra_hash_fields = (uint32_t)IBV_RX_HASH_INNER; +#endif + spec_ptr = (uintptr_t)(flow->frxq[hrxq_idx].ibv_attr + 1); + for (j = 0, off = 0; j < flow->frxq[hrxq_idx].ibv_attr->num_of_specs; + j++) { + struct ibv_flow_spec *spec = (void *)spec_ptr; + off += sprintf(buf + off, " %x(%hu)", spec->hdr.type, + spec->hdr.size); + spec_ptr += spec->hdr.size; + } + DRV_LOG(DEBUG, + "port %u Verbs flow %p type %u: hrxq:%p qp:%p ind:%p," + " hash:%" PRIx64 "/%u specs:%hhu(%hu), priority:%hu, type:%d," + " flags:%x, comp_mask:%x specs:%s", + dev->data->port_id, (void *)flow, hrxq_idx, + (void *)flow->frxq[hrxq_idx].hrxq, + (void *)flow->frxq[hrxq_idx].hrxq->qp, + (void *)flow->frxq[hrxq_idx].hrxq->ind_table, + (flow->frxq[hrxq_idx].hash_fields | extra_hash_fields), + flow->rss_conf.queue_num, + flow->frxq[hrxq_idx].ibv_attr->num_of_specs, + flow->frxq[hrxq_idx].ibv_attr->size, + flow->frxq[hrxq_idx].ibv_attr->priority, + flow->frxq[hrxq_idx].ibv_attr->type, + flow->frxq[hrxq_idx].ibv_attr->flags, + flow->frxq[hrxq_idx].ibv_attr->comp_mask, + buf); +#endif +} + +/** * Complete flow rule creation. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param parser * Internal parser structure. * @param flow @@ -1748,26 +2364,28 @@ priv_flow_create_action_queue_rss(struct priv *priv, * Perform verbose error reporting if not NULL. * * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_create_action_queue(struct priv *priv, +mlx5_flow_create_action_queue(struct rte_eth_dev *dev, struct mlx5_flow_parse *parser, struct rte_flow *flow, struct rte_flow_error *error) { - int err = 0; + struct priv *priv __rte_unused = dev->data->dev_private; + int ret; unsigned int i; + unsigned int flows_n = 0; assert(priv->pd); assert(priv->ctx); assert(!parser->drop); - err = priv_flow_create_action_queue_rss(priv, parser, flow, error); - if (err) + ret = mlx5_flow_create_action_queue_rss(dev, parser, flow, error); + if (ret) goto error; if (parser->count) flow->cs = parser->cs; - if (!priv->dev->data->dev_started) + if (!dev->data->dev_started) return 0; for (i = 0; i != hash_rxq_init_n; ++i) { if (!flow->frxq[i].hrxq) @@ -1775,26 +2393,24 @@ priv_flow_create_action_queue(struct priv *priv, flow->frxq[i].ibv_flow = mlx5_glue->create_flow(flow->frxq[i].hrxq->qp, flow->frxq[i].ibv_attr); + mlx5_flow_dump(dev, flow, i); if (!flow->frxq[i].ibv_flow) { rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, "flow rule creation failure"); - err = ENOMEM; goto error; } - DEBUG("%p type %d QP %p ibv_flow %p", - (void *)flow, i, - (void *)flow->frxq[i].hrxq, - (void *)flow->frxq[i].ibv_flow); + ++flows_n; } - for (i = 0; i != parser->queues_n; ++i) { - struct mlx5_rxq_data *q = - (*priv->rxqs)[parser->queues[i]]; - - q->mark |= parser->mark; + if (!flows_n) { + rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "internal error in flow creation"); + goto error; } + mlx5_flow_create_update_rxqs(dev, flow); return 0; error: + ret = rte_errno; /* Save rte_errno before cleanup. */ assert(flow); for (i = 0; i != hash_rxq_init_n; ++i) { if (flow->frxq[i].ibv_flow) { @@ -1803,7 +2419,7 @@ error: claim_zero(mlx5_glue->destroy_flow(ibv_flow)); } if (flow->frxq[i].hrxq) - mlx5_priv_hrxq_release(priv, flow->frxq[i].hrxq); + mlx5_hrxq_release(dev, flow->frxq[i].hrxq); if (flow->frxq[i].ibv_attr) rte_free(flow->frxq[i].ibv_attr); } @@ -1812,14 +2428,15 @@ error: flow->cs = NULL; parser->cs = NULL; } - return err; + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** * Convert a flow. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. * @param[in] attr @@ -1832,26 +2449,27 @@ error: * Perform verbose error reporting if not NULL. * * @return - * A flow on success, NULL otherwise. + * A flow on success, NULL otherwise and rte_errno is set. */ static struct rte_flow * -priv_flow_create(struct priv *priv, - struct mlx5_flows *list, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error) +mlx5_flow_list_create(struct rte_eth_dev *dev, + struct mlx5_flows *list, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) { struct mlx5_flow_parse parser = { .create = 1, }; struct rte_flow *flow = NULL; unsigned int i; - int err; + int ret; - err = priv_flow_convert(priv, attr, items, actions, error, &parser); - if (err) + ret = mlx5_flow_convert(dev, attr, items, actions, error, &parser); + if (ret) goto exit; flow = rte_calloc(__func__, 1, - sizeof(*flow) + parser.queues_n * sizeof(uint16_t), + sizeof(*flow) + + parser.rss_conf.queue_num * sizeof(uint16_t), 0); if (!flow) { rte_flow_error_set(error, ENOMEM, @@ -1860,28 +2478,38 @@ priv_flow_create(struct priv *priv, "cannot allocate flow memory"); return NULL; } - /* Copy queues configuration. */ + /* Copy configuration. */ flow->queues = (uint16_t (*)[])(flow + 1); - memcpy(flow->queues, parser.queues, parser.queues_n * sizeof(uint16_t)); - flow->queues_n = parser.queues_n; + flow->tunnel = parser.tunnel; + flow->rss_conf = (struct rte_flow_action_rss){ + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .level = parser.rss_conf.level, + .types = parser.rss_conf.types, + .key_len = parser.rss_conf.key_len, + .queue_num = parser.rss_conf.queue_num, + .key = memcpy(flow->rss_key, parser.rss_conf.key, + sizeof(*parser.rss_conf.key) * + parser.rss_conf.key_len), + .queue = memcpy(flow->queues, parser.rss_conf.queue, + sizeof(*parser.rss_conf.queue) * + parser.rss_conf.queue_num), + }; flow->mark = parser.mark; - /* Copy RSS configuration. */ - flow->rss_conf = parser.rss_conf; - flow->rss_conf.rss_key = flow->rss_key; - memcpy(flow->rss_key, parser.rss_key, parser.rss_conf.rss_key_len); /* finalise the flow. */ if (parser.drop) - err = priv_flow_create_action_queue_drop(priv, &parser, flow, + ret = mlx5_flow_create_action_queue_drop(dev, &parser, flow, error); else - err = priv_flow_create_action_queue(priv, &parser, flow, error); - if (err) + ret = mlx5_flow_create_action_queue(dev, &parser, flow, error); + if (ret) goto exit; TAILQ_INSERT_TAIL(list, flow, next); - DEBUG("Flow created %p", (void *)flow); + DRV_LOG(DEBUG, "port %u flow created %p", dev->data->port_id, + (void *)flow); return flow; exit: - ERROR("flow creation error: %s", error->message); + DRV_LOG(ERR, "port %u flow creation error: %s", dev->data->port_id, + error->message); for (i = 0; i != hash_rxq_init_n; ++i) { if (parser.queue[i].ibv_attr) rte_free(parser.queue[i].ibv_attr); @@ -1903,14 +2531,9 @@ mlx5_flow_validate(struct rte_eth_dev *dev, const struct rte_flow_action actions[], struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; - int ret; struct mlx5_flow_parse parser = { .create = 0, }; - priv_lock(priv); - ret = priv_flow_convert(priv, attr, items, actions, error, &parser); - priv_unlock(priv); - return ret; + return mlx5_flow_convert(dev, attr, items, actions, error, &parser); } /** @@ -1927,35 +2550,60 @@ mlx5_flow_create(struct rte_eth_dev *dev, struct rte_flow_error *error) { struct priv *priv = dev->data->dev_private; - struct rte_flow *flow; - priv_lock(priv); - flow = priv_flow_create(priv, &priv->flows, attr, items, actions, - error); - priv_unlock(priv); - return flow; + return mlx5_flow_list_create(dev, &priv->flows, attr, items, actions, + error); } /** - * Destroy a flow. + * Destroy a flow in a list. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. * @param[in] flow * Flow to destroy. */ static void -priv_flow_destroy(struct priv *priv, - struct mlx5_flows *list, - struct rte_flow *flow) +mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list, + struct rte_flow *flow) { + struct priv *priv = dev->data->dev_private; unsigned int i; - if (flow->drop || !flow->mark) + if (flow->drop || !dev->data->dev_started) goto free; - for (i = 0; i != flow->queues_n; ++i) { + for (i = 0; flow->tunnel && i != flow->rss_conf.queue_num; ++i) { + /* Update queue tunnel type. */ + struct mlx5_rxq_data *rxq_data = (*priv->rxqs) + [(*flow->queues)[i]]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); + uint8_t tunnel = PTYPE_IDX(flow->tunnel); + + assert(rxq_ctrl->tunnel_types[tunnel] > 0); + rxq_ctrl->tunnel_types[tunnel] -= 1; + if (!rxq_ctrl->tunnel_types[tunnel]) { + /* Update tunnel type. */ + uint8_t j; + uint8_t types = 0; + uint8_t last; + + for (j = 0; j < RTE_DIM(rxq_ctrl->tunnel_types); j++) + if (rxq_ctrl->tunnel_types[j]) { + types += 1; + last = j; + } + /* Keep same if more than one tunnel types left. */ + if (types == 1) + rxq_data->tunnel = ptype_ext[last]; + else if (types == 0) + /* No tunnel type left. */ + rxq_data->tunnel = 0; + } + } + for (i = 0; flow->mark && i != flow->rss_conf.queue_num; ++i) { struct rte_flow *tmp; int mark = 0; @@ -1998,7 +2646,7 @@ free: claim_zero(mlx5_glue->destroy_flow (frxq->ibv_flow)); if (frxq->hrxq) - mlx5_priv_hrxq_release(priv, frxq->hrxq); + mlx5_hrxq_release(dev, frxq->hrxq); if (frxq->ibv_attr) rte_free(frxq->ibv_attr); } @@ -2008,53 +2656,60 @@ free: flow->cs = NULL; } TAILQ_REMOVE(list, flow, next); - DEBUG("Flow destroyed %p", (void *)flow); + DRV_LOG(DEBUG, "port %u flow destroyed %p", dev->data->port_id, + (void *)flow); rte_free(flow); } /** * Destroy all flows. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. */ void -priv_flow_flush(struct priv *priv, struct mlx5_flows *list) +mlx5_flow_list_flush(struct rte_eth_dev *dev, struct mlx5_flows *list) { while (!TAILQ_EMPTY(list)) { struct rte_flow *flow; flow = TAILQ_FIRST(list); - priv_flow_destroy(priv, list, flow); + mlx5_flow_list_destroy(dev, list, flow); } } /** * Create drop queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_flow_create_drop_queue(struct priv *priv) +mlx5_flow_create_drop_queue(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_hrxq_drop *fdq = NULL; assert(priv->pd); assert(priv->ctx); fdq = rte_calloc(__func__, 1, sizeof(*fdq), 0); if (!fdq) { - WARN("cannot allocate memory for drop queue"); - goto error; + DRV_LOG(WARNING, + "port %u cannot allocate memory for drop queue", + dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; } fdq->cq = mlx5_glue->create_cq(priv->ctx, 1, NULL, NULL, 0); if (!fdq->cq) { - WARN("cannot allocate CQ for drop queue"); + DRV_LOG(WARNING, "port %u cannot allocate CQ for drop queue", + dev->data->port_id); + rte_errno = errno; goto error; } fdq->wq = mlx5_glue->create_wq @@ -2067,7 +2722,9 @@ priv_flow_create_drop_queue(struct priv *priv) .cq = fdq->cq, }); if (!fdq->wq) { - WARN("cannot allocate WQ for drop queue"); + DRV_LOG(WARNING, "port %u cannot allocate WQ for drop queue", + dev->data->port_id); + rte_errno = errno; goto error; } fdq->ind_table = mlx5_glue->create_rwq_ind_table @@ -2078,7 +2735,11 @@ priv_flow_create_drop_queue(struct priv *priv) .comp_mask = 0, }); if (!fdq->ind_table) { - WARN("cannot allocate indirection table for drop queue"); + DRV_LOG(WARNING, + "port %u cannot allocate indirection table for drop" + " queue", + dev->data->port_id); + rte_errno = errno; goto error; } fdq->qp = mlx5_glue->create_qp_ex @@ -2100,7 +2761,9 @@ priv_flow_create_drop_queue(struct priv *priv) .pd = priv->pd }); if (!fdq->qp) { - WARN("cannot allocate QP for drop queue"); + DRV_LOG(WARNING, "port %u cannot allocate QP for drop queue", + dev->data->port_id); + rte_errno = errno; goto error; } priv->flow_drop_queue = fdq; @@ -2117,18 +2780,19 @@ error: if (fdq) rte_free(fdq); priv->flow_drop_queue = NULL; - return -1; + return -rte_errno; } /** * Delete drop queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. */ void -priv_flow_delete_drop_queue(struct priv *priv) +mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_hrxq_drop *fdq = priv->flow_drop_queue; if (!fdq) @@ -2148,18 +2812,19 @@ priv_flow_delete_drop_queue(struct priv *priv) /** * Remove all flows. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. */ void -priv_flow_stop(struct priv *priv, struct mlx5_flows *list) +mlx5_flow_stop(struct rte_eth_dev *dev, struct mlx5_flows *list) { + struct priv *priv = dev->data->dev_private; struct rte_flow *flow; + unsigned int i; TAILQ_FOREACH_REVERSE(flow, list, mlx5_flows, next) { - unsigned int i; struct mlx5_ind_table_ibv *ind_tbl = NULL; if (flow->drop) { @@ -2168,7 +2833,8 @@ priv_flow_stop(struct priv *priv, struct mlx5_flows *list) claim_zero(mlx5_glue->destroy_flow (flow->frxq[HASH_RXQ_ETH].ibv_flow)); flow->frxq[HASH_RXQ_ETH].ibv_flow = NULL; - DEBUG("Flow %p removed", (void *)flow); + DRV_LOG(DEBUG, "port %u flow %p removed", + dev->data->port_id, (void *)flow); /* Next flow. */ continue; } @@ -2198,27 +2864,41 @@ priv_flow_stop(struct priv *priv, struct mlx5_flows *list) claim_zero(mlx5_glue->destroy_flow (flow->frxq[i].ibv_flow)); flow->frxq[i].ibv_flow = NULL; - mlx5_priv_hrxq_release(priv, flow->frxq[i].hrxq); + mlx5_hrxq_release(dev, flow->frxq[i].hrxq); flow->frxq[i].hrxq = NULL; } - DEBUG("Flow %p removed", (void *)flow); + DRV_LOG(DEBUG, "port %u flow %p removed", dev->data->port_id, + (void *)flow); + } + /* Cleanup Rx queue tunnel info. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *q = (*priv->rxqs)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(q, struct mlx5_rxq_ctrl, rxq); + + if (!q) + continue; + memset((void *)rxq_ctrl->tunnel_types, 0, + sizeof(rxq_ctrl->tunnel_types)); + q->tunnel = 0; } } /** * Add all flows. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. * * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_flow_start(struct priv *priv, struct mlx5_flows *list) +mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list) { + struct priv *priv = dev->data->dev_private; struct rte_flow *flow; TAILQ_FOREACH(flow, list, next) { @@ -2230,12 +2910,14 @@ priv_flow_start(struct priv *priv, struct mlx5_flows *list) (priv->flow_drop_queue->qp, flow->frxq[HASH_RXQ_ETH].ibv_attr); if (!flow->frxq[HASH_RXQ_ETH].ibv_flow) { - DEBUG("Flow %p cannot be applied", - (void *)flow); + DRV_LOG(DEBUG, + "port %u flow %p cannot be applied", + dev->data->port_id, (void *)flow); rte_errno = EINVAL; - return rte_errno; + return -rte_errno; } - DEBUG("Flow %p applied", (void *)flow); + DRV_LOG(DEBUG, "port %u flow %p applied", + dev->data->port_id, (void *)flow); /* Next flow. */ continue; } @@ -2243,41 +2925,46 @@ priv_flow_start(struct priv *priv, struct mlx5_flows *list) if (!flow->frxq[i].ibv_attr) continue; flow->frxq[i].hrxq = - mlx5_priv_hrxq_get(priv, flow->rss_conf.rss_key, - flow->rss_conf.rss_key_len, - hash_rxq_init[i].hash_fields, - (*flow->queues), - flow->queues_n); + mlx5_hrxq_get(dev, flow->rss_conf.key, + flow->rss_conf.key_len, + flow->frxq[i].hash_fields, + flow->rss_conf.queue, + flow->rss_conf.queue_num, + flow->tunnel, + flow->rss_conf.level); if (flow->frxq[i].hrxq) goto flow_create; flow->frxq[i].hrxq = - mlx5_priv_hrxq_new(priv, flow->rss_conf.rss_key, - flow->rss_conf.rss_key_len, - hash_rxq_init[i].hash_fields, - (*flow->queues), - flow->queues_n); + mlx5_hrxq_new(dev, flow->rss_conf.key, + flow->rss_conf.key_len, + flow->frxq[i].hash_fields, + flow->rss_conf.queue, + flow->rss_conf.queue_num, + flow->tunnel, + flow->rss_conf.level); if (!flow->frxq[i].hrxq) { - DEBUG("Flow %p cannot be applied", - (void *)flow); + DRV_LOG(DEBUG, + "port %u flow %p cannot create hash" + " rxq", + dev->data->port_id, (void *)flow); rte_errno = EINVAL; - return rte_errno; + return -rte_errno; } flow_create: + mlx5_flow_dump(dev, flow, i); flow->frxq[i].ibv_flow = mlx5_glue->create_flow(flow->frxq[i].hrxq->qp, flow->frxq[i].ibv_attr); if (!flow->frxq[i].ibv_flow) { - DEBUG("Flow %p cannot be applied", - (void *)flow); + DRV_LOG(DEBUG, + "port %u flow %p type %u cannot be" + " applied", + dev->data->port_id, (void *)flow, i); rte_errno = EINVAL; - return rte_errno; + return -rte_errno; } - DEBUG("Flow %p applied", (void *)flow); } - if (!flow->mark) - continue; - for (i = 0; i != flow->queues_n; ++i) - (*priv->rxqs)[(*flow->queues)[i]]->mark = 1; + mlx5_flow_create_update_rxqs(dev, flow); } return 0; } @@ -2285,20 +2972,21 @@ flow_create: /** * Verify the flow list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return the number of flows not released. */ int -priv_flow_verify(struct priv *priv) +mlx5_flow_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct rte_flow *flow; int ret = 0; TAILQ_FOREACH(flow, &priv->flows, next) { - DEBUG("%p: flow %p still referenced", (void *)priv, - (void *)flow); + DRV_LOG(DEBUG, "port %u flow %p still referenced", + dev->data->port_id, (void *)flow); ++ret; } return ret; @@ -2319,7 +3007,7 @@ priv_flow_verify(struct priv *priv) * A VLAN flow mask to apply. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, @@ -2351,9 +3039,20 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, .type = RTE_FLOW_ITEM_TYPE_END, }, }; + uint16_t queue[priv->reta_idx_n]; + struct rte_flow_action_rss action_rss = { + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .level = 0, + .types = priv->rss_conf.rss_hf, + .key_len = priv->rss_conf.rss_key_len, + .queue_num = priv->reta_idx_n, + .key = priv->rss_conf.rss_key, + .queue = queue, + }; struct rte_flow_action actions[] = { { .type = RTE_FLOW_ACTION_TYPE_RSS, + .conf = &action_rss, }, { .type = RTE_FLOW_ACTION_TYPE_END, @@ -2362,26 +3061,17 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, struct rte_flow *flow; struct rte_flow_error error; unsigned int i; - union { - struct rte_flow_action_rss rss; - struct { - const struct rte_eth_rss_conf *rss_conf; - uint16_t num; - uint16_t queue[RTE_MAX_QUEUES_PER_PORT]; - } local; - } action_rss; - - if (!priv->reta_idx_n) - return EINVAL; + + if (!priv->reta_idx_n) { + rte_errno = EINVAL; + return -rte_errno; + } for (i = 0; i != priv->reta_idx_n; ++i) - action_rss.local.queue[i] = (*priv->reta_idx)[i]; - action_rss.local.rss_conf = &priv->rss_conf; - action_rss.local.num = priv->reta_idx_n; - actions[0].conf = (const void *)&action_rss.rss; - flow = priv_flow_create(priv, &priv->ctrl_flows, &attr, items, actions, - &error); + queue[i] = (*priv->reta_idx)[i]; + flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items, + actions, &error); if (!flow) - return rte_errno; + return -rte_errno; return 0; } @@ -2396,7 +3086,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, * An Ethernet flow mask to apply. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_ctrl_flow(struct rte_eth_dev *dev, @@ -2415,14 +3105,11 @@ mlx5_ctrl_flow(struct rte_eth_dev *dev, int mlx5_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow, - struct rte_flow_error *error) + struct rte_flow_error *error __rte_unused) { struct priv *priv = dev->data->dev_private; - (void)error; - priv_lock(priv); - priv_flow_destroy(priv, &priv->flows, flow); - priv_unlock(priv); + mlx5_flow_list_destroy(dev, &priv->flows, flow); return 0; } @@ -2434,14 +3121,11 @@ mlx5_flow_destroy(struct rte_eth_dev *dev, */ int mlx5_flow_flush(struct rte_eth_dev *dev, - struct rte_flow_error *error) + struct rte_flow_error *error __rte_unused) { struct priv *priv = dev->data->dev_private; - (void)error; - priv_lock(priv); - priv_flow_flush(priv, &priv->flows); - priv_unlock(priv); + mlx5_flow_list_flush(dev, &priv->flows); return 0; } @@ -2455,10 +3139,10 @@ mlx5_flow_flush(struct rte_eth_dev *dev, * returned data from the counter. * * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_query_count(struct ibv_counter_set *cs, +mlx5_flow_query_count(struct ibv_counter_set *cs, struct mlx5_flow_counter_stats *counter_stats, struct rte_flow_query_count *query_count, struct rte_flow_error *error) @@ -2472,15 +3156,13 @@ priv_flow_query_count(struct ibv_counter_set *cs, .out = counters, .outlen = 2 * sizeof(uint64_t), }; - int res = mlx5_glue->query_counter_set(&query_cs_attr, &query_out); + int err = mlx5_glue->query_counter_set(&query_cs_attr, &query_out); - if (res) { - rte_flow_error_set(error, -res, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "cannot read counter"); - return -res; - } + if (err) + return rte_flow_error_set(error, err, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot read counter"); query_count->hits_set = 1; query_count->bytes_set = 1; query_count->hits = counters[0] - counter_stats->hits; @@ -2499,29 +3181,28 @@ priv_flow_query_count(struct ibv_counter_set *cs, * @see rte_flow_ops */ int -mlx5_flow_query(struct rte_eth_dev *dev, +mlx5_flow_query(struct rte_eth_dev *dev __rte_unused, struct rte_flow *flow, - enum rte_flow_action_type action __rte_unused, + const struct rte_flow_action *action __rte_unused, void *data, struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; - int res = EINVAL; - - priv_lock(priv); if (flow->cs) { - res = priv_flow_query_count(flow->cs, - &flow->counter_stats, - (struct rte_flow_query_count *)data, - error); + int ret; + + ret = mlx5_flow_query_count(flow->cs, + &flow->counter_stats, + (struct rte_flow_query_count *)data, + error); + if (ret) + return ret; } else { - rte_flow_error_set(error, res, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "no counter found for flow"); + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "no counter found for flow"); } - priv_unlock(priv); - return -res; + return 0; } #endif @@ -2538,48 +3219,50 @@ mlx5_flow_isolate(struct rte_eth_dev *dev, { struct priv *priv = dev->data->dev_private; - priv_lock(priv); if (dev->data->dev_started) { rte_flow_error_set(error, EBUSY, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, "port must be stopped first"); - priv_unlock(priv); return -rte_errno; } priv->isolated = !!enable; if (enable) - priv->dev->dev_ops = &mlx5_dev_ops_isolate; + dev->dev_ops = &mlx5_dev_ops_isolate; else - priv->dev->dev_ops = &mlx5_dev_ops; - priv_unlock(priv); + dev->dev_ops = &mlx5_dev_ops; return 0; } /** * Convert a flow director filter to a generic flow. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param fdir_filter * Flow director filter to add. * @param attributes * Generic flow parameters structure. * * @return - * 0 on success, errno value on error. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_filter_convert(struct priv *priv, +mlx5_fdir_filter_convert(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter, struct mlx5_fdir *attributes) { + struct priv *priv = dev->data->dev_private; const struct rte_eth_fdir_input *input = &fdir_filter->input; + const struct rte_eth_fdir_masks *mask = + &dev->data->dev_conf.fdir_conf.mask; /* Validate queue number. */ if (fdir_filter->action.rx_queue >= priv->rxqs_n) { - ERROR("invalid queue number %d", fdir_filter->action.rx_queue); - return EINVAL; + DRV_LOG(ERR, "port %u invalid queue number %d", + dev->data->port_id, fdir_filter->action.rx_queue); + rte_errno = EINVAL; + return -rte_errno; } attributes->attr.ingress = 1; attributes->items[0] = (struct rte_flow_item) { @@ -2600,144 +3283,140 @@ priv_fdir_filter_convert(struct priv *priv, }; break; default: - ERROR("invalid behavior %d", fdir_filter->action.behavior); - return ENOTSUP; + DRV_LOG(ERR, "port %u invalid behavior %d", + dev->data->port_id, + fdir_filter->action.behavior); + rte_errno = ENOTSUP; + return -rte_errno; } attributes->queue.index = fdir_filter->action.rx_queue; + /* Handle L3. */ switch (fdir_filter->input.flow_type) { case RTE_ETH_FLOW_NONFRAG_IPV4_UDP: + case RTE_ETH_FLOW_NONFRAG_IPV4_TCP: + case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: attributes->l3.ipv4.hdr = (struct ipv4_hdr){ - .src_addr = input->flow.udp4_flow.ip.src_ip, - .dst_addr = input->flow.udp4_flow.ip.dst_ip, - .time_to_live = input->flow.udp4_flow.ip.ttl, - .type_of_service = input->flow.udp4_flow.ip.tos, - .next_proto_id = input->flow.udp4_flow.ip.proto, + .src_addr = input->flow.ip4_flow.src_ip, + .dst_addr = input->flow.ip4_flow.dst_ip, + .time_to_live = input->flow.ip4_flow.ttl, + .type_of_service = input->flow.ip4_flow.tos, + .next_proto_id = input->flow.ip4_flow.proto, }; - attributes->l4.udp.hdr = (struct udp_hdr){ - .src_port = input->flow.udp4_flow.src_port, - .dst_port = input->flow.udp4_flow.dst_port, + attributes->l3_mask.ipv4.hdr = (struct ipv4_hdr){ + .src_addr = mask->ipv4_mask.src_ip, + .dst_addr = mask->ipv4_mask.dst_ip, + .time_to_live = mask->ipv4_mask.ttl, + .type_of_service = mask->ipv4_mask.tos, + .next_proto_id = mask->ipv4_mask.proto, }; attributes->items[1] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &attributes->l3, - .mask = &attributes->l3, + .mask = &attributes->l3_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: + case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: + case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: + attributes->l3.ipv6.hdr = (struct ipv6_hdr){ + .hop_limits = input->flow.ipv6_flow.hop_limits, + .proto = input->flow.ipv6_flow.proto, + }; + + memcpy(attributes->l3.ipv6.hdr.src_addr, + input->flow.ipv6_flow.src_ip, + RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); + memcpy(attributes->l3.ipv6.hdr.dst_addr, + input->flow.ipv6_flow.dst_ip, + RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); + memcpy(attributes->l3_mask.ipv6.hdr.src_addr, + mask->ipv6_mask.src_ip, + RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr)); + memcpy(attributes->l3_mask.ipv6.hdr.dst_addr, + mask->ipv6_mask.dst_ip, + RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr)); + attributes->items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .spec = &attributes->l3, + .mask = &attributes->l3_mask, + }; + break; + default: + DRV_LOG(ERR, "port %u invalid flow type%d", + dev->data->port_id, fdir_filter->input.flow_type); + rte_errno = ENOTSUP; + return -rte_errno; + } + /* Handle L4. */ + switch (fdir_filter->input.flow_type) { + case RTE_ETH_FLOW_NONFRAG_IPV4_UDP: + attributes->l4.udp.hdr = (struct udp_hdr){ + .src_port = input->flow.udp4_flow.src_port, + .dst_port = input->flow.udp4_flow.dst_port, + }; + attributes->l4_mask.udp.hdr = (struct udp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, }; attributes->items[2] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &attributes->l4, - .mask = &attributes->l4, + .mask = &attributes->l4_mask, }; break; case RTE_ETH_FLOW_NONFRAG_IPV4_TCP: - attributes->l3.ipv4.hdr = (struct ipv4_hdr){ - .src_addr = input->flow.tcp4_flow.ip.src_ip, - .dst_addr = input->flow.tcp4_flow.ip.dst_ip, - .time_to_live = input->flow.tcp4_flow.ip.ttl, - .type_of_service = input->flow.tcp4_flow.ip.tos, - .next_proto_id = input->flow.tcp4_flow.ip.proto, - }; attributes->l4.tcp.hdr = (struct tcp_hdr){ .src_port = input->flow.tcp4_flow.src_port, .dst_port = input->flow.tcp4_flow.dst_port, }; - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV4, - .spec = &attributes->l3, - .mask = &attributes->l3, + attributes->l4_mask.tcp.hdr = (struct tcp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, }; attributes->items[2] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_TCP, .spec = &attributes->l4, - .mask = &attributes->l4, - }; - break; - case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: - attributes->l3.ipv4.hdr = (struct ipv4_hdr){ - .src_addr = input->flow.ip4_flow.src_ip, - .dst_addr = input->flow.ip4_flow.dst_ip, - .time_to_live = input->flow.ip4_flow.ttl, - .type_of_service = input->flow.ip4_flow.tos, - .next_proto_id = input->flow.ip4_flow.proto, - }; - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV4, - .spec = &attributes->l3, - .mask = &attributes->l3, + .mask = &attributes->l4_mask, }; break; case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: - attributes->l3.ipv6.hdr = (struct ipv6_hdr){ - .hop_limits = input->flow.udp6_flow.ip.hop_limits, - .proto = input->flow.udp6_flow.ip.proto, - }; - memcpy(attributes->l3.ipv6.hdr.src_addr, - input->flow.udp6_flow.ip.src_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); - memcpy(attributes->l3.ipv6.hdr.dst_addr, - input->flow.udp6_flow.ip.dst_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); attributes->l4.udp.hdr = (struct udp_hdr){ .src_port = input->flow.udp6_flow.src_port, .dst_port = input->flow.udp6_flow.dst_port, }; - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV6, - .spec = &attributes->l3, - .mask = &attributes->l3, + attributes->l4_mask.udp.hdr = (struct udp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, }; attributes->items[2] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &attributes->l4, - .mask = &attributes->l4, + .mask = &attributes->l4_mask, }; break; case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: - attributes->l3.ipv6.hdr = (struct ipv6_hdr){ - .hop_limits = input->flow.tcp6_flow.ip.hop_limits, - .proto = input->flow.tcp6_flow.ip.proto, - }; - memcpy(attributes->l3.ipv6.hdr.src_addr, - input->flow.tcp6_flow.ip.src_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); - memcpy(attributes->l3.ipv6.hdr.dst_addr, - input->flow.tcp6_flow.ip.dst_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); attributes->l4.tcp.hdr = (struct tcp_hdr){ .src_port = input->flow.tcp6_flow.src_port, .dst_port = input->flow.tcp6_flow.dst_port, }; - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV6, - .spec = &attributes->l3, - .mask = &attributes->l3, + attributes->l4_mask.tcp.hdr = (struct tcp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, }; attributes->items[2] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_TCP, .spec = &attributes->l4, - .mask = &attributes->l4, + .mask = &attributes->l4_mask, }; break; + case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: - attributes->l3.ipv6.hdr = (struct ipv6_hdr){ - .hop_limits = input->flow.ipv6_flow.hop_limits, - .proto = input->flow.ipv6_flow.proto, - }; - memcpy(attributes->l3.ipv6.hdr.src_addr, - input->flow.ipv6_flow.src_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); - memcpy(attributes->l3.ipv6.hdr.dst_addr, - input->flow.ipv6_flow.dst_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV6, - .spec = &attributes->l3, - .mask = &attributes->l3, - }; break; default: - ERROR("invalid flow type%d", - fdir_filter->input.flow_type); - return ENOTSUP; + DRV_LOG(ERR, "port %u invalid flow type%d", + dev->data->port_id, fdir_filter->input.flow_type); + rte_errno = ENOTSUP; + return -rte_errno; } return 0; } @@ -2745,18 +3424,19 @@ priv_fdir_filter_convert(struct priv *priv, /** * Add new flow director filter and store it in list. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param fdir_filter * Flow director filter to add. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_filter_add(struct priv *priv, +mlx5_fdir_filter_add(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter) { + struct priv *priv = dev->data->dev_private; struct mlx5_fdir attributes = { .attr.group = 0, .l2_mask = { @@ -2772,41 +3452,40 @@ priv_fdir_filter_add(struct priv *priv, struct rte_flow *flow; int ret; - ret = priv_fdir_filter_convert(priv, fdir_filter, &attributes); + ret = mlx5_fdir_filter_convert(dev, fdir_filter, &attributes); if (ret) - return -ret; - ret = priv_flow_convert(priv, &attributes.attr, attributes.items, + return ret; + ret = mlx5_flow_convert(dev, &attributes.attr, attributes.items, attributes.actions, &error, &parser); if (ret) - return -ret; - flow = priv_flow_create(priv, - &priv->flows, - &attributes.attr, - attributes.items, - attributes.actions, - &error); + return ret; + flow = mlx5_flow_list_create(dev, &priv->flows, &attributes.attr, + attributes.items, attributes.actions, + &error); if (flow) { - DEBUG("FDIR created %p", (void *)flow); + DRV_LOG(DEBUG, "port %u FDIR created %p", dev->data->port_id, + (void *)flow); return 0; } - return ENOTSUP; + return -rte_errno; } /** * Delete specific filter. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param fdir_filter * Filter to be deleted. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_filter_delete(struct priv *priv, +mlx5_fdir_filter_delete(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter) { + struct priv *priv = dev->data->dev_private; struct mlx5_fdir attributes = { .attr.group = 0, }; @@ -2819,10 +3498,10 @@ priv_fdir_filter_delete(struct priv *priv, unsigned int i; int ret; - ret = priv_fdir_filter_convert(priv, fdir_filter, &attributes); + ret = mlx5_fdir_filter_convert(dev, fdir_filter, &attributes); if (ret) - return -ret; - ret = priv_flow_convert(priv, &attributes.attr, attributes.items, + return ret; + ret = mlx5_flow_convert(dev, &attributes.attr, attributes.items, attributes.actions, &error, &parser); if (ret) goto exit; @@ -2850,11 +3529,14 @@ priv_fdir_filter_delete(struct priv *priv, struct ibv_spec_header *flow_h; void *flow_spec; unsigned int specs_n; + unsigned int queue_id = parser.drop ? HASH_RXQ_ETH : + parser.layer; - attr = parser.queue[HASH_RXQ_ETH].ibv_attr; - flow_attr = flow->frxq[HASH_RXQ_ETH].ibv_attr; + attr = parser.queue[queue_id].ibv_attr; + flow_attr = flow->frxq[queue_id].ibv_attr; /* Compare first the attributes. */ - if (memcmp(attr, flow_attr, sizeof(struct ibv_flow_attr))) + if (!flow_attr || + memcmp(attr, flow_attr, sizeof(struct ibv_flow_attr))) continue; if (attr->num_of_specs == 0) continue; @@ -2879,67 +3561,70 @@ wrong_flow: /* The flow does not match. */ continue; } + ret = rte_errno; /* Save rte_errno before cleanup. */ if (flow) - priv_flow_destroy(priv, &priv->flows, flow); + mlx5_flow_list_destroy(dev, &priv->flows, flow); exit: for (i = 0; i != hash_rxq_init_n; ++i) { if (parser.queue[i].ibv_attr) rte_free(parser.queue[i].ibv_attr); } - return -ret; + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** * Update queue for specific filter. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param fdir_filter * Filter to be updated. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_filter_update(struct priv *priv, +mlx5_fdir_filter_update(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter) { int ret; - ret = priv_fdir_filter_delete(priv, fdir_filter); + ret = mlx5_fdir_filter_delete(dev, fdir_filter); if (ret) return ret; - ret = priv_fdir_filter_add(priv, fdir_filter); - return ret; + return mlx5_fdir_filter_add(dev, fdir_filter); } /** * Flush all filters. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. */ static void -priv_fdir_filter_flush(struct priv *priv) +mlx5_fdir_filter_flush(struct rte_eth_dev *dev) { - priv_flow_flush(priv, &priv->flows); + struct priv *priv = dev->data->dev_private; + + mlx5_flow_list_flush(dev, &priv->flows); } /** * Get flow director information. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param[out] fdir_info * Resulting flow director information. */ static void -priv_fdir_info_get(struct priv *priv, struct rte_eth_fdir_info *fdir_info) +mlx5_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info) { struct rte_eth_fdir_masks *mask = - &priv->dev->data->dev_conf.fdir_conf.mask; + &dev->data->dev_conf.fdir_conf.mask; - fdir_info->mode = priv->dev->data->dev_conf.fdir_conf.mode; + fdir_info->mode = dev->data->dev_conf.fdir_conf.mode; fdir_info->guarant_spc = 0; rte_memcpy(&fdir_info->mask, mask, sizeof(fdir_info->mask)); fdir_info->max_flexpayload = 0; @@ -2953,54 +3638,52 @@ priv_fdir_info_get(struct priv *priv, struct rte_eth_fdir_info *fdir_info) /** * Deal with flow director operations. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param filter_op * Operation to perform. * @param arg * Pointer to operation-specific structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_ctrl_func(struct priv *priv, enum rte_filter_op filter_op, void *arg) +mlx5_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op, + void *arg) { enum rte_fdir_mode fdir_mode = - priv->dev->data->dev_conf.fdir_conf.mode; - int ret = 0; + dev->data->dev_conf.fdir_conf.mode; if (filter_op == RTE_ETH_FILTER_NOP) return 0; if (fdir_mode != RTE_FDIR_MODE_PERFECT && fdir_mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) { - ERROR("%p: flow director mode %d not supported", - (void *)priv, fdir_mode); - return EINVAL; + DRV_LOG(ERR, "port %u flow director mode %d not supported", + dev->data->port_id, fdir_mode); + rte_errno = EINVAL; + return -rte_errno; } switch (filter_op) { case RTE_ETH_FILTER_ADD: - ret = priv_fdir_filter_add(priv, arg); - break; + return mlx5_fdir_filter_add(dev, arg); case RTE_ETH_FILTER_UPDATE: - ret = priv_fdir_filter_update(priv, arg); - break; + return mlx5_fdir_filter_update(dev, arg); case RTE_ETH_FILTER_DELETE: - ret = priv_fdir_filter_delete(priv, arg); - break; + return mlx5_fdir_filter_delete(dev, arg); case RTE_ETH_FILTER_FLUSH: - priv_fdir_filter_flush(priv); + mlx5_fdir_filter_flush(dev); break; case RTE_ETH_FILTER_INFO: - priv_fdir_info_get(priv, arg); + mlx5_fdir_info_get(dev, arg); break; default: - DEBUG("%p: unknown operation %u", (void *)priv, - filter_op); - ret = EINVAL; - break; + DRV_LOG(DEBUG, "port %u unknown operation %u", + dev->data->port_id, filter_op); + rte_errno = EINVAL; + return -rte_errno; } - return ret; + return 0; } /** @@ -3016,7 +3699,7 @@ priv_fdir_ctrl_func(struct priv *priv, enum rte_filter_op filter_op, void *arg) * Pointer to operation-specific structure. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, @@ -3024,24 +3707,74 @@ mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, enum rte_filter_op filter_op, void *arg) { - int ret = EINVAL; - struct priv *priv = dev->data->dev_private; - switch (filter_type) { case RTE_ETH_FILTER_GENERIC: - if (filter_op != RTE_ETH_FILTER_GET) - return -EINVAL; + if (filter_op != RTE_ETH_FILTER_GET) { + rte_errno = EINVAL; + return -rte_errno; + } *(const void **)arg = &mlx5_flow_ops; return 0; case RTE_ETH_FILTER_FDIR: - priv_lock(priv); - ret = priv_fdir_ctrl_func(priv, filter_op, arg); - priv_unlock(priv); - break; + return mlx5_fdir_ctrl_func(dev, filter_op, arg); default: - ERROR("%p: filter type (%d) not supported", - (void *)dev, filter_type); - break; + DRV_LOG(ERR, "port %u filter type (%d) not supported", + dev->data->port_id, filter_type); + rte_errno = ENOTSUP; + return -rte_errno; } - return -ret; + return 0; +} + +/** + * Detect number of Verbs flow priorities supported. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * number of supported Verbs flow priority. + */ +unsigned int +mlx5_get_max_verbs_prio(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + unsigned int verb_priorities = MLX5_VERBS_FLOW_PRIO_8; + struct { + struct ibv_flow_attr attr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_action_drop drop; + } flow_attr = { + .attr = { + .num_of_specs = 2, + }, + .eth = { + .type = IBV_FLOW_SPEC_ETH, + .size = sizeof(struct ibv_flow_spec_eth), + }, + .drop = { + .size = sizeof(struct ibv_flow_spec_action_drop), + .type = IBV_FLOW_SPEC_ACTION_DROP, + }, + }; + struct ibv_flow *flow; + + do { + flow_attr.attr.priority = verb_priorities - 1; + flow = mlx5_glue->create_flow(priv->flow_drop_queue->qp, + &flow_attr.attr); + if (flow) { + claim_zero(mlx5_glue->destroy_flow(flow)); + /* Try more priorities. */ + verb_priorities *= 2; + } else { + /* Failed, restore last right number. */ + verb_priorities /= 2; + break; + } + } while (1); + DRV_LOG(DEBUG, "port %u Verbs flow priorities: %d," + " user flow priorities: %d", + dev->data->port_id, verb_priorities, MLX5_CTRL_FLOW_PRIORITY); + return verb_priorities; } diff --git a/drivers/net/mlx5/mlx5_glue.c b/drivers/net/mlx5/mlx5_glue.c index 1c4396ad..c7965e51 100644 --- a/drivers/net/mlx5/mlx5_glue.c +++ b/drivers/net/mlx5/mlx5_glue.c @@ -1,12 +1,18 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2018 6WIND S.A. - * Copyright 2018 Mellanox Technologies, Ltd. + * Copyright 2018 Mellanox Technologies, Ltd */ #include <errno.h> #include <stddef.h> #include <stdint.h> +/* + * Not needed by this file; included to work around the lack of off_t + * definition for mlx5dv.h with unpatched rdma-core versions. + */ +#include <sys/types.h> + /* Verbs headers do not support -pedantic. */ #ifdef PEDANTIC #pragma GCC diagnostic ignored "-Wpedantic" @@ -287,6 +293,21 @@ mlx5_glue_dv_create_cq(struct ibv_context *context, return mlx5dv_create_cq(context, cq_attr, mlx5_cq_attr); } +static struct ibv_wq * +mlx5_glue_dv_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_attr, + struct mlx5dv_wq_init_attr *mlx5_wq_attr) +{ +#ifndef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + (void)context; + (void)wq_attr; + (void)mlx5_wq_attr; + return NULL; +#else + return mlx5dv_create_wq(context, wq_attr, mlx5_wq_attr); +#endif +} + static int mlx5_glue_dv_query_device(struct ibv_context *ctx, struct mlx5dv_context *attrs_out) @@ -307,6 +328,21 @@ mlx5_glue_dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type) return mlx5dv_init_obj(obj, obj_type); } +static struct ibv_qp * +mlx5_glue_dv_create_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex, + struct mlx5dv_qp_init_attr *dv_qp_init_attr) +{ +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + return mlx5dv_create_qp(context, qp_init_attr_ex, dv_qp_init_attr); +#else + (void)context; + (void)qp_init_attr_ex; + (void)dv_qp_init_attr; + return NULL; +#endif +} + const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .version = MLX5_GLUE_VERSION, .fork_init = mlx5_glue_fork_init, @@ -347,7 +383,9 @@ const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .port_state_str = mlx5_glue_port_state_str, .cq_ex_to_cq = mlx5_glue_cq_ex_to_cq, .dv_create_cq = mlx5_glue_dv_create_cq, + .dv_create_wq = mlx5_glue_dv_create_wq, .dv_query_device = mlx5_glue_dv_query_device, .dv_set_context_attr = mlx5_glue_dv_set_context_attr, .dv_init_obj = mlx5_glue_dv_init_obj, + .dv_create_qp = mlx5_glue_dv_create_qp, }; diff --git a/drivers/net/mlx5/mlx5_glue.h b/drivers/net/mlx5/mlx5_glue.h index b5efee3b..e584d367 100644 --- a/drivers/net/mlx5/mlx5_glue.h +++ b/drivers/net/mlx5/mlx5_glue.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2018 6WIND S.A. - * Copyright 2018 Mellanox Technologies, Ltd. + * Copyright 2018 Mellanox Technologies, Ltd */ #ifndef MLX5_GLUE_H_ @@ -31,6 +31,14 @@ struct ibv_counter_set_init_attr; struct ibv_query_counter_set_attr; #endif +#ifndef HAVE_IBV_DEVICE_TUNNEL_SUPPORT +struct mlx5dv_qp_init_attr; +#endif + +#ifndef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT +struct mlx5dv_wq_init_attr; +#endif + /* LIB_GLUE_VERSION must be updated every time this structure is modified. */ struct mlx5_glue { const char *version; @@ -100,12 +108,20 @@ struct mlx5_glue { (struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, struct mlx5dv_cq_init_attr *mlx5_cq_attr); + struct ibv_wq *(*dv_create_wq) + (struct ibv_context *context, + struct ibv_wq_init_attr *wq_attr, + struct mlx5dv_wq_init_attr *mlx5_wq_attr); int (*dv_query_device)(struct ibv_context *ctx_in, struct mlx5dv_context *attrs_out); int (*dv_set_context_attr)(struct ibv_context *ibv_ctx, enum mlx5dv_set_ctx_attr_type type, void *attr); int (*dv_init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type); + struct ibv_qp *(*dv_create_qp) + (struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex, + struct mlx5dv_qp_init_attr *dv_qp_init_attr); }; const struct mlx5_glue *mlx5_glue; diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index e8a8d459..672a4761 100644 --- a/drivers/net/mlx5/mlx5_mac.c +++ b/drivers/net/mlx5/mlx5_mac.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -35,44 +35,52 @@ /** * Get MAC address by querying netdevice. * - * @param[in] priv - * struct priv for the requested device. + * @param[in] dev + * Pointer to Ethernet device. * @param[out] mac * MAC address output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]) +mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN]) { struct ifreq request; + int ret; - if (priv_ifreq(priv, SIOCGIFHWADDR, &request)) - return -1; + ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request); + if (ret) + return ret; memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); return 0; } /** - * DPDK callback to remove a MAC address. + * Remove a MAC address from the internal array. * * @param dev * Pointer to Ethernet device structure. * @param index * MAC address index. */ -void -mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +static void +mlx5_internal_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) { + struct priv *priv = dev->data->dev_private; + const int vf = priv->config.vf; + assert(index < MLX5_MAX_MAC_ADDRESSES); + if (is_zero_ether_addr(&dev->data->mac_addrs[index])) + return; + if (vf) + mlx5_nl_mac_addr_remove(dev, &dev->data->mac_addrs[index], + index); memset(&dev->data->mac_addrs[index], 0, sizeof(struct ether_addr)); - if (!dev->data->promiscuous) - mlx5_traffic_restart(dev); } /** - * DPDK callback to add a MAC address. + * Adds a MAC address to the internal array. * * @param dev * Pointer to Ethernet device structure. @@ -80,21 +88,23 @@ mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) * MAC address to register. * @param index * MAC address index. - * @param vmdq - * VMDq pool index to associate address with (ignored). * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -int -mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, - uint32_t index, uint32_t vmdq) +static int +mlx5_internal_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index) { + struct priv *priv = dev->data->dev_private; + const int vf = priv->config.vf; unsigned int i; - int ret = 0; - (void)vmdq; assert(index < MLX5_MAX_MAC_ADDRESSES); + if (is_zero_ether_addr(mac)) { + rte_errno = EINVAL; + return -rte_errno; + } /* First, make sure this address isn't already configured. */ for (i = 0; (i != MLX5_MAX_MAC_ADDRESSES); ++i) { /* Skip this index, it's going to be reconfigured. */ @@ -103,12 +113,74 @@ mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, if (memcmp(&dev->data->mac_addrs[i], mac, sizeof(*mac))) continue; /* Address already configured elsewhere, return with error. */ - return EADDRINUSE; + rte_errno = EADDRINUSE; + return -rte_errno; + } + if (vf) { + int ret = mlx5_nl_mac_addr_add(dev, mac, index); + + if (ret) + return ret; } dev->data->mac_addrs[index] = *mac; + return 0; +} + +/** + * DPDK callback to remove a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param index + * MAC address index. + */ +void +mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + int ret; + + if (index >= MLX5_MAX_UC_MAC_ADDRESSES) + return; + mlx5_internal_mac_addr_remove(dev, index); + if (!dev->data->promiscuous) { + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot restart traffic: %s", + dev->data->port_id, strerror(rte_errno)); + } +} + +/** + * DPDK callback to add a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * @param index + * MAC address index. + * @param vmdq + * VMDq pool index to associate address with (ignored). + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index, uint32_t vmdq __rte_unused) +{ + int ret; + + if (index >= MLX5_MAX_UC_MAC_ADDRESSES) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_internal_mac_addr_add(dev, mac, index); + if (ret < 0) + return ret; if (!dev->data->promiscuous) - mlx5_traffic_restart(dev); - return ret; + return mlx5_traffic_restart(dev); + return 0; } /** @@ -118,10 +190,43 @@ mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, * Pointer to Ethernet device structure. * @param mac_addr * MAC address to register. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -void +int mlx5_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr) { - DEBUG("%p: setting primary MAC address", (void *)dev); - mlx5_mac_addr_add(dev, mac_addr, 0, 0); + DRV_LOG(DEBUG, "port %u setting primary MAC address", + dev->data->port_id); + return mlx5_mac_addr_add(dev, mac_addr, 0, 0); +} + +/** + * DPDK callback to set multicast addresses list. + * + * @see rte_eth_dev_set_mc_addr_list() + */ +int +mlx5_set_mc_addr_list(struct rte_eth_dev *dev, + struct ether_addr *mc_addr_set, uint32_t nb_mc_addr) +{ + uint32_t i; + int ret; + + if (nb_mc_addr >= MLX5_MAX_MC_MAC_ADDRESSES) { + rte_errno = ENOSPC; + return -rte_errno; + } + for (i = MLX5_MAX_UC_MAC_ADDRESSES; i != MLX5_MAX_MAC_ADDRESSES; ++i) + mlx5_internal_mac_addr_remove(dev, i); + i = MLX5_MAX_UC_MAC_ADDRESSES; + while (nb_mc_addr--) { + ret = mlx5_internal_mac_addr_add(dev, mc_addr_set++, i++); + if (ret) + return ret; + } + if (!dev->data->promiscuous) + return mlx5_traffic_restart(dev); + return 0; } diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index 857dfcd8..08105a44 100644 --- a/drivers/net/mlx5/mlx5_mr.c +++ b/drivers/net/mlx5/mlx5_mr.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2016 6WIND S.A. - * Copyright 2016 Mellanox. + * Copyright 2016 Mellanox Technologies, Ltd */ #ifdef PEDANTIC @@ -13,362 +13,1183 @@ #include <rte_mempool.h> #include <rte_malloc.h> +#include <rte_rwlock.h> #include "mlx5.h" +#include "mlx5_mr.h" #include "mlx5_rxtx.h" #include "mlx5_glue.h" -struct mlx5_check_mempool_data { +struct mr_find_contig_memsegs_data { + uintptr_t addr; + uintptr_t start; + uintptr_t end; + const struct rte_memseg_list *msl; +}; + +struct mr_update_mp_data { + struct rte_eth_dev *dev; + struct mlx5_mr_ctrl *mr_ctrl; int ret; - char *start; - char *end; }; -/* Called by mlx5_check_mempool() when iterating the memory chunks. */ -static void -mlx5_check_mempool_cb(struct rte_mempool *mp, - void *opaque, struct rte_mempool_memhdr *memhdr, - unsigned int mem_idx) +/** + * Expand B-tree table to a given size. Can't be called with holding + * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc(). + * + * @param bt + * Pointer to B-tree structure. + * @param n + * Number of entries for expansion. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_btree_expand(struct mlx5_mr_btree *bt, int n) { - struct mlx5_check_mempool_data *data = opaque; + void *mem; + int ret = 0; - (void)mp; - (void)mem_idx; + if (n <= bt->size) + return ret; + /* + * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is + * used inside if there's no room to expand. Because this is a quite + * rare case and a part of very slow path, it is very acceptable. + * Initially cache_bh[] will be given practically enough space and once + * it is expanded, expansion wouldn't be needed again ever. + */ + mem = rte_realloc(bt->table, n * sizeof(struct mlx5_mr_cache), 0); + if (mem == NULL) { + /* Not an error, B-tree search will be skipped. */ + DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table", + (void *)bt); + ret = -1; + } else { + DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n); + bt->table = mem; + bt->size = n; + } + return ret; +} - /* It already failed, skip the next chunks. */ - if (data->ret != 0) - return; - /* It is the first chunk. */ - if (data->start == NULL && data->end == NULL) { - data->start = memhdr->addr; - data->end = data->start + memhdr->len; - return; +/** + * Look up LKey from given B-tree lookup table, store the last index and return + * searched LKey. + * + * @param bt + * Pointer to B-tree structure. + * @param[out] idx + * Pointer to index. Even on search failure, returns index where it stops + * searching so that index can be used when inserting a new entry. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr) +{ + struct mlx5_mr_cache *lkp_tbl; + uint16_t n; + uint16_t base = 0; + + assert(bt != NULL); + lkp_tbl = *bt->table; + n = bt->len; + /* First entry must be NULL for comparison. */ + assert(bt->len > 0 || (lkp_tbl[0].start == 0 && + lkp_tbl[0].lkey == UINT32_MAX)); + /* Binary search. */ + do { + register uint16_t delta = n >> 1; + + if (addr < lkp_tbl[base + delta].start) { + n = delta; + } else { + base += delta; + n -= delta; + } + } while (n > 1); + assert(addr >= lkp_tbl[base].start); + *idx = base; + if (addr < lkp_tbl[base].end) + return lkp_tbl[base].lkey; + /* Not found. */ + return UINT32_MAX; +} + +/** + * Insert an entry to B-tree lookup table. + * + * @param bt + * Pointer to B-tree structure. + * @param entry + * Pointer to new entry to insert. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_btree_insert(struct mlx5_mr_btree *bt, struct mlx5_mr_cache *entry) +{ + struct mlx5_mr_cache *lkp_tbl; + uint16_t idx = 0; + size_t shift; + + assert(bt != NULL); + assert(bt->len <= bt->size); + assert(bt->len > 0); + lkp_tbl = *bt->table; + /* Find out the slot for insertion. */ + if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) { + DRV_LOG(DEBUG, + "abort insertion to B-tree(%p): already exist at" + " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + /* Already exist, return. */ + return 0; } - if (data->end == memhdr->addr) { - data->end += memhdr->len; - return; + /* If table is full, return error. */ + if (unlikely(bt->len == bt->size)) { + bt->overflow = 1; + return -1; } - if (data->start == (char *)memhdr->addr + memhdr->len) { - data->start -= memhdr->len; + /* Insert entry. */ + ++idx; + shift = (bt->len - idx) * sizeof(struct mlx5_mr_cache); + if (shift) + memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift); + lkp_tbl[idx] = *entry; + bt->len++; + DRV_LOG(DEBUG, + "inserted B-tree(%p)[%u]," + " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + return 0; +} + +/** + * Initialize B-tree and allocate memory for lookup table. + * + * @param bt + * Pointer to B-tree structure. + * @param n + * Number of entries to allocate. + * @param socket + * NUMA socket on which memory must be allocated. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket) +{ + if (bt == NULL) { + rte_errno = EINVAL; + return -rte_errno; + } + assert(!bt->table && !bt->size); + memset(bt, 0, sizeof(*bt)); + bt->table = rte_calloc_socket("B-tree table", + n, sizeof(struct mlx5_mr_cache), + 0, socket); + if (bt->table == NULL) { + rte_errno = ENOMEM; + DRV_LOG(ERR, + "failed to allocate memory for btree cache on socket %d", + socket); + return -rte_errno; + } + bt->size = n; + /* First entry must be NULL for binary search. */ + (*bt->table)[bt->len++] = (struct mlx5_mr_cache) { + .lkey = UINT32_MAX, + }; + DRV_LOG(DEBUG, "initialized B-tree %p with table %p", + (void *)bt, (void *)bt->table); + return 0; +} + +/** + * Free B-tree resources. + * + * @param bt + * Pointer to B-tree structure. + */ +void +mlx5_mr_btree_free(struct mlx5_mr_btree *bt) +{ + if (bt == NULL) + return; + DRV_LOG(DEBUG, "freeing B-tree %p with table %p", + (void *)bt, (void *)bt->table); + rte_free(bt->table); + memset(bt, 0, sizeof(*bt)); +} + +/** + * Dump all the entries in a B-tree + * + * @param bt + * Pointer to B-tree structure. + */ +static void +mlx5_mr_btree_dump(struct mlx5_mr_btree *bt) +{ + int idx; + struct mlx5_mr_cache *lkp_tbl; + + if (bt == NULL) return; + lkp_tbl = *bt->table; + for (idx = 0; idx < bt->len; ++idx) { + struct mlx5_mr_cache *entry = &lkp_tbl[idx]; + + DRV_LOG(DEBUG, + "B-tree(%p)[%u]," + " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); } - /* Error, mempool is not virtually contiguous. */ - data->ret = -1; } /** - * Check if a mempool can be used: it must be virtually contiguous. + * Find virtually contiguous memory chunk in a given MR. * - * @param[in] mp - * Pointer to memory pool. - * @param[out] start - * Pointer to the start address of the mempool virtual memory area - * @param[out] end - * Pointer to the end address of the mempool virtual memory area + * @param dev + * Pointer to MR structure. + * @param[out] entry + * Pointer to returning MR cache entry. If not found, this will not be + * updated. + * @param start_idx + * Start index of the memseg bitmap. * * @return - * 0 on success (mempool is virtually contiguous), -1 on error. + * Next index to go on lookup. */ -static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start, - uintptr_t *end) +static int +mr_find_next_chunk(struct mlx5_mr *mr, struct mlx5_mr_cache *entry, + int base_idx) { - struct mlx5_check_mempool_data data; + uintptr_t start = 0; + uintptr_t end = 0; + uint32_t idx = 0; - memset(&data, 0, sizeof(data)); - rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data); - *start = (uintptr_t)data.start; - *end = (uintptr_t)data.end; + for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) { + if (rte_bitmap_get(mr->ms_bmp, idx)) { + const struct rte_memseg_list *msl; + const struct rte_memseg *ms; - return data.ret; + msl = mr->msl; + ms = rte_fbarray_get(&msl->memseg_arr, + mr->ms_base_idx + idx); + assert(msl->page_sz == ms->hugepage_sz); + if (!start) + start = ms->addr_64; + end = ms->addr_64 + ms->hugepage_sz; + } else if (start) { + /* Passed the end of a fragment. */ + break; + } + } + if (start) { + /* Found one chunk. */ + entry->start = start; + entry->end = end; + entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey); + } + return idx; } /** - * Register a Memory Region (MR) <-> Memory Pool (MP) association in - * txq->mp2mr[]. If mp2mr[] is full, remove an entry first. + * Insert a MR to the global B-tree cache. It may fail due to low-on-memory. + * Then, this entry will have to be searched by mr_lookup_dev_list() in + * mlx5_mr_create() on miss. * - * This function should only be called by txq_mp2mr(). + * @param dev + * Pointer to Ethernet device. + * @param mr + * Pointer to MR to insert. * - * @param priv - * Pointer to private structure. - * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. - * @param idx - * Index of the next available entry. + * @return + * 0 on success, -1 on failure. + */ +static int +mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx5_mr *mr) +{ + struct priv *priv = dev->data->dev_private; + unsigned int n; + + DRV_LOG(DEBUG, "port %u inserting MR(%p) to global cache", + dev->data->port_id, (void *)mr); + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx5_mr_cache entry = { 0, }; + + /* Find a contiguous chunk and advance the index. */ + n = mr_find_next_chunk(mr, &entry, n); + if (!entry.end) + break; + if (mr_btree_insert(&priv->mr.cache, &entry) < 0) { + /* + * Overflowed, but the global table cannot be expanded + * because of deadlock. + */ + return -1; + } + } + return 0; +} + +/** + * Look up address in the original global MR list. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry. If no match, this will not be updated. + * @param addr + * Search key. * * @return - * mr on success, NULL on failure. + * Found MR on match, NULL otherwise. */ -struct mlx5_mr* -priv_txq_mp2mr_reg(struct priv *priv, struct mlx5_txq_data *txq, - struct rte_mempool *mp, unsigned int idx) +static struct mlx5_mr * +mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry, + uintptr_t addr) { - struct mlx5_txq_ctrl *txq_ctrl = - container_of(txq, struct mlx5_txq_ctrl, txq); + struct priv *priv = dev->data->dev_private; struct mlx5_mr *mr; - /* Add a new entry, register MR first. */ - DEBUG("%p: discovered new memory pool \"%s\" (%p)", - (void *)txq_ctrl, mp->name, (void *)mp); - mr = priv_mr_get(priv, mp); - if (mr == NULL) { - if (rte_eal_process_type() != RTE_PROC_PRIMARY) { - DEBUG("Using unregistered mempool 0x%p(%s) in secondary process," - " please create mempool before rte_eth_dev_start()", - (void *)mp, mp->name); - return NULL; + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + unsigned int n; + + if (mr->ms_n == 0) + continue; + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx5_mr_cache ret = { 0, }; + + n = mr_find_next_chunk(mr, &ret, n); + if (addr >= ret.start && addr < ret.end) { + /* Found. */ + *entry = ret; + return mr; + } } - mr = priv_mr_new(priv, mp); - } - if (unlikely(mr == NULL)) { - DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.", - (void *)txq_ctrl); - return NULL; - } - if (unlikely(idx == RTE_DIM(txq->mp2mr))) { - /* Table is full, remove oldest entry. */ - DEBUG("%p: MR <-> MP table full, dropping oldest entry.", - (void *)txq_ctrl); - --idx; - priv_mr_release(priv, txq->mp2mr[0]); - memmove(&txq->mp2mr[0], &txq->mp2mr[1], - (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0]))); } - /* Store the new entry. */ - txq_ctrl->txq.mp2mr[idx] = mr; - DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, - (void *)txq_ctrl, mp->name, (void *)mp, - txq_ctrl->txq.mp2mr[idx]->lkey); - return mr; + return NULL; } /** - * Register a Memory Region (MR) <-> Memory Pool (MP) association in - * txq->mp2mr[]. If mp2mr[] is full, remove an entry first. - * - * This function should only be called by txq_mp2mr(). + * Look up address on device. * - * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. - * @param idx - * Index of the next available entry. + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry. If no match, this will not be updated. + * @param addr + * Search key. * * @return - * mr on success, NULL on failure. + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. */ -struct mlx5_mr* -mlx5_txq_mp2mr_reg(struct mlx5_txq_data *txq, struct rte_mempool *mp, - unsigned int idx) +static uint32_t +mr_lookup_dev(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry, + uintptr_t addr) { - struct mlx5_txq_ctrl *txq_ctrl = - container_of(txq, struct mlx5_txq_ctrl, txq); + struct priv *priv = dev->data->dev_private; + uint16_t idx; + uint32_t lkey = UINT32_MAX; struct mlx5_mr *mr; - priv_lock(txq_ctrl->priv); - mr = priv_txq_mp2mr_reg(txq_ctrl->priv, txq, mp, idx); - priv_unlock(txq_ctrl->priv); - return mr; + /* + * If the global cache has overflowed since it failed to expand the + * B-tree table, it can't have all the existing MRs. Then, the address + * has to be searched by traversing the original MR list instead, which + * is very slow path. Otherwise, the global cache is all inclusive. + */ + if (!unlikely(priv->mr.cache.overflow)) { + lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (lkey != UINT32_MAX) + *entry = (*priv->mr.cache.table)[idx]; + } else { + /* Falling back to the slowest path. */ + mr = mr_lookup_dev_list(dev, entry, addr); + if (mr != NULL) + lkey = entry->lkey; + } + assert(lkey == UINT32_MAX || (addr >= entry->start && + addr < entry->end)); + return lkey; } -struct mlx5_mp2mr_mbuf_check_data { - int ret; -}; +/** + * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() + * can raise memory free event and the callback function will spin on the lock. + * + * @param mr + * Pointer to MR to free. + */ +static void +mr_free(struct mlx5_mr *mr) +{ + if (mr == NULL) + return; + DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr); + if (mr->ibv_mr != NULL) + claim_zero(mlx5_glue->dereg_mr(mr->ibv_mr)); + if (mr->ms_bmp != NULL) + rte_bitmap_free(mr->ms_bmp); + rte_free(mr); +} /** - * Callback function for rte_mempool_obj_iter() to check whether a given - * mempool object looks like a mbuf. - * - * @param[in] mp - * The mempool pointer - * @param[in] arg - * Context data (struct txq_mp2mr_mbuf_check_data). Contains the - * return value. - * @param[in] obj - * Object address. - * @param index - * Object index, unused. + * Releass resources of detached MR having no online entry. + * + * @param dev + * Pointer to Ethernet device. */ static void -txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj, - uint32_t index __rte_unused) +mlx5_mr_garbage_collect(struct rte_eth_dev *dev) { - struct mlx5_mp2mr_mbuf_check_data *data = arg; - struct rte_mbuf *buf = obj; + struct priv *priv = dev->data->dev_private; + struct mlx5_mr *mr_next; + struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); + /* Must be called from the primary process. */ + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); /* - * Check whether mbuf structure fits element size and whether mempool - * pointer is valid. + * MR can't be freed with holding the lock because rte_free() could call + * memory free callback function. This will be a deadlock situation. */ - if (sizeof(*buf) > mp->elt_size || buf->pool != mp) - data->ret = -1; + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Detach the whole free list and release it after unlocking. */ + free_list = priv->mr.mr_free_list; + LIST_INIT(&priv->mr.mr_free_list); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Release resources. */ + mr_next = LIST_FIRST(&free_list); + while (mr_next != NULL) { + struct mlx5_mr *mr = mr_next; + + mr_next = LIST_NEXT(mr, mr); + mr_free(mr); + } +} + +/* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */ +static int +mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + struct mr_find_contig_memsegs_data *data = arg; + + if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) + return 0; + /* Found, save it and stop walking. */ + data->start = ms->addr_64; + data->end = ms->addr_64 + len; + data->msl = msl; + return 1; } /** - * Iterator function for rte_mempool_walk() to register existing mempools and - * fill the MP to MR cache of a TX queue. + * Create a new global Memroy Region (MR) for a missing virtual address. + * Register entire virtually contiguous memory chunk around the address. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this will not be updated. + * @param addr + * Target virtual address to register. * - * @param[in] mp - * Memory Pool to register. - * @param *arg - * Pointer to TX queue structure. + * @return + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. */ -void -mlx5_mp2mr_iter(struct rte_mempool *mp, void *arg) +static uint32_t +mlx5_mr_create(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry, + uintptr_t addr) { - struct priv *priv = (struct priv *)arg; - struct mlx5_mp2mr_mbuf_check_data data = { - .ret = 0, + struct priv *priv = dev->data->dev_private; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + const struct rte_memseg_list *msl; + const struct rte_memseg *ms; + struct mlx5_mr *mr = NULL; + size_t len; + uint32_t ms_n; + uint32_t bmp_size; + void *bmp_mem; + int ms_idx_shift = -1; + unsigned int n; + struct mr_find_contig_memsegs_data data = { + .addr = addr, }; - struct mlx5_mr *mr; + struct mr_find_contig_memsegs_data data_re; - /* Register mempool only if the first element looks like a mbuf. */ - if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 || - data.ret == -1) - return; - mr = priv_mr_get(priv, mp); - if (mr) { - priv_mr_release(priv, mr); - return; + DRV_LOG(DEBUG, "port %u creating a MR using address (%p)", + dev->data->port_id, (void *)addr); + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + DRV_LOG(WARNING, + "port %u using address (%p) of unregistered mempool" + " in secondary process, please create mempool" + " before rte_eth_dev_start()", + dev->data->port_id, (void *)addr); + rte_errno = EPERM; + goto err_nolock; + } + /* + * Release detached MRs if any. This can't be called with holding either + * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have + * been detached by the memory free event but it couldn't be released + * inside the callback due to deadlock. As a result, releasing resources + * is quite opportunistic. + */ + mlx5_mr_garbage_collect(dev); + /* + * Find out a contiguous virtual address chunk in use, to which the + * given address belongs, in order to register maximum range. In the + * best case where mempools are not dynamically recreated and + * '--socket-mem' is speicified as an EAL option, it is very likely to + * have only one MR(LKey) per a socket and per a hugepage-size even + * though the system memory is highly fragmented. + */ + if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) { + DRV_LOG(WARNING, + "port %u unable to find virtually contiguous" + " chunk for address (%p)." + " rte_memseg_contig_walk() failed.", + dev->data->port_id, (void *)addr); + rte_errno = ENXIO; + goto err_nolock; + } +alloc_resources: + /* Addresses must be page-aligned. */ + assert(rte_is_aligned((void *)data.start, data.msl->page_sz)); + assert(rte_is_aligned((void *)data.end, data.msl->page_sz)); + msl = data.msl; + ms = rte_mem_virt2memseg((void *)data.start, msl); + len = data.end - data.start; + assert(msl->page_sz == ms->hugepage_sz); + /* Number of memsegs in the range. */ + ms_n = len / msl->page_sz; + DRV_LOG(DEBUG, + "port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " page_sz=0x%" PRIx64 ", ms_n=%u", + dev->data->port_id, (void *)addr, + data.start, data.end, msl->page_sz, ms_n); + /* Size of memory for bitmap. */ + bmp_size = rte_bitmap_get_memory_footprint(ms_n); + mr = rte_zmalloc_socket(NULL, + RTE_ALIGN_CEIL(sizeof(*mr), + RTE_CACHE_LINE_SIZE) + + bmp_size, + RTE_CACHE_LINE_SIZE, msl->socket_id); + if (mr == NULL) { + DRV_LOG(WARNING, + "port %u unable to allocate memory for a new MR of" + " address (%p).", + dev->data->port_id, (void *)addr); + rte_errno = ENOMEM; + goto err_nolock; + } + mr->msl = msl; + /* + * Save the index of the first memseg and initialize memseg bitmap. To + * see if a memseg of ms_idx in the memseg-list is still valid, check: + * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx) + */ + mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE); + mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size); + if (mr->ms_bmp == NULL) { + DRV_LOG(WARNING, + "port %u unable to initialize bitamp for a new MR of" + " address (%p).", + dev->data->port_id, (void *)addr); + rte_errno = EINVAL; + goto err_nolock; + } + /* + * Should recheck whether the extended contiguous chunk is still valid. + * Because memory_hotplug_lock can't be held if there's any memory + * related calls in a critical path, resource allocation above can't be + * locked. If the memory has been changed at this point, try again with + * just single page. If not, go on with the big chunk atomically from + * here. + */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + data_re = data; + if (len > msl->page_sz && + !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) { + DRV_LOG(WARNING, + "port %u unable to find virtually contiguous" + " chunk for address (%p)." + " rte_memseg_contig_walk() failed.", + dev->data->port_id, (void *)addr); + rte_errno = ENXIO; + goto err_memlock; + } + if (data.start != data_re.start || data.end != data_re.end) { + /* + * The extended contiguous chunk has been changed. Try again + * with single memseg instead. + */ + data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); + data.end = data.start + msl->page_sz; + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + mr_free(mr); + goto alloc_resources; + } + assert(data.msl == data_re.msl); + rte_rwlock_write_lock(&priv->mr.rwlock); + /* + * Check the address is really missing. If other thread already created + * one or it is not found due to overflow, abort and return. + */ + if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) { + /* + * Insert to the global cache table. It may fail due to + * low-on-memory. Then, this entry will have to be searched + * here again. + */ + mr_btree_insert(&priv->mr.cache, entry); + DRV_LOG(DEBUG, + "port %u found MR for %p on final lookup, abort", + dev->data->port_id, (void *)addr); + rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + /* + * Must be unlocked before calling rte_free() because + * mlx5_mr_mem_event_free_cb() can be called inside. + */ + mr_free(mr); + return entry->lkey; } - priv_mr_new(priv, mp); + /* + * Trim start and end addresses for verbs MR. Set bits for registering + * memsegs but exclude already registered ones. Bitmap can be + * fragmented. + */ + for (n = 0; n < ms_n; ++n) { + uintptr_t start; + struct mlx5_mr_cache ret = { 0, }; + + start = data_re.start + n * msl->page_sz; + /* Exclude memsegs already registered by other MRs. */ + if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) { + /* + * Start from the first unregistered memseg in the + * extended range. + */ + if (ms_idx_shift == -1) { + mr->ms_base_idx += n; + data.start = start; + ms_idx_shift = n; + } + data.end = start + msl->page_sz; + rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); + ++mr->ms_n; + } + } + len = data.end - data.start; + mr->ms_bmp_n = len / msl->page_sz; + assert(ms_idx_shift + mr->ms_bmp_n <= ms_n); + /* + * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be + * called with holding the memory lock because it doesn't use + * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket() + * through mlx5_alloc_verbs_buf(). + */ + mr->ibv_mr = mlx5_glue->reg_mr(priv->pd, (void *)data.start, len, + IBV_ACCESS_LOCAL_WRITE); + if (mr->ibv_mr == NULL) { + DRV_LOG(WARNING, + "port %u fail to create a verbs MR for address (%p)", + dev->data->port_id, (void *)addr); + rte_errno = EINVAL; + goto err_mrlock; + } + assert((uintptr_t)mr->ibv_mr->addr == data.start); + assert(mr->ibv_mr->length == len); + LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + DRV_LOG(DEBUG, + "port %u MR CREATED (%p) for %p:\n" + " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", + dev->data->port_id, (void *)mr, (void *)addr, + data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); + /* Insert to the global cache table. */ + mr_insert_dev_cache(dev, mr); + /* Fill in output data. */ + mr_lookup_dev(dev, entry, addr); + /* Lookup can't fail. */ + assert(entry->lkey != UINT32_MAX); + rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return entry->lkey; +err_mrlock: + rte_rwlock_write_unlock(&priv->mr.rwlock); +err_memlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +err_nolock: + /* + * In case of error, as this can be called in a datapath, a warning + * message per an error is preferable instead. Must be unlocked before + * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called + * inside. + */ + mr_free(mr); + return UINT32_MAX; } /** - * Register a new memory region from the mempool and store it in the memory - * region list. + * Rebuild the global B-tree cache of device from the original MR list. * - * @param priv - * Pointer to private structure. - * @param mp - * Pointer to the memory pool to register. - * @return - * The memory region on success. + * @param dev + * Pointer to Ethernet device. */ -struct mlx5_mr* -priv_mr_new(struct priv *priv, struct rte_mempool *mp) +static void +mr_rebuild_dev_cache(struct rte_eth_dev *dev) { - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - uintptr_t start; - uintptr_t end; - unsigned int i; + struct priv *priv = dev->data->dev_private; struct mlx5_mr *mr; - mr = rte_zmalloc_socket(__func__, sizeof(*mr), 0, mp->socket_id); - if (!mr) { - DEBUG("unable to configure MR, ibv_reg_mr() failed."); - return NULL; + DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id); + /* Flush cache to rebuild. */ + priv->mr.cache.len = 1; + priv->mr.cache.overflow = 0; + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) + if (mr_insert_dev_cache(dev, mr) < 0) + return; +} + +/** + * Callback for memory free event. Iterate freed memsegs and check whether it + * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a + * result, the MR would be fragmented. If it becomes empty, the MR will be freed + * later by mlx5_mr_garbage_collect(). Even if this callback is called from a + * secondary process, the garbage collector will be called in primary process + * as the secondary process can't call mlx5_mr_create(). + * + * The global cache must be rebuilt if there's any change and this event has to + * be propagated to dataplane threads to flush the local caches. + * + * @param dev + * Pointer to Ethernet device. + * @param addr + * Address of freed memory. + * @param len + * Size of freed memory. + */ +static void +mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len) +{ + struct priv *priv = dev->data->dev_private; + const struct rte_memseg_list *msl; + struct mlx5_mr *mr; + int ms_n; + int i; + int rebuild = 0; + + DRV_LOG(DEBUG, "port %u free callback: addr=%p, len=%zu", + dev->data->port_id, addr, len); + msl = rte_mem_virt2memseg_list(addr); + /* addr and len must be page-aligned. */ + assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz)); + assert(len == RTE_ALIGN(len, msl->page_sz)); + ms_n = len / msl->page_sz; + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Clear bits of freed memsegs from MR. */ + for (i = 0; i < ms_n; ++i) { + const struct rte_memseg *ms; + struct mlx5_mr_cache entry; + uintptr_t start; + int ms_idx; + uint32_t pos; + + /* Find MR having this memseg. */ + start = (uintptr_t)addr + i * msl->page_sz; + mr = mr_lookup_dev_list(dev, &entry, start); + if (mr == NULL) + continue; + ms = rte_mem_virt2memseg((void *)start, msl); + assert(ms != NULL); + assert(msl->page_sz == ms->hugepage_sz); + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + pos = ms_idx - mr->ms_base_idx; + assert(rte_bitmap_get(mr->ms_bmp, pos)); + assert(pos < mr->ms_bmp_n); + DRV_LOG(DEBUG, "port %u MR(%p): clear bitmap[%u] for addr %p", + dev->data->port_id, (void *)mr, pos, (void *)start); + rte_bitmap_clear(mr->ms_bmp, pos); + if (--mr->ms_n == 0) { + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + DRV_LOG(DEBUG, "port %u remove MR(%p) from list", + dev->data->port_id, (void *)mr); + } + /* + * MR is fragmented or will be freed. the global cache must be + * rebuilt. + */ + rebuild = 1; } - if (mlx5_check_mempool(mp, &start, &end) != 0) { - ERROR("mempool %p: not virtually contiguous", - (void *)mp); - return NULL; + if (rebuild) { + mr_rebuild_dev_cache(dev); + /* + * Flush local caches by propagating invalidation across cores. + * rte_smp_wmb() is enough to synchronize this event. If one of + * freed memsegs is seen by other core, that means the memseg + * has been allocated by allocator, which will come after this + * free call. Therefore, this store instruction (incrementing + * generation below) will be guaranteed to be seen by other core + * before the core sees the newly allocated memory. + */ + ++priv->mr.dev_gen; + DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d", + priv->mr.dev_gen); + rte_smp_wmb(); } - DEBUG("mempool %p area start=%p end=%p size=%zu", - (void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - /* Save original addresses for exact MR lookup. */ - mr->start = start; - mr->end = end; - /* Round start and end to page boundary if found in memory segments. */ - for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) { - uintptr_t addr = (uintptr_t)ms[i].addr; - size_t len = ms[i].len; - unsigned int align = ms[i].hugepage_sz; - - if ((start > addr) && (start < addr + len)) - start = RTE_ALIGN_FLOOR(start, align); - if ((end > addr) && (end < addr + len)) - end = RTE_ALIGN_CEIL(end, align); + rte_rwlock_write_unlock(&priv->mr.rwlock); + if (rebuild && rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG) + mlx5_mr_dump_dev(dev); +} + +/** + * Callback for memory event. This can be called from both primary and secondary + * process. + * + * @param event_type + * Memory event type. + * @param addr + * Address of memory. + * @param len + * Size of memory. + */ +void +mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg __rte_unused) +{ + struct priv *priv; + struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list; + + switch (event_type) { + case RTE_MEM_EVENT_FREE: + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + /* Iterate all the existing mlx5 devices. */ + LIST_FOREACH(priv, dev_list, mem_event_cb) + mlx5_mr_mem_event_free_cb(ETH_DEV(priv), addr, len); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + break; + case RTE_MEM_EVENT_ALLOC: + default: + break; } - DEBUG("mempool %p using start=%p end=%p size=%zu for MR", - (void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - mr->mr = mlx5_glue->reg_mr(priv->pd, (void *)start, end - start, - IBV_ACCESS_LOCAL_WRITE); - mr->mp = mp; - mr->lkey = rte_cpu_to_be_32(mr->mr->lkey); - rte_atomic32_inc(&mr->refcnt); - DEBUG("%p: new Memory Region %p refcnt: %d", (void *)priv, - (void *)mr, rte_atomic32_read(&mr->refcnt)); - LIST_INSERT_HEAD(&priv->mr, mr, next); - return mr; } /** - * Search the memory region object in the memory region list. + * Look up address in the global MR cache table. If not found, create a new MR. + * Insert the found/created entry to local bottom-half cache table. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this is not written. + * @param addr + * Search key. * - * @param priv - * Pointer to private structure. - * @param mp - * Pointer to the memory pool to register. * @return - * The memory region on success. + * Searched LKey on success, UINT32_MAX on no match. */ -struct mlx5_mr* -priv_mr_get(struct priv *priv, struct rte_mempool *mp) +static uint32_t +mlx5_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct mlx5_mr_cache *entry, uintptr_t addr) { - struct mlx5_mr *mr; + struct priv *priv = dev->data->dev_private; + struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; + uint16_t idx; + uint32_t lkey; - assert(mp); - if (LIST_EMPTY(&priv->mr)) - return NULL; - LIST_FOREACH(mr, &priv->mr, next) { - if (mr->mp == mp) { - rte_atomic32_inc(&mr->refcnt); - DEBUG("Memory Region %p refcnt: %d", - (void *)mr, rte_atomic32_read(&mr->refcnt)); - return mr; - } + /* If local cache table is full, try to double it. */ + if (unlikely(bt->len == bt->size)) + mr_btree_expand(bt, bt->size << 1); + /* Look up in the global cache. */ + rte_rwlock_read_lock(&priv->mr.rwlock); + lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (lkey != UINT32_MAX) { + /* Found. */ + *entry = (*priv->mr.cache.table)[idx]; + rte_rwlock_read_unlock(&priv->mr.rwlock); + /* + * Update local cache. Even if it fails, return the found entry + * to update top-half cache. Next time, this entry will be found + * in the global cache. + */ + mr_btree_insert(bt, entry); + return lkey; } - return NULL; + rte_rwlock_read_unlock(&priv->mr.rwlock); + /* First time to see the address? Create a new MR. */ + lkey = mlx5_mr_create(dev, entry, addr); + /* + * Update the local cache if successfully created a new global MR. Even + * if failed to create one, there's no action to take in this datapath + * code. As returning LKey is invalid, this will eventually make HW + * fail. + */ + if (lkey != UINT32_MAX) + mr_btree_insert(bt, entry); + return lkey; } /** - * Release the memory region object. + * Bottom-half of LKey search on datapath. Firstly search in cache_bh[] and if + * misses, search in the global MR cache table and update the new entry to + * per-queue local caches. * - * @param mr - * Pointer to memory region to release. + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param addr + * Search key. * * @return - * 0 on success, errno on failure. + * Searched LKey on success, UINT32_MAX on no match. */ -int -priv_mr_release(struct priv *priv, struct mlx5_mr *mr) +static uint32_t +mlx5_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + uintptr_t addr) { - (void)priv; - assert(mr); - DEBUG("Memory Region %p refcnt: %d", - (void *)mr, rte_atomic32_read(&mr->refcnt)); - if (rte_atomic32_dec_and_test(&mr->refcnt)) { - claim_zero(mlx5_glue->dereg_mr(mr->mr)); - LIST_REMOVE(mr, next); - rte_free(mr); - return 0; + uint32_t lkey; + uint16_t bh_idx = 0; + /* Victim in top-half cache to replace with new entry. */ + struct mlx5_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head]; + + /* Binary-search MR translation table. */ + lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); + /* Update top-half cache. */ + if (likely(lkey != UINT32_MAX)) { + *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; + } else { + /* + * If missed in local lookup table, search in the global cache + * and local cache_bh[] will be updated inside if possible. + * Top-half cache entry will also be updated. + */ + lkey = mlx5_mr_lookup_dev(dev, mr_ctrl, repl, addr); + if (unlikely(lkey == UINT32_MAX)) + return UINT32_MAX; } - return EBUSY; + /* Update the most recently used entry. */ + mr_ctrl->mru = mr_ctrl->head; + /* Point to the next victim, the oldest. */ + mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; + return lkey; +} + +/** + * Bottom-half of LKey search on Rx. + * + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr) +{ + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + struct priv *priv = rxq_ctrl->priv; + + DRV_LOG(DEBUG, + "Rx queue %u: miss on top-half, mru=%u, head=%u, addr=%p", + rxq_ctrl->idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr); + return mlx5_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr); +} + +/** + * Bottom-half of LKey search on Tx. + * + * @param txq + * Pointer to Tx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr) +{ + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct priv *priv = txq_ctrl->priv; + + DRV_LOG(DEBUG, + "Tx queue %u: miss on top-half, mru=%u, head=%u, addr=%p", + txq_ctrl->idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr); + return mlx5_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr); +} + +/** + * Flush all of the local cache entries. + * + * @param mr_ctrl + * Pointer to per-queue MR control structure. + */ +void +mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl) +{ + /* Reset the most-recently-used index. */ + mr_ctrl->mru = 0; + /* Reset the linear search array. */ + mr_ctrl->head = 0; + memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); + /* Reset the B-tree table. */ + mr_ctrl->cache_bh.len = 1; + mr_ctrl->cache_bh.overflow = 0; + /* Update the generation number. */ + mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr; + DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d", + (void *)mr_ctrl, mr_ctrl->cur_gen); +} + +/* Called during rte_mempool_mem_iter() by mlx5_mr_update_mp(). */ +static void +mlx5_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + uint32_t lkey; + + /* Stop iteration if failed in the previous walk. */ + if (data->ret < 0) + return; + /* Register address of the chunk and update local caches. */ + lkey = mlx5_mr_addr2mr_bh(data->dev, data->mr_ctrl, + (uintptr_t)memhdr->addr); + if (lkey == UINT32_MAX) + data->ret = -1; } /** - * Verify the flow list is empty + * Register entire memory chunks in a Mempool. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. * - * @return the number of object not released. + * @return + * 0 on success, -1 on failure. */ int -priv_mr_verify(struct priv *priv) +mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) { - int ret = 0; + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx5_mr_update_mp_cb, &data); + return data.ret; +} + +/** + * Dump all the created MRs and the global cache entries. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_mr_dump_dev(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; struct mlx5_mr *mr; + int mr_n = 0; + int chunk_n = 0; + + rte_rwlock_read_lock(&priv->mr.rwlock); + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + unsigned int n; - LIST_FOREACH(mr, &priv->mr, next) { - DEBUG("%p: mr %p still referenced", (void *)priv, - (void *)mr); - ++ret; + DRV_LOG(DEBUG, + "port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", + dev->data->port_id, mr_n++, + rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_n, mr->ms_bmp_n); + if (mr->ms_n == 0) + continue; + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx5_mr_cache ret = { 0, }; + + n = mr_find_next_chunk(mr, &ret, n); + if (!ret.end) + break; + DRV_LOG(DEBUG, + " chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", + chunk_n++, ret.start, ret.end); + } } - return ret; + DRV_LOG(DEBUG, "port %u dumping global cache", dev->data->port_id); + mlx5_mr_btree_dump(&priv->mr.cache); + rte_rwlock_read_unlock(&priv->mr.rwlock); +} + +/** + * Release all the created MRs and resources. Remove device from memory callback + * list. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_mr_release(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_mr *mr_next = LIST_FIRST(&priv->mr.mr_list); + + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_REMOVE(priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG) + mlx5_mr_dump_dev(dev); + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Detach from MR list and move to free list. */ + while (mr_next != NULL) { + struct mlx5_mr *mr = mr_next; + + mr_next = LIST_NEXT(mr, mr); + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + } + LIST_INIT(&priv->mr.mr_list); + /* Free global cache. */ + mlx5_mr_btree_free(&priv->mr.cache); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Free all remaining MRs. */ + mlx5_mr_garbage_collect(dev); } diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h new file mode 100644 index 00000000..e0b28215 --- /dev/null +++ b/drivers/net/mlx5/mlx5_mr.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_MR_H_ +#define RTE_PMD_MLX5_MR_H_ + +#include <stddef.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_eal_memconfig.h> +#include <rte_ethdev.h> +#include <rte_rwlock.h> +#include <rte_bitmap.h> + +/* Memory Region object. */ +struct mlx5_mr { + LIST_ENTRY(mlx5_mr) mr; /**< Pointer to the prev/next entry. */ + struct ibv_mr *ibv_mr; /* Verbs Memory Region. */ + const struct rte_memseg_list *msl; + int ms_base_idx; /* Start index of msl->memseg_arr[]. */ + int ms_n; /* Number of memsegs in use. */ + uint32_t ms_bmp_n; /* Number of bits in memsegs bit-mask. */ + struct rte_bitmap *ms_bmp; /* Bit-mask of memsegs belonged to MR. */ +}; + +/* Cache entry for Memory Region. */ +struct mlx5_mr_cache { + uintptr_t start; /* Start address of MR. */ + uintptr_t end; /* End address of MR. */ + uint32_t lkey; /* rte_cpu_to_be_32(ibv_mr->lkey). */ +} __rte_packed; + +/* MR Cache table for Binary search. */ +struct mlx5_mr_btree { + uint16_t len; /* Number of entries. */ + uint16_t size; /* Total number of entries. */ + int overflow; /* Mark failure of table expansion. */ + struct mlx5_mr_cache (*table)[]; +} __rte_packed; + +/* Per-queue MR control descriptor. */ +struct mlx5_mr_ctrl { + uint32_t *dev_gen_ptr; /* Generation number of device to poll. */ + uint32_t cur_gen; /* Generation number saved to flush caches. */ + uint16_t mru; /* Index of last hit entry in top-half cache. */ + uint16_t head; /* Index of the oldest entry in top-half cache. */ + struct mlx5_mr_cache cache[MLX5_MR_CACHE_N]; /* Cache for top-half. */ + struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */ +} __rte_packed; + +extern struct mlx5_dev_list mlx5_mem_event_cb_list; +extern rte_rwlock_t mlx5_mem_event_rwlock; + +/* First entry must be NULL for comparison. */ +#define mlx5_mr_btree_len(bt) ((bt)->len - 1) + +int mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket); +void mlx5_mr_btree_free(struct mlx5_mr_btree *bt); +void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg); +int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp); +void mlx5_mr_dump_dev(struct rte_eth_dev *dev); +void mlx5_mr_release(struct rte_eth_dev *dev); + +/** + * Look up LKey from given lookup table by linear search. Firstly look up the + * last-hit entry. If miss, the entire array is searched. If found, update the + * last-hit index and return LKey. + * + * @param lkp_tbl + * Pointer to lookup table. + * @param[in,out] cached_idx + * Pointer to last-hit index. + * @param n + * Size of lookup table. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx5_mr_lookup_cache(struct mlx5_mr_cache *lkp_tbl, uint16_t *cached_idx, + uint16_t n, uintptr_t addr) +{ + uint16_t idx; + + if (likely(addr >= lkp_tbl[*cached_idx].start && + addr < lkp_tbl[*cached_idx].end)) + return lkp_tbl[*cached_idx].lkey; + for (idx = 0; idx < n && lkp_tbl[idx].start != 0; ++idx) { + if (addr >= lkp_tbl[idx].start && + addr < lkp_tbl[idx].end) { + /* Found. */ + *cached_idx = idx; + return lkp_tbl[idx].lkey; + } + } + return UINT32_MAX; +} + +#endif /* RTE_PMD_MLX5_MR_H_ */ diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c new file mode 100644 index 00000000..dca85835 --- /dev/null +++ b/drivers/net/mlx5/mlx5_nl.c @@ -0,0 +1,627 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <unistd.h> + +#include "mlx5.h" +#include "mlx5_utils.h" + +/* Size of the buffer to receive kernel messages */ +#define MLX5_NL_BUF_SIZE (32 * 1024) +/* Send buffer size for the Netlink socket */ +#define MLX5_SEND_BUF_SIZE 32768 +/* Receive buffer size for the Netlink socket */ +#define MLX5_RECV_BUF_SIZE 32768 + +/* + * Define NDA_RTA as defined in iproute2 sources. + * + * see in iproute2 sources file include/libnetlink.h + */ +#ifndef MLX5_NDA_RTA +#define MLX5_NDA_RTA(r) \ + ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg)))) +#endif + +/* Add/remove MAC address through Netlink */ +struct mlx5_nl_mac_addr { + struct ether_addr (*mac)[]; + /**< MAC address handled by the device. */ + int mac_n; /**< Number of addresses in the array. */ +}; + +/** + * Opens a Netlink socket. + * + * @param nl_groups + * Netlink group value (e.g. RTMGRP_LINK). + * + * @return + * A file descriptor on success, a negative errno value otherwise and + * rte_errno is set. + */ +int +mlx5_nl_init(uint32_t nl_groups) +{ + int fd; + int sndbuf_size = MLX5_SEND_BUF_SIZE; + int rcvbuf_size = MLX5_RECV_BUF_SIZE; + struct sockaddr_nl local = { + .nl_family = AF_NETLINK, + .nl_groups = nl_groups, + }; + int ret; + + fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (fd == -1) { + rte_errno = errno; + return -rte_errno; + } + ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int)); + if (ret == -1) { + rte_errno = errno; + goto error; + } + ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int)); + if (ret == -1) { + rte_errno = errno; + goto error; + } + ret = bind(fd, (struct sockaddr *)&local, sizeof(local)); + if (ret == -1) { + rte_errno = errno; + goto error; + } + return fd; +error: + close(fd); + return -rte_errno; +} + +/** + * Send a request message to the kernel on the Netlink socket. + * + * @param[in] nlsk_fd + * Netlink socket file descriptor. + * @param[in] nh + * The Netlink message send to the kernel. + * @param[in] ssn + * Sequence number. + * @param[in] req + * Pointer to the request structure. + * @param[in] len + * Length of the request in bytes. + * + * @return + * The number of sent bytes on success, a negative errno value otherwise and + * rte_errno is set. + */ +static int +mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req, + int len) +{ + struct sockaddr_nl sa = { + .nl_family = AF_NETLINK, + }; + struct iovec iov[2] = { + { .iov_base = nh, .iov_len = sizeof(*nh), }, + { .iov_base = req, .iov_len = len, }, + }; + struct msghdr msg = { + .msg_name = &sa, + .msg_namelen = sizeof(sa), + .msg_iov = iov, + .msg_iovlen = 2, + }; + int send_bytes; + + nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ + nh->nlmsg_seq = sn; + send_bytes = sendmsg(nlsk_fd, &msg, 0); + if (send_bytes < 0) { + rte_errno = errno; + return -rte_errno; + } + return send_bytes; +} + +/** + * Send a message to the kernel on the Netlink socket. + * + * @param[in] nlsk_fd + * The Netlink socket file descriptor used for communication. + * @param[in] nh + * The Netlink message send to the kernel. + * @param[in] sn + * Sequence number. + * + * @return + * The number of sent bytes on success, a negative errno value otherwise and + * rte_errno is set. + */ +static int +mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn) +{ + struct sockaddr_nl sa = { + .nl_family = AF_NETLINK, + }; + struct iovec iov = { + .iov_base = nh, + .iov_len = nh->nlmsg_len, + }; + struct msghdr msg = { + .msg_name = &sa, + .msg_namelen = sizeof(sa), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + int send_bytes; + + nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ + nh->nlmsg_seq = sn; + send_bytes = sendmsg(nlsk_fd, &msg, 0); + if (send_bytes < 0) { + rte_errno = errno; + return -rte_errno; + } + return send_bytes; +} + +/** + * Receive a message from the kernel on the Netlink socket, following + * mlx5_nl_send(). + * + * @param[in] nlsk_fd + * The Netlink socket file descriptor used for communication. + * @param[in] sn + * Sequence number. + * @param[in] cb + * The callback function to call for each Netlink message received. + * @param[in, out] arg + * Custom arguments for the callback. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg), + void *arg) +{ + struct sockaddr_nl sa; + char buf[MLX5_RECV_BUF_SIZE]; + struct iovec iov = { + .iov_base = buf, + .iov_len = sizeof(buf), + }; + struct msghdr msg = { + .msg_name = &sa, + .msg_namelen = sizeof(sa), + .msg_iov = &iov, + /* One message at a time */ + .msg_iovlen = 1, + }; + int multipart = 0; + int ret = 0; + + do { + struct nlmsghdr *nh; + int recv_bytes = 0; + + do { + recv_bytes = recvmsg(nlsk_fd, &msg, 0); + if (recv_bytes == -1) { + rte_errno = errno; + return -rte_errno; + } + nh = (struct nlmsghdr *)buf; + } while (nh->nlmsg_seq != sn); + for (; + NLMSG_OK(nh, (unsigned int)recv_bytes); + nh = NLMSG_NEXT(nh, recv_bytes)) { + if (nh->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err_data = NLMSG_DATA(nh); + + if (err_data->error < 0) { + rte_errno = -err_data->error; + return -rte_errno; + } + /* Ack message. */ + return 0; + } + /* Multi-part msgs and their trailing DONE message. */ + if (nh->nlmsg_flags & NLM_F_MULTI) { + if (nh->nlmsg_type == NLMSG_DONE) + return 0; + multipart = 1; + } + if (cb) { + ret = cb(nh, arg); + if (ret < 0) + return ret; + } + } + } while (multipart); + return ret; +} + +/** + * Parse Netlink message to retrieve the bridge MAC address. + * + * @param nh + * Pointer to Netlink Message Header. + * @param arg + * PMD data register with this callback. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg) +{ + struct mlx5_nl_mac_addr *data = arg; + struct ndmsg *r = NLMSG_DATA(nh); + struct rtattr *attribute; + int len; + + len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)); + for (attribute = MLX5_NDA_RTA(r); + RTA_OK(attribute, len); + attribute = RTA_NEXT(attribute, len)) { + if (attribute->rta_type == NDA_LLADDR) { + if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) { + DRV_LOG(WARNING, + "not enough room to finalize the" + " request"); + rte_errno = ENOMEM; + return -rte_errno; + } +#ifndef NDEBUG + char m[18]; + + ether_format_addr(m, 18, RTA_DATA(attribute)); + DRV_LOG(DEBUG, "bridge MAC address %s", m); +#endif + memcpy(&(*data->mac)[data->mac_n++], + RTA_DATA(attribute), ETHER_ADDR_LEN); + } + } + return 0; +} + +/** + * Get bridge MAC addresses. + * + * @param dev + * Pointer to Ethernet device. + * @param mac[out] + * Pointer to the array table of MAC addresses to fill. + * Its size should be of MLX5_MAX_MAC_ADDRESSES. + * @param mac_n[out] + * Number of entries filled in MAC array. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[], + int *mac_n) +{ + struct priv *priv = dev->data->dev_private; + int iface_idx = mlx5_ifindex(dev); + struct { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + } req = { + .hdr = { + .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlmsg_type = RTM_GETNEIGH, + .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + }, + .ifm = { + .ifi_family = PF_BRIDGE, + .ifi_index = iface_idx, + }, + }; + struct mlx5_nl_mac_addr data = { + .mac = mac, + .mac_n = 0, + }; + int fd; + int ret; + uint32_t sn = priv->nl_sn++; + + if (priv->nl_socket == -1) + return 0; + fd = priv->nl_socket; + ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm, + sizeof(struct ifinfomsg)); + if (ret < 0) + goto error; + ret = mlx5_nl_recv(fd, sn, mlx5_nl_mac_addr_cb, &data); + if (ret < 0) + goto error; + *mac_n = data.mac_n; + return 0; +error: + DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s", + dev->data->port_id, strerror(rte_errno)); + return -rte_errno; +} + +/** + * Modify the MAC address neighbour table with Netlink. + * + * @param dev + * Pointer to Ethernet device. + * @param mac + * MAC address to consider. + * @param add + * 1 to add the MAC address, 0 to remove the MAC address. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac, + int add) +{ + struct priv *priv = dev->data->dev_private; + int iface_idx = mlx5_ifindex(dev); + struct { + struct nlmsghdr hdr; + struct ndmsg ndm; + struct rtattr rta; + uint8_t buffer[ETHER_ADDR_LEN]; + } req = { + .hdr = { + .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), + .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | + NLM_F_EXCL | NLM_F_ACK, + .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH, + }, + .ndm = { + .ndm_family = PF_BRIDGE, + .ndm_state = NUD_NOARP | NUD_PERMANENT, + .ndm_ifindex = iface_idx, + .ndm_flags = NTF_SELF, + }, + .rta = { + .rta_type = NDA_LLADDR, + .rta_len = RTA_LENGTH(ETHER_ADDR_LEN), + }, + }; + int fd; + int ret; + uint32_t sn = priv->nl_sn++; + + if (priv->nl_socket == -1) + return 0; + fd = priv->nl_socket; + memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN); + req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + + RTA_ALIGN(req.rta.rta_len); + ret = mlx5_nl_send(fd, &req.hdr, sn); + if (ret < 0) + goto error; + ret = mlx5_nl_recv(fd, sn, NULL, NULL); + if (ret < 0) + goto error; + return 0; +error: + DRV_LOG(DEBUG, + "port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X" + " %s", + dev->data->port_id, + add ? "add" : "remove", + mac->addr_bytes[0], mac->addr_bytes[1], + mac->addr_bytes[2], mac->addr_bytes[3], + mac->addr_bytes[4], mac->addr_bytes[5], + strerror(rte_errno)); + return -rte_errno; +} + +/** + * Add a MAC address. + * + * @param dev + * Pointer to Ethernet device. + * @param mac + * MAC address to register. + * @param index + * MAC address index. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index) +{ + struct priv *priv = dev->data->dev_private; + int ret; + + ret = mlx5_nl_mac_addr_modify(dev, mac, 1); + if (!ret) + BITFIELD_SET(priv->mac_own, index); + if (ret == -EEXIST) + return 0; + return ret; +} + +/** + * Remove a MAC address. + * + * @param dev + * Pointer to Ethernet device. + * @param mac + * MAC address to remove. + * @param index + * MAC address index. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index) +{ + struct priv *priv = dev->data->dev_private; + + BITFIELD_RESET(priv->mac_own, index); + return mlx5_nl_mac_addr_modify(dev, mac, 0); +} + +/** + * Synchronize Netlink bridge table to the internal table. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev) +{ + struct ether_addr macs[MLX5_MAX_MAC_ADDRESSES]; + int macs_n = 0; + int i; + int ret; + + ret = mlx5_nl_mac_addr_list(dev, &macs, &macs_n); + if (ret) + return; + for (i = 0; i != macs_n; ++i) { + int j; + + /* Verify the address is not in the array yet. */ + for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) + if (is_same_ether_addr(&macs[i], + &dev->data->mac_addrs[j])) + break; + if (j != MLX5_MAX_MAC_ADDRESSES) + continue; + /* Find the first entry available. */ + for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) { + if (is_zero_ether_addr(&dev->data->mac_addrs[j])) { + dev->data->mac_addrs[j] = macs[i]; + break; + } + } + } +} + +/** + * Flush all added MAC addresses. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + int i; + + for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) { + struct ether_addr *m = &dev->data->mac_addrs[i]; + + if (BITFIELD_ISSET(priv->mac_own, i)) + mlx5_nl_mac_addr_remove(dev, m, i); + } +} + +/** + * Enable promiscuous / all multicast mode through Netlink. + * + * @param dev + * Pointer to Ethernet device structure. + * @param flags + * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti. + * @param enable + * Nonzero to enable, disable otherwise. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable) +{ + struct priv *priv = dev->data->dev_private; + int iface_idx = mlx5_ifindex(dev); + struct { + struct nlmsghdr hdr; + struct ifinfomsg ifi; + } req = { + .hdr = { + .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlmsg_type = RTM_NEWLINK, + .nlmsg_flags = NLM_F_REQUEST, + }, + .ifi = { + .ifi_flags = enable ? flags : 0, + .ifi_change = flags, + .ifi_index = iface_idx, + }, + }; + int fd; + int ret; + + assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI))); + if (priv->nl_socket < 0) + return 0; + fd = priv->nl_socket; + ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++); + if (ret < 0) + return ret; + return 0; +} + +/** + * Enable promiscuous mode through Netlink. + * + * @param dev + * Pointer to Ethernet device structure. + * @param enable + * Nonzero to enable, disable otherwise. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_promisc(struct rte_eth_dev *dev, int enable) +{ + int ret = mlx5_nl_device_flags(dev, IFF_PROMISC, enable); + + if (ret) + DRV_LOG(DEBUG, + "port %u cannot %s promisc mode: Netlink error %s", + dev->data->port_id, enable ? "enable" : "disable", + strerror(rte_errno)); + return ret; +} + +/** + * Enable all multicast mode through Netlink. + * + * @param dev + * Pointer to Ethernet device structure. + * @param enable + * Nonzero to enable, disable otherwise. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable) +{ + int ret = mlx5_nl_device_flags(dev, IFF_ALLMULTI, enable); + + if (ret) + DRV_LOG(DEBUG, + "port %u cannot %s allmulti mode: Netlink error %s", + dev->data->port_id, enable ? "enable" : "disable", + strerror(rte_errno)); + return ret; +} diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h index 9eb9c15e..0cf370cd 100644 --- a/drivers/net/mlx5/mlx5_prm.h +++ b/drivers/net/mlx5/mlx5_prm.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2016 6WIND S.A. - * Copyright 2016 Mellanox. + * Copyright 2016 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_PRM_H_ @@ -107,6 +107,30 @@ /* Inner L4 checksum offload (Tunneled packets only). */ #define MLX5_ETH_WQE_L4_INNER_CSUM (1u << 5) +/* Outer L4 type is TCP. */ +#define MLX5_ETH_WQE_L4_OUTER_TCP (0u << 5) + +/* Outer L4 type is UDP. */ +#define MLX5_ETH_WQE_L4_OUTER_UDP (1u << 5) + +/* Outer L3 type is IPV4. */ +#define MLX5_ETH_WQE_L3_OUTER_IPV4 (0u << 4) + +/* Outer L3 type is IPV6. */ +#define MLX5_ETH_WQE_L3_OUTER_IPV6 (1u << 4) + +/* Inner L4 type is TCP. */ +#define MLX5_ETH_WQE_L4_INNER_TCP (0u << 1) + +/* Inner L4 type is UDP. */ +#define MLX5_ETH_WQE_L4_INNER_UDP (1u << 1) + +/* Inner L3 type is IPV4. */ +#define MLX5_ETH_WQE_L3_INNER_IPV4 (0u << 0) + +/* Inner L3 type is IPV6. */ +#define MLX5_ETH_WQE_L3_INNER_IPV6 (1u << 0) + /* Is flow mark valid. */ #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN #define MLX5_FLOW_MARK_IS_VALID(val) ((val) & 0xffffff00) @@ -195,6 +219,21 @@ struct mlx5_mpw { } data; }; +/* WQE for Multi-Packet RQ. */ +struct mlx5_wqe_mprq { + struct mlx5_wqe_srq_next_seg next_seg; + struct mlx5_wqe_data_seg dseg; +}; + +#define MLX5_MPRQ_LEN_MASK 0x000ffff +#define MLX5_MPRQ_LEN_SHIFT 0 +#define MLX5_MPRQ_STRIDE_NUM_MASK 0x3fff0000 +#define MLX5_MPRQ_STRIDE_NUM_SHIFT 16 +#define MLX5_MPRQ_FILLER_MASK 0x80000000 +#define MLX5_MPRQ_FILLER_SHIFT 31 + +#define MLX5_MPRQ_STRIDE_SHIFT_BYTE 2 + /* CQ element structure - should be equal to the cache line size */ struct mlx5_cqe { #if (RTE_CACHE_LINE_SIZE == 128) diff --git a/drivers/net/mlx5/mlx5_rss.c b/drivers/net/mlx5/mlx5_rss.c index d06b0bee..d69b4c09 100644 --- a/drivers/net/mlx5/mlx5_rss.c +++ b/drivers/net/mlx5/mlx5_rss.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -35,35 +35,48 @@ * RSS configuration data. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rss_hash_update(struct rte_eth_dev *dev, struct rte_eth_rss_conf *rss_conf) { struct priv *priv = dev->data->dev_private; - int ret = 0; + unsigned int i; + unsigned int idx; - priv_lock(priv); if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) { - ret = -EINVAL; - goto out; + rte_errno = EINVAL; + return -rte_errno; } if (rss_conf->rss_key && rss_conf->rss_key_len) { + if (rss_conf->rss_key_len != rss_hash_default_key_len) { + DRV_LOG(ERR, + "port %u RSS key len must be %zu Bytes long", + dev->data->port_id, rss_hash_default_key_len); + rte_errno = EINVAL; + return -rte_errno; + } priv->rss_conf.rss_key = rte_realloc(priv->rss_conf.rss_key, rss_conf->rss_key_len, 0); if (!priv->rss_conf.rss_key) { - ret = -ENOMEM; - goto out; + rte_errno = ENOMEM; + return -rte_errno; } memcpy(priv->rss_conf.rss_key, rss_conf->rss_key, rss_conf->rss_key_len); priv->rss_conf.rss_key_len = rss_conf->rss_key_len; } priv->rss_conf.rss_hf = rss_conf->rss_hf; -out: - priv_unlock(priv); - return ret; + /* Enable the RSS hash in all Rx queues. */ + for (i = 0, idx = 0; idx != priv->rxqs_n; ++i) { + if (!(*priv->rxqs)[i]) + continue; + (*priv->rxqs)[i]->rss_hash = !!rss_conf->rss_hf && + !!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS); + ++idx; + } + return 0; } /** @@ -75,7 +88,7 @@ out: * RSS configuration data. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, @@ -83,9 +96,10 @@ mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, { struct priv *priv = dev->data->dev_private; - if (!rss_conf) - return -EINVAL; - priv_lock(priv); + if (!rss_conf) { + rte_errno = EINVAL; + return -rte_errno; + } if (rss_conf->rss_key && (rss_conf->rss_key_len >= priv->rss_conf.rss_key_len)) { memcpy(rss_conf->rss_key, priv->rss_conf.rss_key, @@ -93,24 +107,24 @@ mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, } rss_conf->rss_key_len = priv->rss_conf.rss_key_len; rss_conf->rss_hf = priv->rss_conf.rss_hf; - priv_unlock(priv); return 0; } /** * Allocate/reallocate RETA index table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @praram reta_size * The size of the array to allocate. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_rss_reta_index_resize(struct priv *priv, unsigned int reta_size) +mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size) { + struct priv *priv = dev->data->dev_private; void *mem; unsigned int old_size = priv->reta_idx_n; @@ -119,11 +133,12 @@ priv_rss_reta_index_resize(struct priv *priv, unsigned int reta_size) mem = rte_realloc(priv->reta_idx, reta_size * sizeof((*priv->reta_idx)[0]), 0); - if (!mem) - return ENOMEM; + if (!mem) { + rte_errno = ENOMEM; + return -rte_errno; + } priv->reta_idx = mem; priv->reta_idx_n = reta_size; - if (old_size < reta_size) memset(&(*priv->reta_idx)[old_size], 0, (reta_size - old_size) * @@ -132,28 +147,31 @@ priv_rss_reta_index_resize(struct priv *priv, unsigned int reta_size) } /** - * Query RETA table. + * DPDK callback to get the RETA indirection table. * - * @param priv - * Pointer to private structure. - * @param[in, out] reta_conf - * Pointer to the first RETA configuration structure. + * @param dev + * Pointer to Ethernet device structure. + * @param reta_conf + * Pointer to RETA configuration structure array. * @param reta_size - * Number of entries. + * Size of the RETA table. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -priv_dev_rss_reta_query(struct priv *priv, +int +mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, struct rte_eth_rss_reta_entry64 *reta_conf, - unsigned int reta_size) + uint16_t reta_size) { + struct priv *priv = dev->data->dev_private; unsigned int idx; unsigned int i; - if (!reta_size || reta_size > priv->reta_idx_n) - return EINVAL; + if (!reta_size || reta_size > priv->reta_idx_n) { + rte_errno = EINVAL; + return -rte_errno; + } /* Fill each entry of the table even if its bit is not set. */ for (idx = 0, i = 0; (i != reta_size); ++i) { idx = i / RTE_RETA_GROUP_SIZE; @@ -164,34 +182,36 @@ priv_dev_rss_reta_query(struct priv *priv, } /** - * Update RETA table. + * DPDK callback to update the RETA indirection table. * - * @param priv - * Pointer to private structure. - * @param[in] reta_conf - * Pointer to the first RETA configuration structure. + * @param dev + * Pointer to Ethernet device structure. + * @param reta_conf + * Pointer to RETA configuration structure array. * @param reta_size - * Number of entries. + * Size of the RETA table. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -priv_dev_rss_reta_update(struct priv *priv, +int +mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, struct rte_eth_rss_reta_entry64 *reta_conf, - unsigned int reta_size) + uint16_t reta_size) { + int ret; + struct priv *priv = dev->data->dev_private; unsigned int idx; unsigned int i; unsigned int pos; - int ret; - if (!reta_size) - return EINVAL; - ret = priv_rss_reta_index_resize(priv, reta_size); + if (!reta_size) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_rss_reta_index_resize(dev, reta_size); if (ret) return ret; - for (idx = 0, i = 0; (i != reta_size); ++i) { idx = i / RTE_RETA_GROUP_SIZE; pos = i % RTE_RETA_GROUP_SIZE; @@ -200,63 +220,9 @@ priv_dev_rss_reta_update(struct priv *priv, assert(reta_conf[idx].reta[pos] < priv->rxqs_n); (*priv->reta_idx)[i] = reta_conf[idx].reta[pos]; } - return 0; -} - -/** - * DPDK callback to get the RETA indirection table. - * - * @param dev - * Pointer to Ethernet device structure. - * @param reta_conf - * Pointer to RETA configuration structure array. - * @param reta_size - * Size of the RETA table. - * - * @return - * 0 on success, negative errno value on failure. - */ -int -mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, - struct rte_eth_rss_reta_entry64 *reta_conf, - uint16_t reta_size) -{ - int ret; - struct priv *priv = dev->data->dev_private; - - priv_lock(priv); - ret = priv_dev_rss_reta_query(priv, reta_conf, reta_size); - priv_unlock(priv); - return -ret; -} - -/** - * DPDK callback to update the RETA indirection table. - * - * @param dev - * Pointer to Ethernet device structure. - * @param reta_conf - * Pointer to RETA configuration structure array. - * @param reta_size - * Size of the RETA table. - * - * @return - * 0 on success, negative errno value on failure. - */ -int -mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, - struct rte_eth_rss_reta_entry64 *reta_conf, - uint16_t reta_size) -{ - int ret; - struct priv *priv = dev->data->dev_private; - - priv_lock(priv); - ret = priv_dev_rss_reta_update(priv, reta_conf, reta_size); - priv_unlock(priv); if (dev->data->dev_started) { mlx5_dev_stop(dev); - mlx5_dev_start(dev); + return mlx5_dev_start(dev); } - return -ret; + return 0; } diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c index 4ffc869a..80824bc4 100644 --- a/drivers/net/mlx5/mlx5_rxmode.c +++ b/drivers/net/mlx5/mlx5_rxmode.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -32,8 +32,15 @@ void mlx5_promiscuous_enable(struct rte_eth_dev *dev) { + int ret; + dev->data->promiscuous = 1; - mlx5_traffic_restart(dev); + if (((struct priv *)dev->data->dev_private)->config.vf) + mlx5_nl_promisc(dev, 1); + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot enable promiscuous mode: %s", + dev->data->port_id, strerror(rte_errno)); } /** @@ -45,8 +52,15 @@ mlx5_promiscuous_enable(struct rte_eth_dev *dev) void mlx5_promiscuous_disable(struct rte_eth_dev *dev) { + int ret; + dev->data->promiscuous = 0; - mlx5_traffic_restart(dev); + if (((struct priv *)dev->data->dev_private)->config.vf) + mlx5_nl_promisc(dev, 0); + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot disable promiscuous mode: %s", + dev->data->port_id, strerror(rte_errno)); } /** @@ -58,8 +72,15 @@ mlx5_promiscuous_disable(struct rte_eth_dev *dev) void mlx5_allmulticast_enable(struct rte_eth_dev *dev) { + int ret; + dev->data->all_multicast = 1; - mlx5_traffic_restart(dev); + if (((struct priv *)dev->data->dev_private)->config.vf) + mlx5_nl_allmulti(dev, 1); + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot enable allmulicast mode: %s", + dev->data->port_id, strerror(rte_errno)); } /** @@ -71,6 +92,13 @@ mlx5_allmulticast_enable(struct rte_eth_dev *dev) void mlx5_allmulticast_disable(struct rte_eth_dev *dev) { + int ret; + dev->data->all_multicast = 0; - mlx5_traffic_restart(dev); + if (((struct priv *)dev->data->dev_private)->config.vf) + mlx5_nl_allmulti(dev, 0); + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot disable allmulicast mode: %s", + dev->data->port_id, strerror(rte_errno)); } diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index ff58c492..de3f869e 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -55,7 +55,124 @@ uint8_t rss_hash_default_key[] = { const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key); /** - * Allocate RX queue elements. + * Check whether Multi-Packet RQ can be enabled for the device. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 1 if supported, negative errno value if not. + */ +inline int +mlx5_check_mprq_support(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + + if (priv->config.mprq.enabled && + priv->rxqs_n >= priv->config.mprq.min_rxqs_num) + return 1; + return -ENOTSUP; +} + +/** + * Check whether Multi-Packet RQ is enabled for the Rx queue. + * + * @param rxq + * Pointer to receive queue structure. + * + * @return + * 0 if disabled, otherwise enabled. + */ +inline int +mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq) +{ + return rxq->strd_num_n > 0; +} + +/** + * Check whether Multi-Packet RQ is enabled for the device. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 if disabled, otherwise enabled. + */ +inline int +mlx5_mprq_enabled(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + uint16_t i; + uint16_t n = 0; + + if (mlx5_check_mprq_support(dev) < 0) + return 0; + /* All the configured queues should be enabled. */ + for (i = 0; i < priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (!rxq) + continue; + if (mlx5_rxq_mprq_enabled(rxq)) + ++n; + } + /* Multi-Packet RQ can't be partially configured. */ + assert(n == 0 || n == priv->rxqs_n); + return n == priv->rxqs_n; +} + +/** + * Allocate RX queue elements for Multi-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + unsigned int wqe_n = 1 << rxq->elts_n; + unsigned int i; + int err; + + /* Iterate on segments. */ + for (i = 0; i <= wqe_n; ++i) { + struct mlx5_mprq_buf *buf; + + if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) { + DRV_LOG(ERR, "port %u empty mbuf pool", rxq->port_id); + rte_errno = ENOMEM; + goto error; + } + if (i < wqe_n) + (*rxq->mprq_bufs)[i] = buf; + else + rxq->mprq_repl = buf; + } + DRV_LOG(DEBUG, + "port %u Rx queue %u allocated and configured %u segments", + rxq->port_id, rxq_ctrl->idx, wqe_n); + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + wqe_n = i; + for (i = 0; (i != wqe_n); ++i) { + if ((*rxq->mprq_bufs)[i] != NULL) + rte_mempool_put(rxq->mprq_mp, + (*rxq->mprq_bufs)[i]); + (*rxq->mprq_bufs)[i] = NULL; + } + DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + rxq->port_id, rxq_ctrl->idx); + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Allocate RX queue elements for Single-Packet RQ. * * @param rxq_ctrl * Pointer to RX queue structure. @@ -63,13 +180,13 @@ const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key); * @return * 0 on success, errno value on failure. */ -int -rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +static int +rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) { const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n; unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n; unsigned int i; - int ret = 0; + int err; /* Iterate on segments. */ for (i = 0; (i != elts_n); ++i) { @@ -77,8 +194,9 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp); if (buf == NULL) { - ERROR("%p: empty mbuf pool", (void *)rxq_ctrl); - ret = ENOMEM; + DRV_LOG(ERR, "port %u empty mbuf pool", + PORT_ID(rxq_ctrl->priv)); + rte_errno = ENOMEM; goto error; } /* Headroom is reserved by rte_pktmbuf_alloc(). */ @@ -97,7 +215,7 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) (*rxq_ctrl->rxq.elts)[i] = buf; } /* If Rx vector is activated. */ - if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) { + if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) { struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; struct rte_mbuf *mbuf_init = &rxq->fake_mbuf; int j; @@ -118,30 +236,78 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j) (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf; } - DEBUG("%p: allocated and configured %u segments (max %u packets)", - (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n)); - assert(ret == 0); + DRV_LOG(DEBUG, + "port %u Rx queue %u allocated and configured %u segments" + " (max %u packets)", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx, elts_n, + elts_n / (1 << rxq_ctrl->rxq.sges_n)); return 0; error: + err = rte_errno; /* Save rte_errno before cleanup. */ elts_n = i; for (i = 0; (i != elts_n); ++i) { if ((*rxq_ctrl->rxq.elts)[i] != NULL) rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); (*rxq_ctrl->rxq.elts)[i] = NULL; } - DEBUG("%p: failed, freed everything", (void *)rxq_ctrl); - assert(ret > 0); - return ret; + DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx); + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; } /** - * Free RX queue elements. + * Allocate RX queue elements. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, errno value on failure. + */ +int +rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl); +} + +/** + * Free RX queue elements for Multi-Packet RQ. * * @param rxq_ctrl * Pointer to RX queue structure. */ static void -rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + uint16_t i; + + DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs", + rxq->port_id, rxq_ctrl->idx); + if (rxq->mprq_bufs == NULL) + return; + assert(mlx5_rxq_check_vec_support(rxq) < 0); + for (i = 0; (i != (1u << rxq->elts_n)); ++i) { + if ((*rxq->mprq_bufs)[i] != NULL) + mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]); + (*rxq->mprq_bufs)[i] = NULL; + } + if (rxq->mprq_repl != NULL) { + mlx5_mprq_buf_free(rxq->mprq_repl); + rxq->mprq_repl = NULL; + } +} + +/** + * Free RX queue elements for Single-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) { struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; const uint16_t q_n = (1 << rxq->elts_n); @@ -149,14 +315,15 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi); uint16_t i; - DEBUG("%p: freeing WRs", (void *)rxq_ctrl); + DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx); if (rxq->elts == NULL) return; /** * Some mbuf in the Ring belongs to the application. They cannot be * freed. */ - if (rxq_check_vec_support(rxq) > 0) { + if (mlx5_rxq_check_vec_support(rxq) > 0) { for (i = 0; i < used; ++i) (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL; rxq->rq_pi = rxq->rq_ci; @@ -169,6 +336,21 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) } /** + * Free RX queue elements. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) + rxq_free_elts_mprq(rxq_ctrl); + else + rxq_free_elts_sprq(rxq_ctrl); +} + +/** * Clean up a RX queue. * * Destroy objects, free allocated memory and reset the structure for reuse. @@ -179,24 +361,26 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl) { - DEBUG("cleaning up %p", (void *)rxq_ctrl); + DRV_LOG(DEBUG, "port %u cleaning up Rx queue %u", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx); if (rxq_ctrl->ibv) - mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv); + mlx5_rxq_ibv_release(rxq_ctrl->ibv); memset(rxq_ctrl, 0, sizeof(*rxq_ctrl)); } /** * Returns the per-queue supported offloads. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return * Supported Rx offloads. */ uint64_t -mlx5_priv_get_rx_queue_offloads(struct priv *priv) +mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_dev_config *config = &priv->config; uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER | DEV_RX_OFFLOAD_TIMESTAMP | @@ -217,13 +401,11 @@ mlx5_priv_get_rx_queue_offloads(struct priv *priv) /** * Returns the per-port supported offloads. * - * @param priv - * Pointer to private structure. * @return * Supported Rx offloads. */ uint64_t -mlx5_priv_get_rx_port_offloads(struct priv *priv __rte_unused) +mlx5_get_rx_port_offloads(void) { uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER; @@ -231,33 +413,6 @@ mlx5_priv_get_rx_port_offloads(struct priv *priv __rte_unused) } /** - * Checks if the per-queue offload configuration is valid. - * - * @param priv - * Pointer to private structure. - * @param offloads - * Per-queue offloads configuration. - * - * @return - * 1 if the configuration is valid, 0 otherwise. - */ -static int -priv_is_rx_queue_offloads_allowed(struct priv *priv, uint64_t offloads) -{ - uint64_t port_offloads = priv->dev->data->dev_conf.rxmode.offloads; - uint64_t queue_supp_offloads = - mlx5_priv_get_rx_queue_offloads(priv); - uint64_t port_supp_offloads = mlx5_priv_get_rx_port_offloads(priv); - - if ((offloads & (queue_supp_offloads | port_supp_offloads)) != - offloads) - return 0; - if (((port_offloads ^ offloads) & port_supp_offloads)) - return 0; - return 1; -} - -/** * * @param dev * Pointer to Ethernet device structure. @@ -273,7 +428,7 @@ priv_is_rx_queue_offloads_allowed(struct priv *priv, uint64_t offloads) * Memory pool for buffer allocations. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, @@ -284,53 +439,40 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx]; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); - int ret = 0; - priv_lock(priv); if (!rte_is_power_of_2(desc)) { desc = 1 << log2above(desc); - WARN("%p: increased number of descriptors in RX queue %u" - " to the next power of two (%d)", - (void *)dev, idx, desc); + DRV_LOG(WARNING, + "port %u increased number of descriptors in Rx queue %u" + " to the next power of two (%d)", + dev->data->port_id, idx, desc); } - DEBUG("%p: configuring queue %u for %u descriptors", - (void *)dev, idx, desc); + DRV_LOG(DEBUG, "port %u configuring Rx queue %u for %u descriptors", + dev->data->port_id, idx, desc); if (idx >= priv->rxqs_n) { - ERROR("%p: queue index out of range (%u >= %u)", - (void *)dev, idx, priv->rxqs_n); - priv_unlock(priv); - return -EOVERFLOW; - } - if (!priv_is_rx_queue_offloads_allowed(priv, conf->offloads)) { - ret = ENOTSUP; - ERROR("%p: Rx queue offloads 0x%" PRIx64 " don't match port " - "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64, - (void *)dev, conf->offloads, - dev->data->dev_conf.rxmode.offloads, - (mlx5_priv_get_rx_port_offloads(priv) | - mlx5_priv_get_rx_queue_offloads(priv))); - goto out; - } - if (!mlx5_priv_rxq_releasable(priv, idx)) { - ret = EBUSY; - ERROR("%p: unable to release queue index %u", - (void *)dev, idx); - goto out; - } - mlx5_priv_rxq_release(priv, idx); - rxq_ctrl = mlx5_priv_rxq_new(priv, idx, desc, socket, conf, mp); + DRV_LOG(ERR, "port %u Rx queue index out of range (%u >= %u)", + dev->data->port_id, idx, priv->rxqs_n); + rte_errno = EOVERFLOW; + return -rte_errno; + } + if (!mlx5_rxq_releasable(dev, idx)) { + DRV_LOG(ERR, "port %u unable to release queue index %u", + dev->data->port_id, idx); + rte_errno = EBUSY; + return -rte_errno; + } + mlx5_rxq_release(dev, idx); + rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp); if (!rxq_ctrl) { - ERROR("%p: unable to allocate queue index %u", - (void *)dev, idx); - ret = ENOMEM; - goto out; + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + rte_errno = ENOMEM; + return -rte_errno; } - DEBUG("%p: adding RX queue %p to list", - (void *)dev, (void *)rxq_ctrl); + DRV_LOG(DEBUG, "port %u adding Rx queue %u to list", + dev->data->port_id, idx); (*priv->rxqs)[idx] = &rxq_ctrl->rxq; -out: - priv_unlock(priv); - return -ret; + return 0; } /** @@ -350,45 +492,48 @@ mlx5_rx_queue_release(void *dpdk_rxq) return; rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); priv = rxq_ctrl->priv; - priv_lock(priv); - if (!mlx5_priv_rxq_releasable(priv, rxq_ctrl->rxq.stats.idx)) - rte_panic("Rx queue %p is still used by a flow and cannot be" - " removed\n", (void *)rxq_ctrl); - mlx5_priv_rxq_release(priv, rxq_ctrl->rxq.stats.idx); - priv_unlock(priv); + if (!mlx5_rxq_releasable(ETH_DEV(priv), rxq_ctrl->rxq.stats.idx)) + rte_panic("port %u Rx queue %u is still used by a flow and" + " cannot be removed\n", + PORT_ID(priv), rxq_ctrl->idx); + mlx5_rxq_release(ETH_DEV(priv), rxq_ctrl->rxq.stats.idx); } /** * Allocate queue vector and fill epoll fd list for Rx interrupts. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return - * 0 on success, negative on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_rx_intr_vec_enable(struct priv *priv) +mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; unsigned int rxqs_n = priv->rxqs_n; unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); unsigned int count = 0; - struct rte_intr_handle *intr_handle = priv->dev->intr_handle; + struct rte_intr_handle *intr_handle = dev->intr_handle; - if (!priv->dev->data->dev_conf.intr_conf.rxq) + if (!dev->data->dev_conf.intr_conf.rxq) return 0; - priv_rx_intr_vec_disable(priv); + mlx5_rx_intr_vec_disable(dev); intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0])); if (intr_handle->intr_vec == NULL) { - ERROR("failed to allocate memory for interrupt vector," - " Rx interrupts will not be supported"); - return -ENOMEM; + DRV_LOG(ERR, + "port %u failed to allocate memory for interrupt" + " vector, Rx interrupts will not be supported", + dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; } intr_handle->type = RTE_INTR_HANDLE_EXT; for (i = 0; i != n; ++i) { /* This rxq ibv must not be released in this function. */ - struct mlx5_rxq_ibv *rxq_ibv = mlx5_priv_rxq_ibv_get(priv, i); + struct mlx5_rxq_ibv *rxq_ibv = mlx5_rxq_ibv_get(dev, i); int fd; int flags; int rc; @@ -402,27 +547,34 @@ priv_rx_intr_vec_enable(struct priv *priv) continue; } if (count >= RTE_MAX_RXTX_INTR_VEC_ID) { - ERROR("too many Rx queues for interrupt vector size" - " (%d), Rx interrupts cannot be enabled", - RTE_MAX_RXTX_INTR_VEC_ID); - priv_rx_intr_vec_disable(priv); - return -1; + DRV_LOG(ERR, + "port %u too many Rx queues for interrupt" + " vector size (%d), Rx interrupts cannot be" + " enabled", + dev->data->port_id, RTE_MAX_RXTX_INTR_VEC_ID); + mlx5_rx_intr_vec_disable(dev); + rte_errno = ENOMEM; + return -rte_errno; } fd = rxq_ibv->channel->fd; flags = fcntl(fd, F_GETFL); rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK); if (rc < 0) { - ERROR("failed to make Rx interrupt file descriptor" - " %d non-blocking for queue index %d", fd, i); - priv_rx_intr_vec_disable(priv); - return -1; + rte_errno = errno; + DRV_LOG(ERR, + "port %u failed to make Rx interrupt file" + " descriptor %d non-blocking for queue index" + " %d", + dev->data->port_id, fd, i); + mlx5_rx_intr_vec_disable(dev); + return -rte_errno; } intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count; intr_handle->efds[count] = fd; count++; } if (!count) - priv_rx_intr_vec_disable(priv); + mlx5_rx_intr_vec_disable(dev); else intr_handle->nb_efd = count; return 0; @@ -431,18 +583,19 @@ priv_rx_intr_vec_enable(struct priv *priv) /** * Clean up Rx interrupts handler. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. */ void -priv_rx_intr_vec_disable(struct priv *priv) +mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev) { - struct rte_intr_handle *intr_handle = priv->dev->intr_handle; + struct priv *priv = dev->data->dev_private; + struct rte_intr_handle *intr_handle = dev->intr_handle; unsigned int i; unsigned int rxqs_n = priv->rxqs_n; unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); - if (!priv->dev->data->dev_conf.intr_conf.rxq) + if (!dev->data->dev_conf.intr_conf.rxq) return; if (!intr_handle->intr_vec) goto free; @@ -459,7 +612,7 @@ priv_rx_intr_vec_disable(struct priv *priv) */ rxq_data = (*priv->rxqs)[i]; rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); - mlx5_priv_rxq_ibv_release(priv, rxq_ctrl->ibv); + mlx5_rxq_ibv_release(rxq_ctrl->ibv); } free: rte_intr_free_epoll_fd(intr_handle); @@ -502,7 +655,7 @@ mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq) * Rx queue number. * * @return - * 0 on success, negative on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) @@ -510,31 +663,25 @@ mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) struct priv *priv = dev->data->dev_private; struct mlx5_rxq_data *rxq_data; struct mlx5_rxq_ctrl *rxq_ctrl; - int ret = 0; - priv_lock(priv); rxq_data = (*priv->rxqs)[rx_queue_id]; if (!rxq_data) { - ret = EINVAL; - goto exit; + rte_errno = EINVAL; + return -rte_errno; } rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); if (rxq_ctrl->irq) { struct mlx5_rxq_ibv *rxq_ibv; - rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id); + rxq_ibv = mlx5_rxq_ibv_get(dev, rx_queue_id); if (!rxq_ibv) { - ret = EINVAL; - goto exit; + rte_errno = EINVAL; + return -rte_errno; } mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn); - mlx5_priv_rxq_ibv_release(priv, rxq_ibv); + mlx5_rxq_ibv_release(rxq_ibv); } -exit: - priv_unlock(priv); - if (ret) - WARN("unable to arm interrupt on rx queue %d", rx_queue_id); - return -ret; + return 0; } /** @@ -546,7 +693,7 @@ exit: * Rx queue number. * * @return - * 0 on success, negative on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) @@ -557,53 +704,54 @@ mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) struct mlx5_rxq_ibv *rxq_ibv = NULL; struct ibv_cq *ev_cq; void *ev_ctx; - int ret = 0; + int ret; - priv_lock(priv); rxq_data = (*priv->rxqs)[rx_queue_id]; if (!rxq_data) { - ret = EINVAL; - goto exit; + rte_errno = EINVAL; + return -rte_errno; } rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); if (!rxq_ctrl->irq) - goto exit; - rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id); + return 0; + rxq_ibv = mlx5_rxq_ibv_get(dev, rx_queue_id); if (!rxq_ibv) { - ret = EINVAL; - goto exit; + rte_errno = EINVAL; + return -rte_errno; } ret = mlx5_glue->get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx); if (ret || ev_cq != rxq_ibv->cq) { - ret = EINVAL; + rte_errno = EINVAL; goto exit; } rxq_data->cq_arm_sn++; mlx5_glue->ack_cq_events(rxq_ibv->cq, 1); + return 0; exit: + ret = rte_errno; /* Save rte_errno before cleanup. */ if (rxq_ibv) - mlx5_priv_rxq_ibv_release(priv, rxq_ibv); - priv_unlock(priv); - if (ret) - WARN("unable to disable interrupt on rx queue %d", - rx_queue_id); - return -ret; + mlx5_rxq_ibv_release(rxq_ibv); + DRV_LOG(WARNING, "port %u unable to disable interrupt on Rx queue %d", + dev->data->port_id, rx_queue_id); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** * Create the Rx queue Verbs object. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * Queue index in DPDK Rx queue array * * @return - * The Verbs object initialised if it can be created. + * The Verbs object initialised, NULL otherwise and rte_errno is set. */ -struct mlx5_rxq_ibv* -mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) +struct mlx5_rxq_ibv * +mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx]; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); @@ -613,10 +761,16 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) struct ibv_cq_init_attr_ex ibv; struct mlx5dv_cq_init_attr mlx5; } cq; - struct ibv_wq_init_attr wq; + struct { + struct ibv_wq_init_attr ibv; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + struct mlx5dv_wq_init_attr mlx5; +#endif + } wq; struct ibv_cq_ex cq_attr; } attr; - unsigned int cqe_n = (1 << rxq_data->elts_n) - 1; + unsigned int cqe_n; + unsigned int wqe_n = 1 << rxq_data->elts_n; struct mlx5_rxq_ibv *tmpl; struct mlx5dv_cq cq_info; struct mlx5dv_rwq rwq; @@ -624,6 +778,7 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) int ret = 0; struct mlx5dv_obj obj; struct mlx5_dev_config *config = &priv->config; + const int mprq_en = mlx5_rxq_mprq_enabled(rxq_data); assert(rxq_data); assert(!rxq_ctrl->ibv); @@ -632,28 +787,26 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0, rxq_ctrl->socket); if (!tmpl) { - ERROR("%p: cannot allocate verbs resources", - (void *)rxq_ctrl); + DRV_LOG(ERR, + "port %u Rx queue %u cannot allocate verbs resources", + dev->data->port_id, rxq_ctrl->idx); + rte_errno = ENOMEM; goto error; } tmpl->rxq_ctrl = rxq_ctrl; - /* Use the entire RX mempool as the memory region. */ - tmpl->mr = priv_mr_get(priv, rxq_data->mp); - if (!tmpl->mr) { - tmpl->mr = priv_mr_new(priv, rxq_data->mp); - if (!tmpl->mr) { - ERROR("%p: MR creation failure", (void *)rxq_ctrl); - goto error; - } - } if (rxq_ctrl->irq) { tmpl->channel = mlx5_glue->create_comp_channel(priv->ctx); if (!tmpl->channel) { - ERROR("%p: Comp Channel creation failure", - (void *)rxq_ctrl); + DRV_LOG(ERR, "port %u: comp channel creation failure", + dev->data->port_id); + rte_errno = ENOMEM; goto error; } } + if (mprq_en) + cqe_n = wqe_n * (1 << rxq_data->strd_num_n) - 1; + else + cqe_n = wqe_n - 1; attr.cq.ibv = (struct ibv_cq_init_attr_ex){ .cqe = cqe_n, .channel = tmpl->channel, @@ -670,27 +823,32 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) * For vectorized Rx, it must not be doubled in order to * make cq_ci and rq_ci aligned. */ - if (rxq_check_vec_support(rxq_data) < 0) + if (mlx5_rxq_check_vec_support(rxq_data) < 0) attr.cq.ibv.cqe *= 2; } else if (config->cqe_comp && rxq_data->hw_timestamp) { - DEBUG("Rx CQE compression is disabled for HW timestamp"); + DRV_LOG(DEBUG, + "port %u Rx CQE compression is disabled for HW" + " timestamp", + dev->data->port_id); } tmpl->cq = mlx5_glue->cq_ex_to_cq (mlx5_glue->dv_create_cq(priv->ctx, &attr.cq.ibv, &attr.cq.mlx5)); if (tmpl->cq == NULL) { - ERROR("%p: CQ creation failure", (void *)rxq_ctrl); + DRV_LOG(ERR, "port %u Rx queue %u CQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; goto error; } - DEBUG("priv->device_attr.max_qp_wr is %d", - priv->device_attr.orig_attr.max_qp_wr); - DEBUG("priv->device_attr.max_sge is %d", - priv->device_attr.orig_attr.max_sge); - attr.wq = (struct ibv_wq_init_attr){ + DRV_LOG(DEBUG, "port %u priv->device_attr.max_qp_wr is %d", + dev->data->port_id, priv->device_attr.orig_attr.max_qp_wr); + DRV_LOG(DEBUG, "port %u priv->device_attr.max_sge is %d", + dev->data->port_id, priv->device_attr.orig_attr.max_sge); + attr.wq.ibv = (struct ibv_wq_init_attr){ .wq_context = NULL, /* Could be useful in the future. */ .wq_type = IBV_WQT_RQ, /* Max number of outstanding WRs. */ - .max_wr = (1 << rxq_data->elts_n) >> rxq_data->sges_n, + .max_wr = wqe_n >> rxq_data->sges_n, /* Max number of scatter/gather elements in a WR. */ .max_sge = 1 << rxq_data->sges_n, .pd = priv->pd, @@ -704,32 +862,54 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) }; /* By default, FCS (CRC) is stripped by hardware. */ if (rxq_data->crc_present) { - attr.wq.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS; - attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + attr.wq.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS; + attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; } #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING if (config->hw_padding) { - attr.wq.create_flags |= IBV_WQ_FLAG_RX_END_PADDING; - attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + attr.wq.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING; + attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; } #endif - tmpl->wq = mlx5_glue->create_wq(priv->ctx, &attr.wq); +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + attr.wq.mlx5 = (struct mlx5dv_wq_init_attr){ + .comp_mask = 0, + }; + if (mprq_en) { + struct mlx5dv_striding_rq_init_attr *mprq_attr = + &attr.wq.mlx5.striding_rq_attrs; + + attr.wq.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ; + *mprq_attr = (struct mlx5dv_striding_rq_init_attr){ + .single_stride_log_num_of_bytes = rxq_data->strd_sz_n, + .single_wqe_log_num_of_strides = rxq_data->strd_num_n, + .two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT, + }; + } + tmpl->wq = mlx5_glue->dv_create_wq(priv->ctx, &attr.wq.ibv, + &attr.wq.mlx5); +#else + tmpl->wq = mlx5_glue->create_wq(priv->ctx, &attr.wq.ibv); +#endif if (tmpl->wq == NULL) { - ERROR("%p: WQ creation failure", (void *)rxq_ctrl); + DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; goto error; } /* * Make sure number of WRs*SGEs match expectations since a queue * cannot allocate more than "desc" buffers. */ - if (((int)attr.wq.max_wr != - ((1 << rxq_data->elts_n) >> rxq_data->sges_n)) || - ((int)attr.wq.max_sge != (1 << rxq_data->sges_n))) { - ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs", - (void *)rxq_ctrl, - ((1 << rxq_data->elts_n) >> rxq_data->sges_n), - (1 << rxq_data->sges_n), - attr.wq.max_wr, attr.wq.max_sge); + if (attr.wq.ibv.max_wr != (wqe_n >> rxq_data->sges_n) || + attr.wq.ibv.max_sge != (1u << rxq_data->sges_n)) { + DRV_LOG(ERR, + "port %u Rx queue %u requested %u*%u but got %u*%u" + " WRs*SGEs", + dev->data->port_id, idx, + wqe_n >> rxq_data->sges_n, (1 << rxq_data->sges_n), + attr.wq.ibv.max_wr, attr.wq.ibv.max_sge); + rte_errno = EINVAL; goto error; } /* Change queue state to ready. */ @@ -739,8 +919,10 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) }; ret = mlx5_glue->modify_wq(tmpl->wq, &mod); if (ret) { - ERROR("%p: WQ state to IBV_WQS_RDY failed", - (void *)rxq_ctrl); + DRV_LOG(ERR, + "port %u Rx queue %u WQ state to IBV_WQS_RDY failed", + dev->data->port_id, idx); + rte_errno = ret; goto error; } obj.cq.in = tmpl->cq; @@ -748,33 +930,53 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) obj.rwq.in = tmpl->wq; obj.rwq.out = &rwq; ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ); - if (ret != 0) + if (ret) { + rte_errno = ret; goto error; + } if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) { - ERROR("Wrong MLX5_CQE_SIZE environment variable value: " - "it should be set to %u", RTE_CACHE_LINE_SIZE); + DRV_LOG(ERR, + "port %u wrong MLX5_CQE_SIZE environment variable" + " value: it should be set to %u", + dev->data->port_id, RTE_CACHE_LINE_SIZE); + rte_errno = EINVAL; goto error; } /* Fill the rings. */ - rxq_data->wqes = (volatile struct mlx5_wqe_data_seg (*)[]) - (uintptr_t)rwq.buf; - for (i = 0; (i != (unsigned int)(1 << rxq_data->elts_n)); ++i) { - struct rte_mbuf *buf = (*rxq_data->elts)[i]; - volatile struct mlx5_wqe_data_seg *scat = &(*rxq_data->wqes)[i]; - + rxq_data->wqes = rwq.buf; + for (i = 0; (i != wqe_n); ++i) { + volatile struct mlx5_wqe_data_seg *scat; + uintptr_t addr; + uint32_t byte_count; + + if (mprq_en) { + struct mlx5_mprq_buf *buf = (*rxq_data->mprq_bufs)[i]; + + scat = &((volatile struct mlx5_wqe_mprq *) + rxq_data->wqes)[i].dseg; + addr = (uintptr_t)mlx5_mprq_buf_addr(buf); + byte_count = (1 << rxq_data->strd_sz_n) * + (1 << rxq_data->strd_num_n); + } else { + struct rte_mbuf *buf = (*rxq_data->elts)[i]; + + scat = &((volatile struct mlx5_wqe_data_seg *) + rxq_data->wqes)[i]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + byte_count = DATA_LEN(buf); + } /* scat->addr must be able to store a pointer. */ assert(sizeof(scat->addr) >= sizeof(uintptr_t)); *scat = (struct mlx5_wqe_data_seg){ - .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, - uintptr_t)), - .byte_count = rte_cpu_to_be_32(DATA_LEN(buf)), - .lkey = tmpl->mr->lkey, + .addr = rte_cpu_to_be_64(addr), + .byte_count = rte_cpu_to_be_32(byte_count), + .lkey = mlx5_rx_addr2mr(rxq_data, addr), }; } rxq_data->rq_db = rwq.dbrec; rxq_data->cqe_n = log2above(cq_info.cqe_cnt); rxq_data->cq_ci = 0; - rxq_data->rq_ci = 0; + rxq_data->strd_ci = 0; rxq_data->rq_pi = 0; rxq_data->zip = (struct rxq_zip){ .ai = 0, @@ -785,43 +987,45 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) rxq_data->cqn = cq_info.cqn; rxq_data->cq_arm_sn = 0; /* Update doorbell counter. */ - rxq_data->rq_ci = (1 << rxq_data->elts_n) >> rxq_data->sges_n; + rxq_data->rq_ci = wqe_n >> rxq_data->sges_n; rte_wmb(); *rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci); - DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl); + DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id, + idx, (void *)&tmpl); rte_atomic32_inc(&tmpl->refcnt); - DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv, - (void *)tmpl, rte_atomic32_read(&tmpl->refcnt)); + DRV_LOG(DEBUG, "port %u Verbs Rx queue %u: refcnt %d", + dev->data->port_id, idx, rte_atomic32_read(&tmpl->refcnt)); LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next); priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; return tmpl; error: + ret = rte_errno; /* Save rte_errno before cleanup. */ if (tmpl->wq) claim_zero(mlx5_glue->destroy_wq(tmpl->wq)); if (tmpl->cq) claim_zero(mlx5_glue->destroy_cq(tmpl->cq)); if (tmpl->channel) claim_zero(mlx5_glue->destroy_comp_channel(tmpl->channel)); - if (tmpl->mr) - priv_mr_release(priv, tmpl->mr); priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + rte_errno = ret; /* Restore rte_errno. */ return NULL; } /** * Get an Rx queue Verbs object. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * Queue index in DPDK Rx queue array * * @return * The Verbs object if it exists. */ -struct mlx5_rxq_ibv* -mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx) +struct mlx5_rxq_ibv * +mlx5_rxq_ibv_get(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx]; struct mlx5_rxq_ctrl *rxq_ctrl; @@ -831,11 +1035,10 @@ mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx) return NULL; rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); if (rxq_ctrl->ibv) { - priv_mr_get(priv, rxq_data->mp); rte_atomic32_inc(&rxq_ctrl->ibv->refcnt); - DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv, - (void *)rxq_ctrl->ibv, - rte_atomic32_read(&rxq_ctrl->ibv->refcnt)); + DRV_LOG(DEBUG, "port %u Verbs Rx queue %u: refcnt %d", + dev->data->port_id, rxq_ctrl->idx, + rte_atomic32_read(&rxq_ctrl->ibv->refcnt)); } return rxq_ctrl->ibv; } @@ -843,28 +1046,21 @@ mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx) /** * Release an Rx verbs queue object. * - * @param priv - * Pointer to private structure. * @param rxq_ibv * Verbs Rx queue object. * * @return - * 0 on success, errno value on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv) +mlx5_rxq_ibv_release(struct mlx5_rxq_ibv *rxq_ibv) { - int ret; - assert(rxq_ibv); assert(rxq_ibv->wq); assert(rxq_ibv->cq); - assert(rxq_ibv->mr); - ret = priv_mr_release(priv, rxq_ibv->mr); - if (!ret) - rxq_ibv->mr = NULL; - DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv, - (void *)rxq_ibv, rte_atomic32_read(&rxq_ibv->refcnt)); + DRV_LOG(DEBUG, "port %u Verbs Rx queue %u: refcnt %d", + PORT_ID(rxq_ibv->rxq_ctrl->priv), + rxq_ibv->rxq_ctrl->idx, rte_atomic32_read(&rxq_ibv->refcnt)); if (rte_atomic32_dec_and_test(&rxq_ibv->refcnt)) { rxq_free_elts(rxq_ibv->rxq_ctrl); claim_zero(mlx5_glue->destroy_wq(rxq_ibv->wq)); @@ -876,26 +1072,28 @@ mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv) rte_free(rxq_ibv); return 0; } - return EBUSY; + return 1; } /** * Verify the Verbs Rx queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_rxq_ibv_verify(struct priv *priv) +mlx5_rxq_ibv_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; int ret = 0; struct mlx5_rxq_ibv *rxq_ibv; LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) { - DEBUG("%p: Verbs Rx queue %p still referenced", (void *)priv, - (void *)rxq_ibv); + DRV_LOG(DEBUG, "port %u Verbs Rx queue %u still referenced", + dev->data->port_id, rxq_ibv->rxq_ctrl->idx); ++ret; } return ret; @@ -904,65 +1102,272 @@ mlx5_priv_rxq_ibv_verify(struct priv *priv) /** * Return true if a single reference exists on the object. * - * @param priv - * Pointer to private structure. * @param rxq_ibv * Verbs Rx queue object. */ int -mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv) +mlx5_rxq_ibv_releasable(struct mlx5_rxq_ibv *rxq_ibv) { - (void)priv; assert(rxq_ibv); return (rte_atomic32_read(&rxq_ibv->refcnt) == 1); } /** + * Callback function to initialize mbufs for Multi-Packet RQ. + */ +static inline void +mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg __rte_unused, + void *_m, unsigned int i __rte_unused) +{ + struct mlx5_mprq_buf *buf = _m; + + memset(_m, 0, sizeof(*buf)); + buf->mp = mp; + rte_atomic16_set(&buf->refcnt, 1); +} + +/** + * Free mempool of Multi-Packet RQ. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, negative errno value on failure. + */ +int +mlx5_mprq_free_mp(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct rte_mempool *mp = priv->mprq_mp; + unsigned int i; + + if (mp == NULL) + return 0; + DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet RQ", + dev->data->port_id, mp->name); + /* + * If a buffer in the pool has been externally attached to a mbuf and it + * is still in use by application, destroying the Rx qeueue can spoil + * the packet. It is unlikely to happen but if application dynamically + * creates and destroys with holding Rx packets, this can happen. + * + * TODO: It is unavoidable for now because the mempool for Multi-Packet + * RQ isn't provided by application but managed by PMD. + */ + if (!rte_mempool_full(mp)) { + DRV_LOG(ERR, + "port %u mempool for Multi-Packet RQ is still in use", + dev->data->port_id); + rte_errno = EBUSY; + return -rte_errno; + } + rte_mempool_free(mp); + /* Unset mempool for each Rx queue. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + rxq->mprq_mp = NULL; + } + return 0; +} + +/** + * Allocate a mempool for Multi-Packet RQ. All configured Rx queues share the + * mempool. If already allocated, reuse it if there're enough elements. + * Otherwise, resize it. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, negative errno value on failure. + */ +int +mlx5_mprq_alloc_mp(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct rte_mempool *mp = priv->mprq_mp; + char name[RTE_MEMPOOL_NAMESIZE]; + unsigned int desc = 0; + unsigned int buf_len; + unsigned int obj_num; + unsigned int obj_size; + unsigned int strd_num_n = 0; + unsigned int strd_sz_n = 0; + unsigned int i; + + if (!mlx5_mprq_enabled(dev)) + return 0; + /* Count the total number of descriptors configured. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + desc += 1 << rxq->elts_n; + /* Get the max number of strides. */ + if (strd_num_n < rxq->strd_num_n) + strd_num_n = rxq->strd_num_n; + /* Get the max size of a stride. */ + if (strd_sz_n < rxq->strd_sz_n) + strd_sz_n = rxq->strd_sz_n; + } + assert(strd_num_n && strd_sz_n); + buf_len = (1 << strd_num_n) * (1 << strd_sz_n); + obj_size = buf_len + sizeof(struct mlx5_mprq_buf); + /* + * Received packets can be either memcpy'd or externally referenced. In + * case that the packet is attached to an mbuf as an external buffer, as + * it isn't possible to predict how the buffers will be queued by + * application, there's no option to exactly pre-allocate needed buffers + * in advance but to speculatively prepares enough buffers. + * + * In the data path, if this Mempool is depleted, PMD will try to memcpy + * received packets to buffers provided by application (rxq->mp) until + * this Mempool gets available again. + */ + desc *= 4; + obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * priv->rxqs_n; + /* Check a mempool is already allocated and if it can be resued. */ + if (mp != NULL && mp->elt_size >= obj_size && mp->size >= obj_num) { + DRV_LOG(DEBUG, "port %u mempool %s is being reused", + dev->data->port_id, mp->name); + /* Reuse. */ + goto exit; + } else if (mp != NULL) { + DRV_LOG(DEBUG, "port %u mempool %s should be resized, freeing it", + dev->data->port_id, mp->name); + /* + * If failed to free, which means it may be still in use, no way + * but to keep using the existing one. On buffer underrun, + * packets will be memcpy'd instead of external buffer + * attachment. + */ + if (mlx5_mprq_free_mp(dev)) { + if (mp->elt_size >= obj_size) + goto exit; + else + return -rte_errno; + } + } + snprintf(name, sizeof(name), "%s-mprq", dev->device->name); + mp = rte_mempool_create(name, obj_num, obj_size, MLX5_MPRQ_MP_CACHE_SZ, + 0, NULL, NULL, mlx5_mprq_buf_init, NULL, + dev->device->numa_node, 0); + if (mp == NULL) { + DRV_LOG(ERR, + "port %u failed to allocate a mempool for" + " Multi-Packet RQ, count=%u, size=%u", + dev->data->port_id, obj_num, obj_size); + rte_errno = ENOMEM; + return -rte_errno; + } + priv->mprq_mp = mp; +exit: + /* Set mempool for each Rx queue. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + rxq->mprq_mp = mp; + } + DRV_LOG(INFO, "port %u Multi-Packet RQ is configured", + dev->data->port_id); + return 0; +} + +/** * Create a DPDK Rx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx - * TX queue index. + * RX queue index. * @param desc * Number of descriptors to configure in queue. * @param socket * NUMA socket on which memory must be allocated. * * @return - * A DPDK queue object on success. + * A DPDK queue object on success, NULL otherwise and rte_errno is set. */ -struct mlx5_rxq_ctrl* -mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc, - unsigned int socket, const struct rte_eth_rxconf *conf, - struct rte_mempool *mp) +struct mlx5_rxq_ctrl * +mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp) { - struct rte_eth_dev *dev = priv->dev; + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *tmpl; unsigned int mb_len = rte_pktmbuf_data_room_size(mp); + unsigned int mprq_stride_size; struct mlx5_dev_config *config = &priv->config; /* * Always allocate extra slots, even if eventually * the vector Rx will not be used. */ - const uint16_t desc_n = + uint16_t desc_n = desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP; + uint64_t offloads = conf->offloads | + dev->data->dev_conf.rxmode.offloads; + const int mprq_en = mlx5_check_mprq_support(dev) > 0; tmpl = rte_calloc_socket("RXQ", 1, sizeof(*tmpl) + desc_n * sizeof(struct rte_mbuf *), 0, socket); - if (!tmpl) + if (!tmpl) { + rte_errno = ENOMEM; return NULL; + } + if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh, + MLX5_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } tmpl->socket = socket; - if (priv->dev->data->dev_conf.intr_conf.rxq) + if (dev->data->dev_conf.intr_conf.rxq) tmpl->irq = 1; - /* Enable scattered packets support for this queue if necessary. */ + /* + * This Rx queue can be configured as a Multi-Packet RQ if all of the + * following conditions are met: + * - MPRQ is enabled. + * - The number of descs is more than the number of strides. + * - max_rx_pkt_len plus overhead is less than the max size of a + * stride. + * Otherwise, enable Rx scatter if necessary. + */ assert(mb_len >= RTE_PKTMBUF_HEADROOM); - if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= - (mb_len - RTE_PKTMBUF_HEADROOM)) { + mprq_stride_size = + dev->data->dev_conf.rxmode.max_rx_pkt_len + + sizeof(struct rte_mbuf_ext_shared_info) + + RTE_PKTMBUF_HEADROOM; + if (mprq_en && + desc >= (1U << config->mprq.stride_num_n) && + mprq_stride_size <= (1U << config->mprq.max_stride_size_n)) { + /* TODO: Rx scatter isn't supported yet. */ + tmpl->rxq.sges_n = 0; + /* Trim the number of descs needed. */ + desc >>= config->mprq.stride_num_n; + tmpl->rxq.strd_num_n = config->mprq.stride_num_n; + tmpl->rxq.strd_sz_n = RTE_MAX(log2above(mprq_stride_size), + config->mprq.min_stride_size_n); + tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT; + tmpl->rxq.mprq_max_memcpy_len = + RTE_MIN(mb_len - RTE_PKTMBUF_HEADROOM, + config->mprq.max_memcpy_len); + DRV_LOG(DEBUG, + "port %u Rx queue %u: Multi-Packet RQ is enabled" + " strd_num_n = %u, strd_sz_n = %u", + dev->data->port_id, idx, + tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n); + } else if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= + (mb_len - RTE_PKTMBUF_HEADROOM)) { tmpl->rxq.sges_n = 0; - } else if (conf->offloads & DEV_RX_OFFLOAD_SCATTER) { + } else if (offloads & DEV_RX_OFFLOAD_SCATTER) { unsigned int size = RTE_PKTMBUF_HEADROOM + dev->data->dev_conf.rxmode.max_rx_pkt_len; @@ -978,57 +1383,63 @@ mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc, size = mb_len * (1 << tmpl->rxq.sges_n); size -= RTE_PKTMBUF_HEADROOM; if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) { - ERROR("%p: too many SGEs (%u) needed to handle" - " requested maximum packet size %u", - (void *)dev, - 1 << sges_n, - dev->data->dev_conf.rxmode.max_rx_pkt_len); + DRV_LOG(ERR, + "port %u too many SGEs (%u) needed to handle" + " requested maximum packet size %u", + dev->data->port_id, + 1 << sges_n, + dev->data->dev_conf.rxmode.max_rx_pkt_len); + rte_errno = EOVERFLOW; goto error; } } else { - WARN("%p: the requested maximum Rx packet size (%u) is" - " larger than a single mbuf (%u) and scattered" - " mode has not been requested", - (void *)dev, - dev->data->dev_conf.rxmode.max_rx_pkt_len, - mb_len - RTE_PKTMBUF_HEADROOM); - } - DEBUG("%p: maximum number of segments per packet: %u", - (void *)dev, 1 << tmpl->rxq.sges_n); + DRV_LOG(WARNING, + "port %u the requested maximum Rx packet size (%u) is" + " larger than a single mbuf (%u) and scattered mode has" + " not been requested", + dev->data->port_id, + dev->data->dev_conf.rxmode.max_rx_pkt_len, + mb_len - RTE_PKTMBUF_HEADROOM); + } + DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u", + dev->data->port_id, 1 << tmpl->rxq.sges_n); if (desc % (1 << tmpl->rxq.sges_n)) { - ERROR("%p: number of RX queue descriptors (%u) is not a" - " multiple of SGEs per packet (%u)", - (void *)dev, - desc, - 1 << tmpl->rxq.sges_n); + DRV_LOG(ERR, + "port %u number of Rx queue descriptors (%u) is not a" + " multiple of SGEs per packet (%u)", + dev->data->port_id, + desc, + 1 << tmpl->rxq.sges_n); + rte_errno = EINVAL; goto error; } /* Toggle RX checksum offload if hardware supports it. */ - tmpl->rxq.csum = !!(conf->offloads & DEV_RX_OFFLOAD_CHECKSUM); - tmpl->rxq.csum_l2tun = (!!(conf->offloads & DEV_RX_OFFLOAD_CHECKSUM) && - priv->config.hw_csum_l2tun); - tmpl->rxq.hw_timestamp = !!(conf->offloads & DEV_RX_OFFLOAD_TIMESTAMP); + tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM); + tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP); /* Configure VLAN stripping. */ - tmpl->rxq.vlan_strip = !!(conf->offloads & DEV_RX_OFFLOAD_VLAN_STRIP); + tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP); /* By default, FCS (CRC) is stripped by hardware. */ - if (conf->offloads & DEV_RX_OFFLOAD_CRC_STRIP) { + if (offloads & DEV_RX_OFFLOAD_CRC_STRIP) { tmpl->rxq.crc_present = 0; } else if (config->hw_fcs_strip) { tmpl->rxq.crc_present = 1; } else { - WARN("%p: CRC stripping has been disabled but will still" - " be performed by hardware, make sure MLNX_OFED and" - " firmware are up to date", - (void *)dev); + DRV_LOG(WARNING, + "port %u CRC stripping has been disabled but will" + " still be performed by hardware, make sure MLNX_OFED" + " and firmware are up to date", + dev->data->port_id); tmpl->rxq.crc_present = 0; } - DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from" - " incoming frames to hide it", - (void *)dev, - tmpl->rxq.crc_present ? "disabled" : "enabled", - tmpl->rxq.crc_present << 2); + DRV_LOG(DEBUG, + "port %u CRC stripping is %s, %u bytes will be subtracted from" + " incoming frames to hide it", + dev->data->port_id, + tmpl->rxq.crc_present ? "disabled" : "enabled", + tmpl->rxq.crc_present << 2); /* Save port ID. */ - tmpl->rxq.rss_hash = priv->rxqs_n > 1; + tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf && + (!!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS)); tmpl->rxq.port_id = dev->data->port_id; tmpl->priv = priv; tmpl->rxq.mp = mp; @@ -1036,9 +1447,10 @@ mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc, tmpl->rxq.elts_n = log2above(desc); tmpl->rxq.elts = (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1); + tmpl->idx = idx; rte_atomic32_inc(&tmpl->refcnt); - DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv, - (void *)tmpl, rte_atomic32_read(&tmpl->refcnt)); + DRV_LOG(DEBUG, "port %u Rx queue %u: refcnt %d", dev->data->port_id, + idx, rte_atomic32_read(&tmpl->refcnt)); LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next); return tmpl; error: @@ -1049,28 +1461,29 @@ error: /** * Get a Rx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return - * A pointer to the queue if it exists. + * A pointer to the queue if it exists, NULL otherwise. */ -struct mlx5_rxq_ctrl* -mlx5_priv_rxq_get(struct priv *priv, uint16_t idx) +struct mlx5_rxq_ctrl * +mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *rxq_ctrl = NULL; if ((*priv->rxqs)[idx]) { rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); - - mlx5_priv_rxq_ibv_get(priv, idx); + mlx5_rxq_ibv_get(dev, idx); rte_atomic32_inc(&rxq_ctrl->refcnt); - DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv, - (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt)); + DRV_LOG(DEBUG, "port %u Rx queue %u: refcnt %d", + dev->data->port_id, rxq_ctrl->idx, + rte_atomic32_read(&rxq_ctrl->refcnt)); } return rxq_ctrl; } @@ -1078,59 +1491,60 @@ mlx5_priv_rxq_get(struct priv *priv, uint16_t idx) /** * Release a Rx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return - * 0 on success, errno value on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_rxq_release(struct priv *priv, uint16_t idx) +mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *rxq_ctrl; if (!(*priv->rxqs)[idx]) return 0; rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); assert(rxq_ctrl->priv); - if (rxq_ctrl->ibv) { - int ret; - - ret = mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv); - if (!ret) - rxq_ctrl->ibv = NULL; - } - DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv, - (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt)); + if (rxq_ctrl->ibv && !mlx5_rxq_ibv_release(rxq_ctrl->ibv)) + rxq_ctrl->ibv = NULL; + DRV_LOG(DEBUG, "port %u Rx queue %u: refcnt %d", dev->data->port_id, + rxq_ctrl->idx, rte_atomic32_read(&rxq_ctrl->refcnt)); if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) { + mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh); LIST_REMOVE(rxq_ctrl, next); rte_free(rxq_ctrl); (*priv->rxqs)[idx] = NULL; return 0; } - return EBUSY; + return 1; } /** * Verify if the queue can be released. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return - * 1 if the queue can be released. + * 1 if the queue can be released, negative errno otherwise and rte_errno is + * set. */ int -mlx5_priv_rxq_releasable(struct priv *priv, uint16_t idx) +mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *rxq_ctrl; - if (!(*priv->rxqs)[idx]) - return -1; + if (!(*priv->rxqs)[idx]) { + rte_errno = EINVAL; + return -rte_errno; + } rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1); } @@ -1138,20 +1552,22 @@ mlx5_priv_rxq_releasable(struct priv *priv, uint16_t idx) /** * Verify the Rx Queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_rxq_verify(struct priv *priv) +mlx5_rxq_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *rxq_ctrl; int ret = 0; LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) { - DEBUG("%p: Rx Queue %p still referenced", (void *)priv, - (void *)rxq_ctrl); + DRV_LOG(DEBUG, "port %u Rx Queue %u still referenced", + dev->data->port_id, rxq_ctrl->idx); ++ret; } return ret; @@ -1160,20 +1576,21 @@ mlx5_priv_rxq_verify(struct priv *priv) /** * Create an indirection table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param queues * Queues entering in the indirection table. * @param queues_n * Number of queues in the array. * * @return - * A new indirection table. + * The Verbs object initialised, NULL otherwise and rte_errno is set. */ -struct mlx5_ind_table_ibv* -mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[], - uint16_t queues_n) +struct mlx5_ind_table_ibv * +mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues, + uint32_t queues_n) { + struct priv *priv = dev->data->dev_private; struct mlx5_ind_table_ibv *ind_tbl; const unsigned int wq_n = rte_is_power_of_2(queues_n) ? log2above(queues_n) : @@ -1184,11 +1601,12 @@ mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[], ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) + queues_n * sizeof(uint16_t), 0); - if (!ind_tbl) + if (!ind_tbl) { + rte_errno = ENOMEM; return NULL; + } for (i = 0; i != queues_n; ++i) { - struct mlx5_rxq_ctrl *rxq = - mlx5_priv_rxq_get(priv, queues[i]); + struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev, queues[i]); if (!rxq) goto error; @@ -1206,24 +1624,28 @@ mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[], .ind_tbl = wq, .comp_mask = 0, }); - if (!ind_tbl->ind_table) + if (!ind_tbl->ind_table) { + rte_errno = errno; goto error; + } rte_atomic32_inc(&ind_tbl->refcnt); LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next); - DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv, - (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt)); + DEBUG("port %u new indirection table %p: queues:%u refcnt:%d", + dev->data->port_id, (void *)ind_tbl, 1 << wq_n, + rte_atomic32_read(&ind_tbl->refcnt)); return ind_tbl; error: rte_free(ind_tbl); - DEBUG("%p cannot create indirection table", (void *)priv); + DRV_LOG(DEBUG, "port %u cannot create indirection table", + dev->data->port_id); return NULL; } /** * Get an indirection table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param queues * Queues entering in the indirection table. * @param queues_n @@ -1232,10 +1654,11 @@ error: * @return * An indirection table if found. */ -struct mlx5_ind_table_ibv* -mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[], - uint16_t queues_n) +struct mlx5_ind_table_ibv * +mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues, + uint32_t queues_n) { + struct priv *priv = dev->data->dev_private; struct mlx5_ind_table_ibv *ind_tbl; LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) { @@ -1249,10 +1672,11 @@ mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[], unsigned int i; rte_atomic32_inc(&ind_tbl->refcnt); - DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv, - (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt)); + DRV_LOG(DEBUG, "port %u indirection table %p: refcnt %d", + dev->data->port_id, (void *)ind_tbl, + rte_atomic32_read(&ind_tbl->refcnt)); for (i = 0; i != ind_tbl->queues_n; ++i) - mlx5_priv_rxq_get(priv, ind_tbl->queues[i]); + mlx5_rxq_get(dev, ind_tbl->queues[i]); } return ind_tbl; } @@ -1260,52 +1684,59 @@ mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[], /** * Release an indirection table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param ind_table * Indirection table to release. * * @return - * 0 on success, errno value on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_ind_table_ibv_release(struct priv *priv, - struct mlx5_ind_table_ibv *ind_tbl) +mlx5_ind_table_ibv_release(struct rte_eth_dev *dev, + struct mlx5_ind_table_ibv *ind_tbl) { unsigned int i; - DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv, - (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt)); - if (rte_atomic32_dec_and_test(&ind_tbl->refcnt)) + DRV_LOG(DEBUG, "port %u indirection table %p: refcnt %d", + dev->data->port_id, (void *)ind_tbl, + rte_atomic32_read(&ind_tbl->refcnt)); + if (rte_atomic32_dec_and_test(&ind_tbl->refcnt)) { claim_zero(mlx5_glue->destroy_rwq_ind_table (ind_tbl->ind_table)); + DEBUG("port %u delete indirection table %p: queues: %u", + dev->data->port_id, (void *)ind_tbl, ind_tbl->queues_n); + } for (i = 0; i != ind_tbl->queues_n; ++i) - claim_nonzero(mlx5_priv_rxq_release(priv, ind_tbl->queues[i])); + claim_nonzero(mlx5_rxq_release(dev, ind_tbl->queues[i])); if (!rte_atomic32_read(&ind_tbl->refcnt)) { LIST_REMOVE(ind_tbl, next); rte_free(ind_tbl); return 0; } - return EBUSY; + return 1; } /** * Verify the Rx Queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_ind_table_ibv_verify(struct priv *priv) +mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_ind_table_ibv *ind_tbl; int ret = 0; LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) { - DEBUG("%p: Verbs indirection table %p still referenced", - (void *)priv, (void *)ind_tbl); + DRV_LOG(DEBUG, + "port %u Verbs indirection table %p still referenced", + dev->data->port_id, (void *)ind_tbl); ++ret; } return ret; @@ -1314,8 +1745,8 @@ mlx5_priv_ind_table_ibv_verify(struct priv *priv) /** * Create an Rx Hash queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param rss_key * RSS key for the Rx hash queue. * @param rss_key_len @@ -1327,24 +1758,79 @@ mlx5_priv_ind_table_ibv_verify(struct priv *priv) * first queue index will be taken for the indirection table. * @param queues_n * Number of queues. + * @param tunnel + * Tunnel type, implies tunnel offloading like inner checksum if available. + * @param rss_level + * RSS hash on tunnel level. * * @return - * An hash Rx queue on success. + * The Verbs object initialised, NULL otherwise and rte_errno is set. */ -struct mlx5_hrxq* -mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, - uint64_t hash_fields, uint16_t queues[], uint16_t queues_n) +struct mlx5_hrxq * +mlx5_hrxq_new(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n, + uint32_t tunnel, uint32_t rss_level) { + struct priv *priv = dev->data->dev_private; struct mlx5_hrxq *hrxq; struct mlx5_ind_table_ibv *ind_tbl; struct ibv_qp *qp; + int err; +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + struct mlx5dv_qp_init_attr qp_init_attr = {0}; +#endif queues_n = hash_fields ? queues_n : 1; - ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n); - if (!ind_tbl) - ind_tbl = mlx5_priv_ind_table_ibv_new(priv, queues, queues_n); + ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n); if (!ind_tbl) + ind_tbl = mlx5_ind_table_ibv_new(dev, queues, queues_n); + if (!ind_tbl) { + rte_errno = ENOMEM; return NULL; + } + if (!rss_key_len) { + rss_key_len = rss_hash_default_key_len; + rss_key = rss_hash_default_key; + } +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (tunnel) { + qp_init_attr.comp_mask = + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; + qp_init_attr.create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS; + } + qp = mlx5_glue->dv_create_qp + (priv->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = rss_key_len ? rss_key_len : + rss_hash_default_key_len, + .rx_hash_key = rss_key ? + (void *)(uintptr_t)rss_key : + rss_hash_default_key, + .rx_hash_fields_mask = hash_fields | + (tunnel && rss_level > 1 ? + (uint32_t)IBV_RX_HASH_INNER : 0), + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->pd, + }, + &qp_init_attr); + DEBUG("port %u new QP:%p ind_tbl:%p hash_fields:0x%" PRIx64 + " tunnel:0x%x level:%u dv_attr:comp_mask:0x%" PRIx64 + " create_flags:0x%x", + dev->data->port_id, (void *)qp, (void *)ind_tbl, + (tunnel && rss_level == 2 ? (uint32_t)IBV_RX_HASH_INNER : 0) | + hash_fields, tunnel, rss_level, + qp_init_attr.comp_mask, qp_init_attr.create_flags); +#else qp = mlx5_glue->create_qp_ex (priv->ctx, &(struct ibv_qp_init_attr_ex){ @@ -1355,15 +1841,25 @@ mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, IBV_QP_INIT_ATTR_RX_HASH, .rx_hash_conf = (struct ibv_rx_hash_conf){ .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, - .rx_hash_key_len = rss_key_len, - .rx_hash_key = rss_key, + .rx_hash_key_len = rss_key_len ? rss_key_len : + rss_hash_default_key_len, + .rx_hash_key = rss_key ? + (void *)(uintptr_t)rss_key : + rss_hash_default_key, .rx_hash_fields_mask = hash_fields, }, .rwq_ind_tbl = ind_tbl->ind_table, .pd = priv->pd, }); - if (!qp) + DEBUG("port %u new QP:%p ind_tbl:%p hash_fields:0x%" PRIx64 + " tunnel:0x%x level:%hhu", + dev->data->port_id, (void *)qp, (void *)ind_tbl, + hash_fields, tunnel, rss_level); +#endif + if (!qp) { + rte_errno = errno; goto error; + } hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0); if (!hrxq) goto error; @@ -1371,24 +1867,29 @@ mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, hrxq->qp = qp; hrxq->rss_key_len = rss_key_len; hrxq->hash_fields = hash_fields; + hrxq->tunnel = tunnel; + hrxq->rss_level = rss_level; memcpy(hrxq->rss_key, rss_key, rss_key_len); rte_atomic32_inc(&hrxq->refcnt); LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next); - DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv, - (void *)hrxq, rte_atomic32_read(&hrxq->refcnt)); + DRV_LOG(DEBUG, "port %u hash Rx queue %p: refcnt %d", + dev->data->port_id, (void *)hrxq, + rte_atomic32_read(&hrxq->refcnt)); return hrxq; error: - mlx5_priv_ind_table_ibv_release(priv, ind_tbl); + err = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_ind_table_ibv_release(dev, ind_tbl); if (qp) claim_zero(mlx5_glue->destroy_qp(qp)); + rte_errno = err; /* Restore rte_errno. */ return NULL; } /** * Get an Rx Hash queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param rss_conf * RSS configuration for the Rx hash queue. * @param queues @@ -1396,14 +1897,22 @@ error: * first queue index will be taken for the indirection table. * @param queues_n * Number of queues. + * @param tunnel + * Tunnel type, implies tunnel offloading like inner checksum if available. + * @param rss_level + * RSS hash on tunnel level * * @return * An hash Rx queue on success. */ -struct mlx5_hrxq* -mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, - uint64_t hash_fields, uint16_t queues[], uint16_t queues_n) +struct mlx5_hrxq * +mlx5_hrxq_get(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n, + uint32_t tunnel, uint32_t rss_level) { + struct priv *priv = dev->data->dev_private; struct mlx5_hrxq *hrxq; queues_n = hash_fields ? queues_n : 1; @@ -1416,16 +1925,21 @@ mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, continue; if (hrxq->hash_fields != hash_fields) continue; - ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n); + if (hrxq->tunnel != tunnel) + continue; + if (hrxq->rss_level != rss_level) + continue; + ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n); if (!ind_tbl) continue; if (ind_tbl != hrxq->ind_table) { - mlx5_priv_ind_table_ibv_release(priv, ind_tbl); + mlx5_ind_table_ibv_release(dev, ind_tbl); continue; } rte_atomic32_inc(&hrxq->refcnt); - DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv, - (void *)hrxq, rte_atomic32_read(&hrxq->refcnt)); + DRV_LOG(DEBUG, "port %u hash Rx queue %p: refcnt %d", + dev->data->port_id, (void *)hrxq, + rte_atomic32_read(&hrxq->refcnt)); return hrxq; } return NULL; @@ -1434,47 +1948,55 @@ mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, /** * Release the hash Rx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param hrxq * Pointer to Hash Rx queue to release. * * @return - * 0 on success, errno value on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_hrxq_release(struct priv *priv, struct mlx5_hrxq *hrxq) +mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq) { - DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv, - (void *)hrxq, rte_atomic32_read(&hrxq->refcnt)); + DRV_LOG(DEBUG, "port %u hash Rx queue %p: refcnt %d", + dev->data->port_id, (void *)hrxq, + rte_atomic32_read(&hrxq->refcnt)); if (rte_atomic32_dec_and_test(&hrxq->refcnt)) { claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); - mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table); + DEBUG("port %u delete QP %p: hash: 0x%" PRIx64 ", tunnel:" + " 0x%x, level: %u", + dev->data->port_id, (void *)hrxq, hrxq->hash_fields, + hrxq->tunnel, hrxq->rss_level); + mlx5_ind_table_ibv_release(dev, hrxq->ind_table); LIST_REMOVE(hrxq, next); rte_free(hrxq); return 0; } - claim_nonzero(mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table)); - return EBUSY; + claim_nonzero(mlx5_ind_table_ibv_release(dev, hrxq->ind_table)); + return 1; } /** * Verify the Rx Queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_hrxq_ibv_verify(struct priv *priv) +mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_hrxq *hrxq; int ret = 0; LIST_FOREACH(hrxq, &priv->hrxqs, next) { - DEBUG("%p: Verbs Hash Rx queue %p still referenced", - (void *)priv, (void *)hrxq); + DRV_LOG(DEBUG, + "port %u Verbs hash Rx queue %p still referenced", + dev->data->port_id, (void *)hrxq); ++ret; } return ret; diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index dc4ead93..52785946 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <assert.h> @@ -34,19 +34,29 @@ #include "mlx5_prm.h" static __rte_always_inline uint32_t -rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe); +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); static __rte_always_inline int mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, uint16_t cqe_cnt, uint32_t *rss_hash); static __rte_always_inline uint32_t -rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); +rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); + +static __rte_always_inline void +rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); + +static __rte_always_inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx); uint32_t mlx5_ptype_table[] __rte_cache_aligned = { [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ }; +uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; +uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; + /** * Build a table to translate Rx completion flags to packet type. * @@ -86,6 +96,14 @@ mlx5_set_ptype_table(void) RTE_PTYPE_L4_TCP; (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP; + (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; /* UDP */ (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP; @@ -104,17 +122,27 @@ mlx5_set_ptype_table(void) RTE_PTYPE_L4_TCP; (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP; + (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP; (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP; /* Tunneled - L3 */ + (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_NONFRAG; (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_NONFRAG; @@ -141,12 +169,36 @@ mlx5_set_ptype_table(void) (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP; (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP; + (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; /* Tunneled - UDP */ (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | @@ -163,6 +215,74 @@ mlx5_set_ptype_table(void) } /** + * Build a table to translate packet to checksum type of Verbs. + */ +void +mlx5_set_cksum_table(void) +{ + unsigned int i; + uint8_t v; + + /* + * The index should have: + * bit[0] = PKT_TX_TCP_SEG + * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM + * bit[4] = PKT_TX_IP_CKSUM + * bit[8] = PKT_TX_OUTER_IP_CKSUM + * bit[9] = tunnel + */ + for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { + v = 0; + if (i & (1 << 9)) { + /* Tunneled packet. */ + if (i & (1 << 8)) /* Outer IP. */ + v |= MLX5_ETH_WQE_L3_CSUM; + if (i & (1 << 4)) /* Inner IP. */ + v |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ + v |= MLX5_ETH_WQE_L4_INNER_CSUM; + } else { + /* No tunnel. */ + if (i & (1 << 4)) /* IP. */ + v |= MLX5_ETH_WQE_L3_CSUM; + if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ + v |= MLX5_ETH_WQE_L4_CSUM; + } + mlx5_cksum_table[i] = v; + } +} + +/** + * Build a table to translate packet type of mbuf to SWP type of Verbs. + */ +void +mlx5_set_swp_types_table(void) +{ + unsigned int i; + uint8_t v; + + /* + * The index should have: + * bit[0:1] = PKT_TX_L4_MASK + * bit[4] = PKT_TX_IPV6 + * bit[8] = PKT_TX_OUTER_IPV6 + * bit[9] = PKT_TX_OUTER_UDP + */ + for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { + v = 0; + if (i & (1 << 8)) + v |= MLX5_ETH_WQE_L3_OUTER_IPV6; + if (i & (1 << 9)) + v |= MLX5_ETH_WQE_L4_OUTER_UDP; + if (i & (1 << 4)) + v |= MLX5_ETH_WQE_L3_INNER_IPV6; + if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) + v |= MLX5_ETH_WQE_L4_INNER_UDP; + mlx5_swp_types_table[i] = v; + } +} + +/** * Return the size of tailroom of WQ. * * @param txq @@ -219,6 +339,60 @@ mlx5_copy_to_wq(void *dst, const void *src, size_t n, } /** + * Inline TSO headers into WQE. + * + * @return + * 0 on success, negative errno value on failure. + */ +static int +inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf, + uint32_t *length, + uintptr_t *addr, + uint16_t *pkt_inline_sz, + uint8_t **raw, + uint16_t *max_wqe, + uint16_t *tso_segsz, + uint16_t *tso_header_sz) +{ + uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) + + (1 << txq->wqe_n) * MLX5_WQE_SIZE); + unsigned int copy_b; + uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0; + const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags & + PKT_TX_TUNNEL_MASK); + uint16_t n_wqe; + + *tso_segsz = buf->tso_segsz; + *tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len; + if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) { + txq->stats.oerrors++; + return -EINVAL; + } + if (tunneled) + *tso_header_sz += buf->outer_l2_len + buf->outer_l3_len; + /* First seg must contain all TSO headers. */ + if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) || + *tso_header_sz > DATA_LEN(buf)) { + txq->stats.oerrors++; + return -EINVAL; + } + copy_b = *tso_header_sz - *pkt_inline_sz; + if (!copy_b || ((end - (uintptr_t)*raw) < copy_b)) + return -EAGAIN; + n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4; + if (unlikely(*max_wqe < n_wqe)) + return -EINVAL; + *max_wqe -= n_wqe; + rte_memcpy((void *)*raw, (void *)*addr, copy_b); + *length -= copy_b; + *addr += copy_b; + copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE; + *pkt_inline_sz += copy_b; + *raw += copy_b; + return 0; +} + +/** * DPDK callback to check the status of a tx descriptor. * * @param tx_queue @@ -335,7 +509,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (unlikely(!max_wqe)) return 0; do { - struct rte_mbuf *buf = NULL; + struct rte_mbuf *buf = *pkts; /* First_seg. */ uint8_t *raw; volatile struct mlx5_wqe_v *wqe = NULL; volatile rte_v128u32_t *dseg = NULL; @@ -347,14 +521,15 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) uint16_t tso_header_sz = 0; uint16_t ehdr; uint8_t cs_flags; - uint64_t tso = 0; + uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG); + uint32_t swp_offsets = 0; + uint8_t swp_types = 0; uint16_t tso_segsz = 0; #ifdef MLX5_PMD_SOFT_COUNTERS uint32_t total_length = 0; #endif + int ret; - /* first_seg */ - buf = *pkts; segs_n = buf->nb_segs; /* * Make sure there is enough room to store this packet and @@ -389,7 +564,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (pkts_n - i > 1) rte_prefetch0( rte_pktmbuf_mtod(*(pkts + 1), volatile void *)); - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); + txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types); raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; /* Replace the Ethernet type by the VLAN if necessary. */ if (buf->ol_flags & PKT_TX_VLAN_PKT) { @@ -415,54 +591,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) addr += pkt_inline_sz; } raw += MLX5_WQE_DWORD_SIZE; - tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG); if (tso) { - uintptr_t end = - (uintptr_t)(((uintptr_t)txq->wqes) + - (1 << txq->wqe_n) * MLX5_WQE_SIZE); - unsigned int copy_b; - uint8_t vlan_sz = - (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0; - const uint64_t is_tunneled = - buf->ol_flags & (PKT_TX_TUNNEL_GRE | - PKT_TX_TUNNEL_VXLAN); - - tso_header_sz = buf->l2_len + vlan_sz + - buf->l3_len + buf->l4_len; - tso_segsz = buf->tso_segsz; - if (unlikely(tso_segsz == 0)) { - txq->stats.oerrors++; + ret = inline_tso(txq, buf, &length, + &addr, &pkt_inline_sz, + &raw, &max_wqe, + &tso_segsz, &tso_header_sz); + if (ret == -EINVAL) { break; - } - if (is_tunneled && txq->tunnel_en) { - tso_header_sz += buf->outer_l2_len + - buf->outer_l3_len; - cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; - } else { - cs_flags |= MLX5_ETH_WQE_L4_CSUM; - } - if (unlikely(tso_header_sz > MLX5_MAX_TSO_HEADER)) { - txq->stats.oerrors++; - break; - } - copy_b = tso_header_sz - pkt_inline_sz; - /* First seg must contain all headers. */ - assert(copy_b <= length); - if (copy_b && ((end - (uintptr_t)raw) > copy_b)) { - uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4; - - if (unlikely(max_wqe < n)) - break; - max_wqe -= n; - rte_memcpy((void *)raw, (void *)addr, copy_b); - addr += copy_b; - length -= copy_b; - /* Include padding for TSO header. */ - copy_b = MLX5_WQE_DS(copy_b) * - MLX5_WQE_DWORD_SIZE; - pkt_inline_sz += copy_b; - raw += copy_b; - } else { + } else if (ret == -EAGAIN) { /* NOP WQE. */ wqe->ctrl = (rte_v128u32_t){ rte_cpu_to_be_32(txq->wqe_ci << 8), @@ -507,7 +643,8 @@ pkt_inline: if (unlikely(max_wqe < n)) break; max_wqe -= n; - if (tso && !inl) { + if (tso) { + assert(inl == 0); inl = rte_cpu_to_be_32(copy_b | MLX5_INLINE_SEG); rte_memcpy((void *)raw, @@ -542,8 +679,17 @@ pkt_inline: } else if (!segs_n) { goto next_pkt; } else { - raw += copy_b; - inline_room -= copy_b; + /* + * Further inline the next segment only for + * non-TSO packets. + */ + if (!tso) { + raw += copy_b; + inline_room -= copy_b; + } else { + inline_room = 0; + } + /* Move to the next segment. */ --segs_n; buf = buf->next; assert(buf); @@ -633,8 +779,9 @@ next_pkt: 0, }; wqe->eseg = (rte_v128u32_t){ - 0, - cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16), + swp_offsets, + cs_flags | (swp_types << 8) | + (rte_cpu_to_be_16(tso_segsz) << 16), 0, (ehdr << 16) | rte_cpu_to_be_16(tso_header_sz), }; @@ -647,8 +794,8 @@ next_pkt: 0, }; wqe->eseg = (rte_v128u32_t){ - 0, - cs_flags, + swp_offsets, + cs_flags | (swp_types << 8), 0, (ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz), }; @@ -820,7 +967,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) } max_elts -= segs_n; --pkts_n; - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Retrieve packet information. */ length = PKT_LEN(buf); assert(length); @@ -1052,7 +1199,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, * iteration. */ max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Retrieve packet information. */ length = PKT_LEN(buf); /* Start new session if packet differs. */ @@ -1320,7 +1467,6 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, do { struct rte_mbuf *buf = *(pkts++); uintptr_t addr; - unsigned int n; unsigned int do_inline = 0; /* Whether inline is possible. */ uint32_t length; uint8_t cs_flags; @@ -1330,7 +1476,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, /* Make sure there is enough room to store this packet. */ if (max_elts - j == 0) break; - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Retrieve packet information. */ length = PKT_LEN(buf); /* Start new session if: @@ -1382,7 +1528,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, (!txq->mpw_hdr_dseg || mpw.total_len >= MLX5_WQE_SIZE); } - if (do_inline) { + if (max_inline && do_inline) { /* Inline packet into WQE. */ unsigned int max; @@ -1440,11 +1586,8 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, ((uintptr_t)mpw.data.raw + inl_pad); (*txq->elts)[elts_head++ & elts_m] = buf; - addr = rte_pktmbuf_mtod(buf, uintptr_t); - for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++) - rte_prefetch2((void *)(addr + - n * RTE_CACHE_LINE_SIZE)); - addr = rte_cpu_to_be_64(addr); + addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, + uintptr_t)); *dseg = (rte_v128u32_t) { rte_cpu_to_be_32(length), mlx5_tx_mb2mr(txq, buf), @@ -1541,6 +1684,8 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /** * Translate RX completion flags to packet type. * + * @param[in] rxq + * Pointer to RX queue structure. * @param[in] cqe * Pointer to CQE. * @@ -1550,7 +1695,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * Packet type for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) { uint8_t idx; uint8_t pinfo = cqe->pkt_info; @@ -1565,7 +1710,7 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) * bit[7] = outer_l3_type */ idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); - return mlx5_ptype_table[idx]; + return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); } /** @@ -1688,8 +1833,6 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, /** * Translate RX completion flags to offload flags. * - * @param[in] rxq - * Pointer to RX queue structure. * @param[in] cqe * Pointer to CQE. * @@ -1697,7 +1840,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, * Offload flags (ol_flags) for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) +rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) { uint32_t ol_flags = 0; uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); @@ -1709,18 +1852,56 @@ rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) TRANSPOSE(flags, MLX5_CQE_RX_L4_HDR_VALID, PKT_RX_L4_CKSUM_GOOD); - if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) - ol_flags |= - TRANSPOSE(flags, - MLX5_CQE_RX_L3_HDR_VALID, - PKT_RX_IP_CKSUM_GOOD) | - TRANSPOSE(flags, - MLX5_CQE_RX_L4_HDR_VALID, - PKT_RX_L4_CKSUM_GOOD); return ol_flags; } /** + * Fill in mbuf fields from RX completion flags. + * Note that pkt->ol_flags should be initialized outside of this function. + * + * @param rxq + * Pointer to RX queue. + * @param pkt + * mbuf to fill. + * @param cqe + * CQE to process. + * @param rss_hash_res + * Packet RSS Hash result. + */ +static inline void +rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) +{ + /* Update packet information. */ + pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); + if (rss_hash_res && rxq->rss_hash) { + pkt->hash.rss = rss_hash_res; + pkt->ol_flags |= PKT_RX_RSS_HASH; + } + if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { + pkt->ol_flags |= PKT_RX_FDIR; + if (cqe->sop_drop_qpn != + rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { + uint32_t mark = cqe->sop_drop_qpn; + + pkt->ol_flags |= PKT_RX_FDIR_ID; + pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); + } + } + if (rxq->csum) + pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); + if (rxq->vlan_strip && + (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { + pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; + pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); + } + if (rxq->hw_timestamp) { + pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); + pkt->ol_flags |= PKT_RX_TIMESTAMP; + } +} + +/** * DPDK callback for RX. * * @param dpdk_rxq @@ -1750,7 +1931,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) while (pkts_n) { unsigned int idx = rq_ci & wqe_cnt; - volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; struct rte_mbuf *rep = (*rxq->elts)[idx]; uint32_t rss_hash_res = 0; @@ -1796,40 +1978,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) } pkt = seg; assert(len >= (rxq->crc_present << 2)); - /* Update packet information. */ - pkt->packet_type = rxq_cq_to_pkt_type(cqe); pkt->ol_flags = 0; - if (rss_hash_res && rxq->rss_hash) { - pkt->hash.rss = rss_hash_res; - pkt->ol_flags = PKT_RX_RSS_HASH; - } - if (rxq->mark && - MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { - pkt->ol_flags |= PKT_RX_FDIR; - if (cqe->sop_drop_qpn != - rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { - uint32_t mark = cqe->sop_drop_qpn; - - pkt->ol_flags |= PKT_RX_FDIR_ID; - pkt->hash.fdir.hi = - mlx5_flow_mark_get(mark); - } - } - if (rxq->csum | rxq->csum_l2tun) - pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe); - if (rxq->vlan_strip && - (cqe->hdr_type_etc & - rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { - pkt->ol_flags |= PKT_RX_VLAN | - PKT_RX_VLAN_STRIPPED; - pkt->vlan_tci = - rte_be_to_cpu_16(cqe->vlan_info); - } - if (rxq->hw_timestamp) { - pkt->timestamp = - rte_be_to_cpu_64(cqe->timestamp); - pkt->ol_flags |= PKT_RX_TIMESTAMP; - } + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); if (rxq->crc_present) len -= ETHER_CRC_LEN; PKT_LEN(pkt) = len; @@ -1845,6 +1995,9 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) * changes. */ wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_mb2mr(rxq, rep); if (len > DATA_LEN(seg)) { len -= DATA_LEN(seg); ++NB_SEGS(pkt); @@ -1882,6 +2035,236 @@ skip: return i; } +void +mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) +{ + struct mlx5_mprq_buf *buf = opaque; + + if (rte_atomic16_read(&buf->refcnt) == 1) { + rte_mempool_put(buf->mp, buf); + } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { + rte_atomic16_set(&buf->refcnt, 1); + rte_mempool_put(buf->mp, buf); + } +} + +void +mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) +{ + mlx5_mprq_buf_free_cb(NULL, buf); +} + +static inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx) +{ + struct mlx5_mprq_buf *rep = rxq->mprq_repl; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; + void *addr; + + assert(rep != NULL); + /* Replace MPRQ buf. */ + (*rxq->mprq_bufs)[rq_idx] = rep; + /* Replace WQE. */ + addr = mlx5_mprq_buf_addr(rep); + wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); + /* Stash a mbuf for next replacement. */ + if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) + rxq->mprq_repl = rep; + else + rxq->mprq_repl = NULL; +} + +/** + * DPDK callback for RX with Multi-Packet RQ support. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + const unsigned int strd_n = 1 << rxq->strd_num_n; + const unsigned int strd_sz = 1 << rxq->strd_sz_n; + const unsigned int strd_shift = + MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; + const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; + const unsigned int wq_mask = (1 << rxq->elts_n) - 1; + volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; + unsigned int i = 0; + uint16_t rq_ci = rxq->rq_ci; + uint16_t strd_idx = rxq->strd_ci; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + + while (i < pkts_n) { + struct rte_mbuf *pkt; + void *addr; + int ret; + unsigned int len; + uint16_t consumed_strd; + uint32_t offset; + uint32_t byte_cnt; + uint32_t rss_hash_res = 0; + + if (strd_idx == strd_n) { + /* Replace WQE only if the buffer is still in use. */ + if (rte_atomic16_read(&buf->refcnt) > 1) { + mprq_buf_replace(rxq, rq_ci & wq_mask); + /* Release the old buffer. */ + mlx5_mprq_buf_free(buf); + } else if (unlikely(rxq->mprq_repl == NULL)) { + struct mlx5_mprq_buf *rep; + + /* + * Currently, the MPRQ mempool is out of buffer + * and doing memcpy regardless of the size of Rx + * packet. Retry allocation to get back to + * normal. + */ + if (!rte_mempool_get(rxq->mprq_mp, + (void **)&rep)) + rxq->mprq_repl = rep; + } + /* Advance to the next WQE. */ + strd_idx = 0; + ++rq_ci; + buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + } + cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; + ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &rss_hash_res); + if (!ret) + break; + if (unlikely(ret == -1)) { + /* RX error, packet is likely too large. */ + ++rxq->stats.idropped; + continue; + } + byte_cnt = ret; + consumed_strd = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> + MLX5_MPRQ_STRIDE_NUM_SHIFT; + assert(consumed_strd); + /* Calculate offset before adding up stride index. */ + offset = strd_idx * strd_sz + strd_shift; + strd_idx += consumed_strd; + if (byte_cnt & MLX5_MPRQ_FILLER_MASK) + continue; + /* + * Currently configured to receive a packet per a stride. But if + * MTU is adjusted through kernel interface, device could + * consume multiple strides without raising an error. In this + * case, the packet should be dropped because it is bigger than + * the max_rx_pkt_len. + */ + if (unlikely(consumed_strd > 1)) { + ++rxq->stats.idropped; + continue; + } + pkt = rte_pktmbuf_alloc(rxq->mp); + if (unlikely(pkt == NULL)) { + ++rxq->stats.rx_nombuf; + break; + } + len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; + assert((int)len >= (rxq->crc_present << 2)); + if (rxq->crc_present) + len -= ETHER_CRC_LEN; + addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf), offset); + /* Initialize the offload flag. */ + pkt->ol_flags = 0; + /* + * Memcpy packets to the target mbuf if: + * - The size of packet is smaller than mprq_max_memcpy_len. + * - Out of buffer in the Mempool for Multi-Packet RQ. + */ + if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { + /* + * When memcpy'ing packet due to out-of-buffer, the + * packet must be smaller than the target mbuf. + */ + if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.idropped; + continue; + } + rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); + } else { + rte_iova_t buf_iova; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = consumed_strd * strd_sz; + + /* Increment the refcnt of the whole chunk. */ + rte_atomic16_add_return(&buf->refcnt, 1); + assert((uint16_t)rte_atomic16_read(&buf->refcnt) <= + strd_n + 1); + addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); + /* + * MLX5 device doesn't use iova but it is necessary in a + * case where the Rx packet is transmitted via a + * different PMD. + */ + buf_iova = rte_mempool_virt2iova(buf) + + RTE_PTR_DIFF(addr, buf); + shinfo = rte_pktmbuf_ext_shinfo_init_helper(addr, + &buf_len, mlx5_mprq_buf_free_cb, buf); + /* + * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when + * attaching the stride to mbuf and more offload flags + * will be added below by calling rxq_cq_to_mbuf(). + * Other fields will be overwritten. + */ + rte_pktmbuf_attach_extbuf(pkt, addr, buf_iova, buf_len, + shinfo); + rte_pktmbuf_reset_headroom(pkt); + assert(pkt->ol_flags == EXT_ATTACHED_MBUF); + /* + * Prevent potential overflow due to MTU change through + * kernel interface. + */ + if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.idropped; + continue; + } + } + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); + PKT_LEN(pkt) = len; + DATA_LEN(pkt) = len; + PORT(pkt) = rxq->port_id; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment bytes counter. */ + rxq->stats.ibytes += PKT_LEN(pkt); +#endif + /* Return packet. */ + *(pkts++) = pkt; + ++i; + } + /* Update the consumer indexes. */ + rxq->strd_ci = strd_idx; + rte_cio_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + if (rq_ci != rxq->rq_ci) { + rxq->rq_ci = rq_ci; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment packets counter. */ + rxq->stats.ipackets += i; +#endif + return i; +} + /** * Dummy DPDK callback for TX. * @@ -1899,11 +2282,10 @@ skip: * Number of packets successfully transmitted (<= pkts_n). */ uint16_t -removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +removed_tx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_txq; - (void)pkts; - (void)pkts_n; return 0; } @@ -1924,11 +2306,10 @@ removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * Number of packets successfully received (<= pkts_n). */ uint16_t -removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +removed_rx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_rxq; - (void)pkts; - (void)pkts_n; return 0; } @@ -1940,58 +2321,49 @@ removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) */ uint16_t __attribute__((weak)) -mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_txq; - (void)pkts; - (void)pkts_n; return 0; } uint16_t __attribute__((weak)) -mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_tx_burst_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_txq; - (void)pkts; - (void)pkts_n; return 0; } uint16_t __attribute__((weak)) -mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_rxq; - (void)pkts; - (void)pkts_n; return 0; } int __attribute__((weak)) -priv_check_raw_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev) +mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused) { - (void)priv; - (void)dev; return -ENOTSUP; } int __attribute__((weak)) -priv_check_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev) +mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused) { - (void)priv; - (void)dev; return -ENOTSUP; } int __attribute__((weak)) -rxq_check_vec_support(struct mlx5_rxq_data *rxq) +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) { - (void)rxq; return -ENOTSUP; } int __attribute__((weak)) -priv_check_vec_rx_support(struct priv *priv) +mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) { - (void)priv; return -ENOTSUP; } diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index d7e89055..f53bb43c 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_RXTX_H_ @@ -29,6 +29,7 @@ #include "mlx5_utils.h" #include "mlx5.h" +#include "mlx5_mr.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" #include "mlx5_prm.h" @@ -54,17 +55,6 @@ struct mlx5_txq_stats { struct priv; -/* Memory region queue object. */ -struct mlx5_mr { - LIST_ENTRY(mlx5_mr) next; /**< Pointer to the next element. */ - rte_atomic32_t refcnt; /*<< Reference counter. */ - uint32_t lkey; /*<< rte_cpu_to_be_32(mr->lkey) */ - uintptr_t start; /* Start address of MR */ - uintptr_t end; /* End address of MR */ - struct ibv_mr *mr; /*<< Memory Region. */ - struct rte_mempool *mp; /*<< Memory Pool. */ -}; - /* Compressed CQE context. */ struct rxq_zip { uint16_t ai; /* Array index. */ @@ -74,10 +64,19 @@ struct rxq_zip { uint32_t cqe_cnt; /* Number of CQEs. */ }; +/* Multi-Packet RQ buffer header. */ +struct mlx5_mprq_buf { + struct rte_mempool *mp; + rte_atomic16_t refcnt; /* Atomically accessed refcnt. */ + uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first packet. */ +} __rte_cache_aligned; + +/* Get pointer to the first stride. */ +#define mlx5_mprq_buf_addr(ptr) ((ptr) + 1) + /* RX queue descriptor. */ struct mlx5_rxq_data { unsigned int csum:1; /* Enable checksum offloading. */ - unsigned int csum_l2tun:1; /* Same for L2 tunnels. */ unsigned int hw_timestamp:1; /* Enable HW timestamp. */ unsigned int vlan_strip:1; /* Enable VLAN stripping. */ unsigned int crc_present:1; /* CRC must be subtracted. */ @@ -86,24 +85,37 @@ struct mlx5_rxq_data { unsigned int elts_n:4; /* Log 2 of Mbufs. */ unsigned int rss_hash:1; /* RSS hash result is enabled. */ unsigned int mark:1; /* Marked flow available on the queue. */ - unsigned int :15; /* Remaining bits. */ + unsigned int strd_num_n:5; /* Log 2 of the number of stride. */ + unsigned int strd_sz_n:4; /* Log 2 of stride size. */ + unsigned int strd_shift_en:1; /* Enable 2bytes shift on a stride. */ + unsigned int :6; /* Remaining bits. */ volatile uint32_t *rq_db; volatile uint32_t *cq_db; uint16_t port_id; uint16_t rq_ci; + uint16_t strd_ci; /* Stride index in a WQE for Multi-Packet RQ. */ uint16_t rq_pi; uint16_t cq_ci; - volatile struct mlx5_wqe_data_seg(*wqes)[]; + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ + uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */ + volatile void *wqes; volatile struct mlx5_cqe(*cqes)[]; struct rxq_zip zip; /* Compressed context. */ - struct rte_mbuf *(*elts)[]; + RTE_STD_C11 + union { + struct rte_mbuf *(*elts)[]; + struct mlx5_mprq_buf *(*mprq_bufs)[]; + }; struct rte_mempool *mp; + struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ + struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */ struct mlx5_rxq_stats stats; uint64_t mbuf_initializer; /* Default rearm_data for vectorized Rx. */ struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */ void *cq_uar; /* CQ user access region. */ uint32_t cqn; /* CQ number. */ uint8_t cq_arm_sn; /* CQ arm seq number. */ + uint32_t tunnel; /* Tunnel information. */ } __rte_cache_aligned; /* Verbs Rx queue elements. */ @@ -114,18 +126,19 @@ struct mlx5_rxq_ibv { struct ibv_cq *cq; /* Completion Queue. */ struct ibv_wq *wq; /* Work Queue. */ struct ibv_comp_channel *channel; - struct mlx5_mr *mr; /* Memory Region (for mp). */ }; /* RX queue control descriptor. */ struct mlx5_rxq_ctrl { LIST_ENTRY(mlx5_rxq_ctrl) next; /* Pointer to the next element. */ rte_atomic32_t refcnt; /* Reference counter. */ - struct priv *priv; /* Back pointer to private data. */ struct mlx5_rxq_ibv *ibv; /* Verbs elements. */ + struct priv *priv; /* Back pointer to private data. */ struct mlx5_rxq_data rxq; /* Data path structure. */ unsigned int socket; /* CPU socket ID for allocations. */ + uint32_t tunnel_types[16]; /* Tunnel type counter. */ unsigned int irq:1; /* Whether IRQ is enabled. */ + uint16_t idx; /* Queue index. */ }; /* Indirection table. */ @@ -133,7 +146,7 @@ struct mlx5_ind_table_ibv { LIST_ENTRY(mlx5_ind_table_ibv) next; /* Pointer to the next element. */ rte_atomic32_t refcnt; /* Reference counter. */ struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */ - uint16_t queues_n; /**< Number of queues in the list. */ + uint32_t queues_n; /**< Number of queues in the list. */ uint16_t queues[]; /**< Queue list. */ }; @@ -144,7 +157,9 @@ struct mlx5_hrxq { struct mlx5_ind_table_ibv *ind_table; /* Indirection table. */ struct ibv_qp *qp; /* Verbs queue pair. */ uint64_t hash_fields; /* Verbs Hash fields. */ - uint8_t rss_key_len; /* Hash key length in bytes. */ + uint32_t tunnel; /* Tunnel type. */ + uint32_t rss_level; /* RSS on tunnel level. */ + uint32_t rss_key_len; /* Hash key length in bytes. */ uint8_t rss_key[]; /* Hash key. */ }; @@ -167,18 +182,18 @@ struct mlx5_txq_data { uint16_t tso_en:1; /* When set hardware TSO is enabled. */ uint16_t tunnel_en:1; /* When set TX offload for tunneled packets are supported. */ + uint16_t swp_en:1; /* Whether SW parser is enabled. */ uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ uint16_t inline_max_packet_sz; /* Max packet size for inlining. */ - uint16_t mr_cache_idx; /* Index of last hit entry. */ uint32_t qp_num_8s; /* QP number shifted by 8. */ uint64_t offloads; /* Offloads for Tx Queue. */ + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ volatile void *wqes; /* Work queue (use volatile to write into). */ volatile uint32_t *qp_db; /* Work queue doorbell. */ volatile uint32_t *cq_db; /* Completion queue doorbell. */ volatile void *bf_reg; /* Blueflame register remapped. */ - struct mlx5_mr *mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MR translation table. */ struct rte_mbuf *(*elts)[]; /* TX elements. */ struct mlx5_txq_stats stats; /* TX queue counters. */ } __rte_cache_aligned; @@ -187,6 +202,7 @@ struct mlx5_txq_data { struct mlx5_txq_ibv { LIST_ENTRY(mlx5_txq_ibv) next; /* Pointer to the next element. */ rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_txq_ctrl *txq_ctrl; /* Pointer to the control queue. */ struct ibv_cq *cq; /* Completion Queue. */ struct ibv_qp *qp; /* Queue Pair. */ }; @@ -195,14 +211,15 @@ struct mlx5_txq_ibv { struct mlx5_txq_ctrl { LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */ rte_atomic32_t refcnt; /* Reference counter. */ - struct priv *priv; /* Back pointer to private data. */ unsigned int socket; /* CPU socket ID for allocations. */ unsigned int max_inline_data; /* Max inline data. */ unsigned int max_tso_header; /* Max TSO header size. */ struct mlx5_txq_ibv *ibv; /* Verbs queue object. */ + struct priv *priv; /* Back pointer to private data. */ struct mlx5_txq_data txq; /* Data path structure. */ off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */ volatile void *bf_reg_orig; /* Blueflame register from verbs. */ + uint16_t idx; /* Queue index. */ }; /* mlx5_rxq.c */ @@ -210,97 +227,126 @@ struct mlx5_txq_ctrl { extern uint8_t rss_hash_default_key[]; extern const size_t rss_hash_default_key_len; -void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *); -int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, - const struct rte_eth_rxconf *, struct rte_mempool *); -void mlx5_rx_queue_release(void *); -int priv_rx_intr_vec_enable(struct priv *priv); -void priv_rx_intr_vec_disable(struct priv *priv); +int mlx5_check_mprq_support(struct rte_eth_dev *dev); +int mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq); +int mlx5_mprq_enabled(struct rte_eth_dev *dev); +int mlx5_mprq_free_mp(struct rte_eth_dev *dev); +int mlx5_mprq_alloc_mp(struct rte_eth_dev *dev); +void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl); +int mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +void mlx5_rx_queue_release(void *dpdk_rxq); +int mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev); +void mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev); int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id); int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id); -struct mlx5_rxq_ibv *mlx5_priv_rxq_ibv_new(struct priv *, uint16_t); -struct mlx5_rxq_ibv *mlx5_priv_rxq_ibv_get(struct priv *, uint16_t); -int mlx5_priv_rxq_ibv_release(struct priv *, struct mlx5_rxq_ibv *); -int mlx5_priv_rxq_ibv_releasable(struct priv *, struct mlx5_rxq_ibv *); -int mlx5_priv_rxq_ibv_verify(struct priv *); -struct mlx5_rxq_ctrl *mlx5_priv_rxq_new(struct priv *, uint16_t, - uint16_t, unsigned int, - const struct rte_eth_rxconf *, - struct rte_mempool *); -struct mlx5_rxq_ctrl *mlx5_priv_rxq_get(struct priv *, uint16_t); -int mlx5_priv_rxq_release(struct priv *, uint16_t); -int mlx5_priv_rxq_releasable(struct priv *, uint16_t); -int mlx5_priv_rxq_verify(struct priv *); -int rxq_alloc_elts(struct mlx5_rxq_ctrl *); -struct mlx5_ind_table_ibv *mlx5_priv_ind_table_ibv_new(struct priv *, - uint16_t [], - uint16_t); -struct mlx5_ind_table_ibv *mlx5_priv_ind_table_ibv_get(struct priv *, - uint16_t [], - uint16_t); -int mlx5_priv_ind_table_ibv_release(struct priv *, struct mlx5_ind_table_ibv *); -int mlx5_priv_ind_table_ibv_verify(struct priv *); -struct mlx5_hrxq *mlx5_priv_hrxq_new(struct priv *, uint8_t *, uint8_t, - uint64_t, uint16_t [], uint16_t); -struct mlx5_hrxq *mlx5_priv_hrxq_get(struct priv *, uint8_t *, uint8_t, - uint64_t, uint16_t [], uint16_t); -int mlx5_priv_hrxq_release(struct priv *, struct mlx5_hrxq *); -int mlx5_priv_hrxq_ibv_verify(struct priv *); -uint64_t mlx5_priv_get_rx_port_offloads(struct priv *); -uint64_t mlx5_priv_get_rx_queue_offloads(struct priv *); +struct mlx5_rxq_ibv *mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx); +struct mlx5_rxq_ibv *mlx5_rxq_ibv_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_ibv_release(struct mlx5_rxq_ibv *rxq_ibv); +int mlx5_rxq_ibv_releasable(struct mlx5_rxq_ibv *rxq_ibv); +int mlx5_rxq_ibv_verify(struct rte_eth_dev *dev); +struct mlx5_rxq_ctrl *mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +struct mlx5_rxq_ctrl *mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_verify(struct rte_eth_dev *dev); +int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl); +int rxq_alloc_mprq_buf(struct mlx5_rxq_ctrl *rxq_ctrl); +struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, + const uint16_t *queues, + uint32_t queues_n); +struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, + const uint16_t *queues, + uint32_t queues_n); +int mlx5_ind_table_ibv_release(struct rte_eth_dev *dev, + struct mlx5_ind_table_ibv *ind_tbl); +int mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev); +struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n, + uint32_t tunnel, uint32_t rss_level); +struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n, + uint32_t tunnel, uint32_t rss_level); +int mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hxrq); +int mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev); +uint64_t mlx5_get_rx_port_offloads(void); +uint64_t mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev); /* mlx5_txq.c */ -int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, - const struct rte_eth_txconf *); -void mlx5_tx_queue_release(void *); -int priv_tx_uar_remap(struct priv *priv, int fd); -struct mlx5_txq_ibv *mlx5_priv_txq_ibv_new(struct priv *, uint16_t); -struct mlx5_txq_ibv *mlx5_priv_txq_ibv_get(struct priv *, uint16_t); -int mlx5_priv_txq_ibv_release(struct priv *, struct mlx5_txq_ibv *); -int mlx5_priv_txq_ibv_releasable(struct priv *, struct mlx5_txq_ibv *); -int mlx5_priv_txq_ibv_verify(struct priv *); -struct mlx5_txq_ctrl *mlx5_priv_txq_new(struct priv *, uint16_t, - uint16_t, unsigned int, - const struct rte_eth_txconf *); -struct mlx5_txq_ctrl *mlx5_priv_txq_get(struct priv *, uint16_t); -int mlx5_priv_txq_release(struct priv *, uint16_t); -int mlx5_priv_txq_releasable(struct priv *, uint16_t); -int mlx5_priv_txq_verify(struct priv *); -void txq_alloc_elts(struct mlx5_txq_ctrl *); -uint64_t mlx5_priv_get_tx_port_offloads(struct priv *); +int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf); +void mlx5_tx_queue_release(void *dpdk_txq); +int mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd); +struct mlx5_txq_ibv *mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx); +struct mlx5_txq_ibv *mlx5_txq_ibv_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv); +int mlx5_txq_ibv_releasable(struct mlx5_txq_ibv *txq_ibv); +int mlx5_txq_ibv_verify(struct rte_eth_dev *dev); +struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_txconf *conf); +struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_verify(struct rte_eth_dev *dev); +void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl); +uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev); /* mlx5_rxtx.c */ extern uint32_t mlx5_ptype_table[]; +extern uint8_t mlx5_cksum_table[]; +extern uint8_t mlx5_swp_types_table[]; void mlx5_set_ptype_table(void); -uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_tx_burst_empw(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t); -uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t); -uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); -int mlx5_rx_descriptor_status(void *, uint16_t); -int mlx5_tx_descriptor_status(void *, uint16_t); +void mlx5_set_cksum_table(void); +void mlx5_set_swp_types_table(void); +uint16_t mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); +void mlx5_mprq_buf_free_cb(void *addr, void *opaque); +void mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf); +uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +int mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset); +int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset); /* Vectorized version of mlx5_rxtx.c */ -int priv_check_raw_vec_tx_support(struct priv *, struct rte_eth_dev *); -int priv_check_vec_tx_support(struct priv *, struct rte_eth_dev *); -int rxq_check_vec_support(struct mlx5_rxq_data *); -int priv_check_vec_rx_support(struct priv *); -uint16_t mlx5_tx_burst_raw_vec(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_tx_burst_vec(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_rx_burst_vec(void *, struct rte_mbuf **, uint16_t); +int mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev); +int mlx5_check_vec_tx_support(struct rte_eth_dev *dev); +int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data); +int mlx5_check_vec_rx_support(struct rte_eth_dev *dev); +uint16_t mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); /* mlx5_mr.c */ -void mlx5_mp2mr_iter(struct rte_mempool *, void *); -struct mlx5_mr *priv_txq_mp2mr_reg(struct priv *priv, struct mlx5_txq_data *, - struct rte_mempool *, unsigned int); -struct mlx5_mr *mlx5_txq_mp2mr_reg(struct mlx5_txq_data *, struct rte_mempool *, - unsigned int); +void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl); +uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr); +uint32_t mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr); #ifndef NDEBUG /** @@ -363,9 +409,10 @@ check_cqe(volatile struct mlx5_cqe *cqe, (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) return 0; if (!check_cqe_seen(cqe)) { - ERROR("unexpected CQE error %u (0x%02x)" - " syndrome 0x%02x", - op_code, op_code, syndrome); + DRV_LOG(ERR, + "unexpected CQE error %u (0x%02x) syndrome" + " 0x%02x", + op_code, op_code, syndrome); rte_hexdump(stderr, "MLX5 Error CQE:", (const void *)((uintptr_t)err_cqe), sizeof(*err_cqe)); @@ -374,8 +421,8 @@ check_cqe(volatile struct mlx5_cqe *cqe, } else if ((op_code != MLX5_CQE_RESP_SEND) && (op_code != MLX5_CQE_REQ)) { if (!check_cqe_seen(cqe)) { - ERROR("unexpected CQE opcode %u (0x%02x)", - op_code, op_code); + DRV_LOG(ERR, "unexpected CQE opcode %u (0x%02x)", + op_code, op_code); rte_hexdump(stderr, "MLX5 CQE:", (const void *)((uintptr_t)cqe), sizeof(*cqe)); @@ -435,7 +482,7 @@ mlx5_tx_complete(struct mlx5_txq_data *txq) if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { if (!check_cqe_seen(cqe)) { - ERROR("unexpected error CQE, TX stopped"); + DRV_LOG(ERR, "unexpected error CQE, Tx stopped"); rte_hexdump(stderr, "MLX5 TXQ:", (const void *)((uintptr_t)txq->wqes), ((1 << txq->wqe_n) * @@ -487,77 +534,65 @@ mlx5_tx_complete(struct mlx5_txq_data *txq) } /** - * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which - * the cloned mbuf is allocated is returned instead. + * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx + * as mempool is pre-configured and static. * - * @param buf - * Pointer to mbuf. + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Address to search. * * @return - * Memory pool where data is located for given mbuf. + * Searched LKey on success, UINT32_MAX on no match. */ -static struct rte_mempool * -mlx5_tx_mb2mp(struct rte_mbuf *buf) +static __rte_always_inline uint32_t +mlx5_rx_addr2mr(struct mlx5_rxq_data *rxq, uintptr_t addr) { - if (unlikely(RTE_MBUF_INDIRECT(buf))) - return rte_mbuf_from_indirect(buf)->pool; - return buf->pool; + struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + uint32_t lkey; + + /* Linear search on MR cache array. */ + lkey = mlx5_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru, + MLX5_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half (Binary Search) on miss. */ + return mlx5_rx_addr2mr_bh(rxq, addr); } +#define mlx5_rx_mb2mr(rxq, mb) mlx5_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr)) + /** - * Get Memory Region (MR) <-> rte_mbuf association from txq->mp2mr[]. - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, - * remove an entry first. + * Query LKey from a packet buffer for Tx. If not found, add the mempool. * * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. + * Pointer to Tx queue structure. + * @param addr + * Address to search. * * @return - * mr->lkey on success, (uint32_t)-1 on failure. + * Searched LKey on success, UINT32_MAX on no match. */ static __rte_always_inline uint32_t -mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb) +mlx5_tx_addr2mr(struct mlx5_txq_data *txq, uintptr_t addr) { - uint16_t i = txq->mr_cache_idx; - uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t); - struct mlx5_mr *mr; - - assert(i < RTE_DIM(txq->mp2mr)); - if (likely(txq->mp2mr[i]->start <= addr && txq->mp2mr[i]->end > addr)) - return txq->mp2mr[i]->lkey; - for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { - if (unlikely(txq->mp2mr[i] == NULL || - txq->mp2mr[i]->mr == NULL)) { - /* Unknown MP, add a new MR for it. */ - break; - } - if (txq->mp2mr[i]->start <= addr && - txq->mp2mr[i]->end > addr) { - assert(txq->mp2mr[i]->lkey != (uint32_t)-1); - txq->mr_cache_idx = i; - return txq->mp2mr[i]->lkey; - } - } - mr = mlx5_txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i); - /* - * Request the reference to use in this queue, the original one is - * kept by the control plane. - */ - if (mr) { - rte_atomic32_inc(&mr->refcnt); - txq->mr_cache_idx = i >= RTE_DIM(txq->mp2mr) ? i - 1 : i; - return mr->lkey; - } else { - struct rte_mempool *mp = mlx5_tx_mb2mp(mb); - - WARN("Failed to register mempool 0x%p(%s)", - (void *)mp, mp->name); - } - return (uint32_t)-1; + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + uint32_t lkey; + + /* Check generation bit to see if there's any change on existing MRs. */ + if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen)) + mlx5_mr_flush_local_cache(mr_ctrl); + /* Linear search on MR cache array. */ + lkey = mlx5_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru, + MLX5_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half (binary search) on miss. */ + return mlx5_tx_addr2mr_bh(txq, addr); } +#define mlx5_tx_mb2mr(rxq, mb) mlx5_tx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr)) + /** * Ring TX queue doorbell and flush the update if requested. * @@ -599,38 +634,100 @@ mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe) } /** - * Convert the Checksum offloads to Verbs. + * Convert mbuf to Verb SWP. * * @param txq_data * Pointer to the Tx queue. * @param buf * Pointer to the mbuf. + * @param tso + * TSO offloads enabled. + * @param vlan + * VLAN offloads enabled + * @param offsets + * Pointer to the SWP header offsets. + * @param swp_types + * Pointer to the SWP header types. + */ +static __rte_always_inline void +txq_mbuf_to_swp(struct mlx5_txq_data *txq, struct rte_mbuf *buf, + uint8_t *offsets, uint8_t *swp_types) +{ + const uint64_t vlan = buf->ol_flags & PKT_TX_VLAN_PKT; + const uint64_t tunnel = buf->ol_flags & PKT_TX_TUNNEL_MASK; + const uint64_t tso = buf->ol_flags & PKT_TX_TCP_SEG; + const uint64_t csum_flags = buf->ol_flags & PKT_TX_L4_MASK; + const uint64_t inner_ip = + buf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6); + const uint64_t ol_flags_mask = PKT_TX_L4_MASK | PKT_TX_IPV6 | + PKT_TX_OUTER_IPV6; + uint16_t idx; + uint16_t off; + + if (likely(!txq->swp_en || (tunnel != PKT_TX_TUNNEL_UDP && + tunnel != PKT_TX_TUNNEL_IP))) + return; + /* + * The index should have: + * bit[0:1] = PKT_TX_L4_MASK + * bit[4] = PKT_TX_IPV6 + * bit[8] = PKT_TX_OUTER_IPV6 + * bit[9] = PKT_TX_OUTER_UDP + */ + idx = (buf->ol_flags & ol_flags_mask) >> 52; + if (tunnel == PKT_TX_TUNNEL_UDP) + idx |= 1 << 9; + *swp_types = mlx5_swp_types_table[idx]; + /* + * Set offsets for SW parser. Since ConnectX-5, SW parser just + * complements HW parser. SW parser starts to engage only if HW parser + * can't reach a header. For the older devices, HW parser will not kick + * in if any of SWP offsets is set. Therefore, all of the L3 offsets + * should be set regardless of HW offload. + */ + off = buf->outer_l2_len + (vlan ? sizeof(struct vlan_hdr) : 0); + offsets[1] = off >> 1; /* Outer L3 offset. */ + off += buf->outer_l3_len; + if (tunnel == PKT_TX_TUNNEL_UDP) + offsets[0] = off >> 1; /* Outer L4 offset. */ + if (inner_ip) { + off += buf->l2_len; + offsets[3] = off >> 1; /* Inner L3 offset. */ + if (csum_flags == PKT_TX_TCP_CKSUM || tso || + csum_flags == PKT_TX_UDP_CKSUM) { + off += buf->l3_len; + offsets[2] = off >> 1; /* Inner L4 offset. */ + } + } +} + +/** + * Convert the Checksum offloads to Verbs. + * + * @param buf + * Pointer to the mbuf. * * @return - * the converted cs_flags. + * Converted checksum flags. */ static __rte_always_inline uint8_t -txq_ol_cksum_to_cs(struct mlx5_txq_data *txq_data, struct rte_mbuf *buf) +txq_ol_cksum_to_cs(struct rte_mbuf *buf) { - uint8_t cs_flags = 0; - - /* Should we enable HW CKSUM offload */ - if (buf->ol_flags & - (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM | - PKT_TX_OUTER_IP_CKSUM)) { - if (txq_data->tunnel_en && - (buf->ol_flags & - (PKT_TX_TUNNEL_GRE | PKT_TX_TUNNEL_VXLAN))) { - cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM | - MLX5_ETH_WQE_L4_INNER_CSUM; - if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) - cs_flags |= MLX5_ETH_WQE_L3_CSUM; - } else { - cs_flags = MLX5_ETH_WQE_L3_CSUM | - MLX5_ETH_WQE_L4_CSUM; - } - } - return cs_flags; + uint32_t idx; + uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK); + const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK | + PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM; + + /* + * The index should have: + * bit[0] = PKT_TX_TCP_SEG + * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM + * bit[4] = PKT_TX_IP_CKSUM + * bit[8] = PKT_TX_OUTER_IP_CKSUM + * bit[9] = tunnel + */ + idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9); + return mlx5_cksum_table[idx]; } /** diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c index b66c2916..0a4aed8f 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.c +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox Technologies, Ltd */ #include <assert.h> @@ -42,8 +42,6 @@ /** * Count the number of packets having same ol_flags and calculate cs_flags. * - * @param txq - * Pointer to TX queue structure. * @param pkts * Pointer to array of packets. * @param pkts_n @@ -55,8 +53,7 @@ * Number of packets having same ol_flags. */ static inline unsigned int -txq_calc_offload(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, - uint16_t pkts_n, uint8_t *cs_flags) +txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags) { unsigned int pos; const uint64_t ol_mask = @@ -70,7 +67,7 @@ txq_calc_offload(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, for (pos = 1; pos < pkts_n; ++pos) if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask) break; - *cs_flags = txq_ol_cksum_to_cs(txq, pkts[0]); + *cs_flags = txq_ol_cksum_to_cs(pkts[0]); return pos; } @@ -141,7 +138,7 @@ mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) n = txq_count_contig_single_seg(&pkts[nb_tx], n); if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) - n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags); + n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags); ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags); nb_tx += ret; if (!ret) @@ -223,17 +220,14 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) /** * Check Tx queue flags are set for raw vectorized Tx. * - * @param priv - * Pointer to private structure. * @param dev - * Pointer to rte_eth_dev structure. + * Pointer to Ethernet device. * * @return * 1 if supported, negative errno value if not. */ int __attribute__((cold)) -priv_check_raw_vec_tx_support(__rte_unused struct priv *priv, - struct rte_eth_dev *dev) +mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev) { uint64_t offloads = dev->data->dev_conf.txmode.offloads; @@ -246,17 +240,16 @@ priv_check_raw_vec_tx_support(__rte_unused struct priv *priv, /** * Check a device can support vectorized TX. * - * @param priv - * Pointer to private structure. * @param dev - * Pointer to rte_eth_dev structure. + * Pointer to Ethernet device. * * @return * 1 if supported, negative errno value if not. */ int __attribute__((cold)) -priv_check_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev) +mlx5_check_vec_tx_support(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; uint64_t offloads = dev->data->dev_conf.txmode.offloads; if (!priv->config.tx_vec_en || @@ -277,11 +270,13 @@ priv_check_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev) * 1 if supported, negative errno value if not. */ int __attribute__((cold)) -rxq_check_vec_support(struct mlx5_rxq_data *rxq) +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq) { struct mlx5_rxq_ctrl *ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); + if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv))) + return -ENOTSUP; if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0) return -ENOTSUP; return 1; @@ -290,26 +285,29 @@ rxq_check_vec_support(struct mlx5_rxq_data *rxq) /** * Check a device can support vectorized RX. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return * 1 if supported, negative errno value if not. */ int __attribute__((cold)) -priv_check_vec_rx_support(struct priv *priv) +mlx5_check_vec_rx_support(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; uint16_t i; if (!priv->config.rx_vec_en) return -ENOTSUP; + if (mlx5_mprq_enabled(dev)) + return -ENOTSUP; /* All the configured queues should support. */ for (i = 0; i < priv->rxqs_n; ++i) { struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; if (!rxq) continue; - if (rxq_check_vec_support(rxq) < 0) + if (mlx5_rxq_check_vec_support(rxq) < 0) break; } if (i != priv->rxqs_n) diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h index 44856bbf..598dc751 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_RXTX_VEC_H_ @@ -87,7 +87,8 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n) const uint16_t q_mask = q_n - 1; uint16_t elts_idx = rxq->rq_ci & q_mask; struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; - volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx]; + volatile struct mlx5_wqe_data_seg *wq = + &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx]; unsigned int i; assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH); @@ -99,9 +100,13 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n) rxq->stats.rx_nombuf += n; return; } - for (i = 0; i < n; ++i) + for (i = 0; i < n; ++i) { wq[i].addr = rte_cpu_to_be_64((uintptr_t)elts[i]->buf_addr + RTE_PKTMBUF_HEADROOM); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]); + } rxq->rq_ci += n; /* Prevent overflowing into consumed mbufs. */ elts_idx = rxq->rq_ci & q_mask; diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h index bbe1818e..71a5eaf2 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_RXTX_VEC_NEON_H_ @@ -142,7 +142,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, break; wqe = &((volatile struct mlx5_wqe64 *) txq->wqes)[wqe_ci & wq_mask].hdr; - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Title WQEBB pointer. */ t_wqe = (uint8x16_t *)wqe; dseg = (uint8_t *)(wqe + 1); @@ -167,8 +167,8 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, vst1q_u8((void *)t_wqe, ctrl); /* Fill ESEG in the header. */ vst1q_u16((void *)(t_wqe + 1), - (uint16x8_t) { 0, 0, cs_flags, rte_cpu_to_be_16(len), - 0, 0, 0, 0 }); + ((uint16x8_t) { 0, 0, cs_flags, rte_cpu_to_be_16(len), + 0, 0, 0, 0 })); txq->wqe_ci = wqe_ci; } if (!n) @@ -300,10 +300,10 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, vst1q_u8((void *)t_wqe, ctrl); /* Fill ESEG in the header. */ vst1q_u8((void *)(t_wqe + 1), - (uint8x16_t) { 0, 0, 0, 0, - cs_flags, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0 }); + ((uint8x16_t) { 0, 0, 0, 0, + cs_flags, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 })); #ifdef MLX5_PMD_SOFT_COUNTERS txq->stats.opackets += pkts_n; #endif @@ -551,6 +551,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, const uint64x1_t mbuf_init = vld1_u64(&rxq->mbuf_initializer); const uint64x1_t r32_mask = vcreate_u64(0xffffffff); uint64x2_t rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; if (rxq->mark) { const uint32x4_t ft_def = vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT); @@ -583,14 +584,18 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, ptype = vshrn_n_u32(ptype_info, 10); /* Errored packets will have RTE_PTYPE_ALL_MASK. */ ptype = vorr_u16(ptype, op_err); - pkts[0]->packet_type = - mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 6)]; - pkts[1]->packet_type = - mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 4)]; - pkts[2]->packet_type = - mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 2)]; - pkts[3]->packet_type = - mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 0)]; + pt_idx0 = vget_lane_u8(vreinterpret_u8_u16(ptype), 6); + pt_idx1 = vget_lane_u8(vreinterpret_u8_u16(ptype), 4); + pt_idx2 = vget_lane_u8(vreinterpret_u8_u16(ptype), 2); + pt_idx3 = vget_lane_u8(vreinterpret_u8_u16(ptype), 0); + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; /* Fill flags for checksum and VLAN. */ pinfo = vandq_u32(ptype_info, ptype_ol_mask); pinfo = vreinterpretq_u32_u8( diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h index c088bcb5..3e985d61 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_ @@ -144,7 +144,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, } wqe = &((volatile struct mlx5_wqe64 *) txq->wqes)[wqe_ci & wq_mask].hdr; - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Title WQEBB pointer. */ t_wqe = (__m128i *)wqe; dseg = (__m128i *)(wqe + 1); @@ -542,6 +542,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], const __m128i mbuf_init = _mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer); __m128i rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; /* Extract pkt_info field. */ pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]); @@ -595,10 +596,18 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], /* Errored packets will have RTE_PTYPE_ALL_MASK. */ op_err = _mm_srli_epi16(op_err, 8); ptype = _mm_or_si128(ptype, op_err); - pkts[0]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 0)]; - pkts[1]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 2)]; - pkts[2]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 4)]; - pkts[3]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 6)]; + pt_idx0 = _mm_extract_epi8(ptype, 0); + pt_idx1 = _mm_extract_epi8(ptype, 2); + pt_idx2 = _mm_extract_epi8(ptype, 4); + pt_idx3 = _mm_extract_epi8(ptype, 6); + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; /* Fill flags for checksum and VLAN. */ pinfo = _mm_and_si128(pinfo, ptype_ol_mask); pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo); diff --git a/drivers/net/mlx5/mlx5_socket.c b/drivers/net/mlx5/mlx5_socket.c index 61c1a4a5..99297d5c 100644 --- a/drivers/net/mlx5/mlx5_socket.c +++ b/drivers/net/mlx5/mlx5_socket.c @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd */ #define _GNU_SOURCE @@ -18,21 +19,21 @@ /** * Initialise the socket to communicate with the secondary process * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_socket_init(struct priv *priv) +mlx5_socket_init(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct sockaddr_un sun = { .sun_family = AF_UNIX, }; int ret; int flags; - struct stat file_stat; /* * Initialise the socket to communicate with the secondary @@ -40,70 +41,77 @@ priv_socket_init(struct priv *priv) */ ret = socket(AF_UNIX, SOCK_STREAM, 0); if (ret < 0) { - WARN("secondary process not supported: %s", strerror(errno)); - return ret; + rte_errno = errno; + DRV_LOG(WARNING, "port %u secondary process not supported: %s", + dev->data->port_id, strerror(errno)); + goto error; } priv->primary_socket = ret; flags = fcntl(priv->primary_socket, F_GETFL, 0); - if (flags == -1) - goto out; + if (flags == -1) { + rte_errno = errno; + goto error; + } ret = fcntl(priv->primary_socket, F_SETFL, flags | O_NONBLOCK); - if (ret < 0) - goto out; + if (ret < 0) { + rte_errno = errno; + goto error; + } snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d", MLX5_DRIVER_NAME, priv->primary_socket); - ret = stat(sun.sun_path, &file_stat); - if (!ret) - claim_zero(remove(sun.sun_path)); + remove(sun.sun_path); ret = bind(priv->primary_socket, (const struct sockaddr *)&sun, sizeof(sun)); if (ret < 0) { - WARN("cannot bind socket, secondary process not supported: %s", - strerror(errno)); + rte_errno = errno; + DRV_LOG(WARNING, + "port %u cannot bind socket, secondary process not" + " supported: %s", + dev->data->port_id, strerror(errno)); goto close; } ret = listen(priv->primary_socket, 0); if (ret < 0) { - WARN("Secondary process not supported: %s", strerror(errno)); + rte_errno = errno; + DRV_LOG(WARNING, "port %u secondary process not supported: %s", + dev->data->port_id, strerror(errno)); goto close; } - return ret; + return 0; close: remove(sun.sun_path); -out: +error: claim_zero(close(priv->primary_socket)); priv->primary_socket = 0; - return -(ret); + return -rte_errno; } /** * Un-Initialise the socket to communicate with the secondary process * - * @param[in] priv - * Pointer to private structure. - * - * @return - * 0 on success, errno value on failure. + * @param[in] dev */ -int -priv_socket_uninit(struct priv *priv) +void +mlx5_socket_uninit(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; + MKSTR(path, "/var/tmp/%s_%d", MLX5_DRIVER_NAME, priv->primary_socket); claim_zero(close(priv->primary_socket)); priv->primary_socket = 0; claim_zero(remove(path)); - return 0; } /** * Handle socket interrupts. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. */ void -priv_socket_handle(struct priv *priv) +mlx5_socket_handle(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; int conn_sock; int ret = 0; struct cmsghdr *cmsg = NULL; @@ -125,25 +133,30 @@ priv_socket_handle(struct priv *priv) /* Accept the connection from the client. */ conn_sock = accept(priv->primary_socket, NULL, NULL); if (conn_sock < 0) { - WARN("connection failed: %s", strerror(errno)); + DRV_LOG(WARNING, "port %u connection failed: %s", + dev->data->port_id, strerror(errno)); return; } ret = setsockopt(conn_sock, SOL_SOCKET, SO_PASSCRED, &(int){1}, sizeof(int)); if (ret < 0) { - WARN("cannot change socket options"); - goto out; + ret = errno; + DRV_LOG(WARNING, "port %u cannot change socket options: %s", + dev->data->port_id, strerror(rte_errno)); + goto error; } ret = recvmsg(conn_sock, &msg, MSG_WAITALL); if (ret < 0) { - WARN("received an empty message: %s", strerror(errno)); - goto out; + ret = errno; + DRV_LOG(WARNING, "port %u received an empty message: %s", + dev->data->port_id, strerror(rte_errno)); + goto error; } /* Expect to receive credentials only. */ cmsg = CMSG_FIRSTHDR(&msg); if (cmsg == NULL) { - WARN("no message"); - goto out; + DRV_LOG(WARNING, "port %u no message", dev->data->port_id); + goto error; } if ((cmsg->cmsg_type == SCM_CREDENTIALS) && (cmsg->cmsg_len >= sizeof(*cred))) { @@ -152,14 +165,16 @@ priv_socket_handle(struct priv *priv) } cmsg = CMSG_NXTHDR(&msg, cmsg); if (cmsg != NULL) { - WARN("Message wrongly formatted"); - goto out; + DRV_LOG(WARNING, "port %u message wrongly formatted", + dev->data->port_id); + goto error; } /* Make sure all the ancillary data was received and valid. */ if ((cred == NULL) || (cred->uid != getuid()) || (cred->gid != getgid())) { - WARN("wrong credentials"); - goto out; + DRV_LOG(WARNING, "port %u wrong credentials", + dev->data->port_id); + goto error; } /* Set-up the ancillary data. */ cmsg = CMSG_FIRSTHDR(&msg); @@ -171,27 +186,29 @@ priv_socket_handle(struct priv *priv) *fd = priv->ctx->cmd_fd; ret = sendmsg(conn_sock, &msg, 0); if (ret < 0) - WARN("cannot send response"); -out: + DRV_LOG(WARNING, "port %u cannot send response", + dev->data->port_id); +error: close(conn_sock); } /** * Connect to the primary process. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet structure. * * @return - * fd on success, negative errno value on failure. + * fd on success, negative errno value otherwise and rte_errno is set. */ int -priv_socket_connect(struct priv *priv) +mlx5_socket_connect(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct sockaddr_un sun = { .sun_family = AF_UNIX, }; - int socket_fd; + int socket_fd = -1; int *fd = NULL; int ret; struct ucred *cred; @@ -211,57 +228,75 @@ priv_socket_connect(struct priv *priv) ret = socket(AF_UNIX, SOCK_STREAM, 0); if (ret < 0) { - WARN("cannot connect to primary"); - return ret; + rte_errno = errno; + DRV_LOG(WARNING, "port %u cannot connect to primary", + dev->data->port_id); + goto error; } socket_fd = ret; snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d", MLX5_DRIVER_NAME, priv->primary_socket); ret = connect(socket_fd, (const struct sockaddr *)&sun, sizeof(sun)); if (ret < 0) { - WARN("cannot connect to primary"); - goto out; + rte_errno = errno; + DRV_LOG(WARNING, "port %u cannot connect to primary", + dev->data->port_id); + goto error; } cmsg = CMSG_FIRSTHDR(&msg); if (cmsg == NULL) { - DEBUG("cannot get first message"); - goto out; + rte_errno = EINVAL; + DRV_LOG(DEBUG, "port %u cannot get first message", + dev->data->port_id); + goto error; } cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_CREDENTIALS; cmsg->cmsg_len = CMSG_LEN(sizeof(*cred)); cred = (struct ucred *)CMSG_DATA(cmsg); if (cred == NULL) { - DEBUG("no credentials received"); - goto out; + rte_errno = EINVAL; + DRV_LOG(DEBUG, "port %u no credentials received", + dev->data->port_id); + goto error; } cred->pid = getpid(); cred->uid = getuid(); cred->gid = getgid(); ret = sendmsg(socket_fd, &msg, MSG_DONTWAIT); if (ret < 0) { - WARN("cannot send credentials to primary: %s", - strerror(errno)); - goto out; + rte_errno = errno; + DRV_LOG(WARNING, + "port %u cannot send credentials to primary: %s", + dev->data->port_id, strerror(errno)); + goto error; } ret = recvmsg(socket_fd, &msg, MSG_WAITALL); if (ret <= 0) { - WARN("no message from primary: %s", strerror(errno)); - goto out; + rte_errno = errno; + DRV_LOG(WARNING, "port %u no message from primary: %s", + dev->data->port_id, strerror(errno)); + goto error; } cmsg = CMSG_FIRSTHDR(&msg); if (cmsg == NULL) { - WARN("No file descriptor received"); - goto out; + rte_errno = EINVAL; + DRV_LOG(WARNING, "port %u no file descriptor received", + dev->data->port_id); + goto error; } fd = (int *)CMSG_DATA(cmsg); - if (*fd <= 0) { - WARN("no file descriptor received: %s", strerror(errno)); - ret = *fd; - goto out; + if (*fd < 0) { + DRV_LOG(WARNING, "port %u no file descriptor received: %s", + dev->data->port_id, strerror(errno)); + rte_errno = *fd; + goto error; } ret = *fd; -out: close(socket_fd); return ret; +error: + if (socket_fd != -1) + close(socket_fd); + return -rte_errno; } diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index 378472a7..875dd102 100644 --- a/drivers/net/mlx5/mlx5_stats.c +++ b/drivers/net/mlx5/mlx5_stats.c @@ -1,10 +1,13 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ +#include <inttypes.h> #include <linux/sockios.h> #include <linux/ethtool.h> +#include <stdint.h> +#include <stdio.h> #include <rte_ethdev_driver.h> #include <rte_common.h> @@ -19,6 +22,7 @@ struct mlx5_counter_ctrl { char dpdk_name[RTE_ETH_XSTATS_NAME_SIZE]; /* Name of the counter on the device table. */ char ctr_name[RTE_ETH_XSTATS_NAME_SIZE]; + uint32_t ib:1; /**< Nonzero for IB counters. */ }; static const struct mlx5_counter_ctrl mlx5_counters_init[] = { @@ -93,6 +97,7 @@ static const struct mlx5_counter_ctrl mlx5_counters_init[] = { { .dpdk_name = "rx_out_of_buffer", .ctr_name = "out_of_buffer", + .ib = 1, }, { .dpdk_name = "tx_packets_phy", @@ -117,39 +122,56 @@ static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init); /** * Read device counters table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[out] stats * Counters table output buffer. * * @return - * 0 on success and stats is filled, negative on error. + * 0 on success and stats is filled, negative errno value otherwise and + * rte_errno is set. */ static int -priv_read_dev_counters(struct priv *priv, uint64_t *stats) +mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats) { + struct priv *priv = dev->data->dev_private; struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; unsigned int i; struct ifreq ifr; unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t); unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz]; struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf; + int ret; et_stats->cmd = ETHTOOL_GSTATS; et_stats->n_stats = xstats_ctrl->stats_n; ifr.ifr_data = (caddr_t)et_stats; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr) != 0) { - WARN("unable to read statistic values from device"); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, + "port %u unable to read statistic values from device", + dev->data->port_id); + return ret; } for (i = 0; i != xstats_n; ++i) { - if (priv_is_ib_cntr(mlx5_counters_init[i].ctr_name)) - priv_get_cntr_sysfs(priv, - mlx5_counters_init[i].ctr_name, - &stats[i]); - else + if (mlx5_counters_init[i].ib) { + FILE *file; + MKSTR(path, "%s/ports/1/hw_counters/%s", + priv->ibdev_path, + mlx5_counters_init[i].ctr_name); + + file = fopen(path, "rb"); + if (file) { + int n = fscanf(file, "%" SCNu64, &stats[i]); + + fclose(file); + if (n != 1) + stats[i] = 0; + } + } else { stats[i] = (uint64_t) et_stats->data[xstats_ctrl->dev_table_idx[i]]; + } } return 0; } @@ -157,22 +179,26 @@ priv_read_dev_counters(struct priv *priv, uint64_t *stats) /** * Query the number of statistics provided by ETHTOOL. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return - * Number of statistics on success, -1 on error. + * Number of statistics on success, negative errno value otherwise and + * rte_errno is set. */ static int -priv_ethtool_get_stats_n(struct priv *priv) { +mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) { struct ethtool_drvinfo drvinfo; struct ifreq ifr; + int ret; drvinfo.cmd = ETHTOOL_GDRVINFO; ifr.ifr_data = (caddr_t)&drvinfo; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr) != 0) { - WARN("unable to query number of statistics"); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u unable to query number of statistics", + dev->data->port_id); + return ret; } return drvinfo.n_stats; } @@ -180,12 +206,13 @@ priv_ethtool_get_stats_n(struct priv *priv) { /** * Init the structures to read device counters. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. */ void -priv_xstats_init(struct priv *priv) +mlx5_xstats_init(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; unsigned int i; unsigned int j; @@ -193,12 +220,15 @@ priv_xstats_init(struct priv *priv) struct ethtool_gstrings *strings = NULL; unsigned int dev_stats_n; unsigned int str_sz; + int ret; - dev_stats_n = priv_ethtool_get_stats_n(priv); - if (dev_stats_n < 1) { - WARN("no extended statistics available"); + ret = mlx5_ethtool_get_stats_n(dev); + if (ret < 0) { + DRV_LOG(WARNING, "port %u no extended statistics available", + dev->data->port_id); return; } + dev_stats_n = ret; xstats_ctrl->stats_n = dev_stats_n; /* Allocate memory to grab stat names and values. */ str_sz = dev_stats_n * ETH_GSTRING_LEN; @@ -206,15 +236,18 @@ priv_xstats_init(struct priv *priv) rte_malloc("xstats_strings", str_sz + sizeof(struct ethtool_gstrings), 0); if (!strings) { - WARN("unable to allocate memory for xstats"); + DRV_LOG(WARNING, "port %u unable to allocate memory for xstats", + dev->data->port_id); return; } strings->cmd = ETHTOOL_GSTRINGS; strings->string_set = ETH_SS_STATS; strings->len = dev_stats_n; ifr.ifr_data = (caddr_t)strings; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr) != 0) { - WARN("unable to get statistic names"); + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); + if (ret) { + DRV_LOG(WARNING, "port %u unable to get statistic names", + dev->data->port_id); goto free; } for (j = 0; j != xstats_n; ++j) @@ -232,68 +265,67 @@ priv_xstats_init(struct priv *priv) } } for (j = 0; j != xstats_n; ++j) { - if (priv_is_ib_cntr(mlx5_counters_init[j].ctr_name)) + if (mlx5_counters_init[j].ib) continue; if (xstats_ctrl->dev_table_idx[j] >= dev_stats_n) { - WARN("counter \"%s\" is not recognized", - mlx5_counters_init[j].dpdk_name); + DRV_LOG(WARNING, + "port %u counter \"%s\" is not recognized", + dev->data->port_id, + mlx5_counters_init[j].dpdk_name); goto free; } } /* Copy to base at first time. */ assert(xstats_n <= MLX5_MAX_XSTATS); - priv_read_dev_counters(priv, xstats_ctrl->base); + ret = mlx5_read_dev_counters(dev, xstats_ctrl->base); + if (ret) + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); free: rte_free(strings); } /** - * Get device extended statistics. + * DPDK callback to get extended device statistics. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[out] stats * Pointer to rte extended stats table. + * @param n + * The size of the stats table. * * @return * Number of extended stats on success and stats is filled, - * negative on error. + * negative on error and rte_errno is set. */ -static int -priv_xstats_get(struct priv *priv, struct rte_eth_xstat *stats) +int +mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + unsigned int n) { - struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + struct priv *priv = dev->data->dev_private; unsigned int i; - unsigned int n = xstats_n; uint64_t counters[n]; - if (priv_read_dev_counters(priv, counters) < 0) - return -1; - for (i = 0; i != xstats_n; ++i) { - stats[i].id = i; - stats[i].value = (counters[i] - xstats_ctrl->base[i]); - } - return n; -} - -/** - * Reset device extended statistics. - * - * @param priv - * Pointer to private structure. - */ -static void -priv_xstats_reset(struct priv *priv) -{ - struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; - unsigned int i; - unsigned int n = xstats_n; - uint64_t counters[n]; + if (n >= xstats_n && stats) { + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + int stats_n; + int ret; - if (priv_read_dev_counters(priv, counters) < 0) - return; - for (i = 0; i != n; ++i) - xstats_ctrl->base[i] = counters[i]; + stats_n = mlx5_ethtool_get_stats_n(dev); + if (stats_n < 0) + return stats_n; + if (xstats_ctrl->stats_n != stats_n) + mlx5_xstats_init(dev); + ret = mlx5_read_dev_counters(dev, counters); + if (ret) + return ret; + for (i = 0; i != xstats_n; ++i) { + stats[i].id = i; + stats[i].value = (counters[i] - xstats_ctrl->base[i]); + } + } + return xstats_n; } /** @@ -303,6 +335,10 @@ priv_xstats_reset(struct priv *priv) * Pointer to Ethernet device structure. * @param[out] stats * Stats structure output buffer. + * + * @return + * 0 on success and stats is filled, negative errno value otherwise and + * rte_errno is set. */ int mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) @@ -312,7 +348,6 @@ mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) unsigned int i; unsigned int idx; - priv_lock(priv); /* Add software counters. */ for (i = 0; (i != priv->rxqs_n); ++i) { struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; @@ -358,7 +393,6 @@ mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) /* FIXME: retrieve and add hardware counters. */ #endif *stats = tmp; - priv_unlock(priv); return 0; } @@ -375,7 +409,6 @@ mlx5_stats_reset(struct rte_eth_dev *dev) unsigned int i; unsigned int idx; - priv_lock(priv); for (i = 0; (i != priv->rxqs_n); ++i) { if ((*priv->rxqs)[i] == NULL) continue; @@ -393,45 +426,6 @@ mlx5_stats_reset(struct rte_eth_dev *dev) #ifndef MLX5_PMD_SOFT_COUNTERS /* FIXME: reset hardware counters. */ #endif - priv_unlock(priv); -} - -/** - * DPDK callback to get extended device statistics. - * - * @param dev - * Pointer to Ethernet device structure. - * @param[out] stats - * Stats table output buffer. - * @param n - * The size of the stats table. - * - * @return - * Number of xstats on success, negative on failure. - */ -int -mlx5_xstats_get(struct rte_eth_dev *dev, - struct rte_eth_xstat *stats, unsigned int n) -{ - struct priv *priv = dev->data->dev_private; - int ret = xstats_n; - - if (n >= xstats_n && stats) { - struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; - int stats_n; - - priv_lock(priv); - stats_n = priv_ethtool_get_stats_n(priv); - if (stats_n < 0) { - priv_unlock(priv); - return -1; - } - if (xstats_ctrl->stats_n != stats_n) - priv_xstats_init(priv); - ret = priv_xstats_get(priv, stats); - priv_unlock(priv); - } - return ret; } /** @@ -446,16 +440,27 @@ mlx5_xstats_reset(struct rte_eth_dev *dev) struct priv *priv = dev->data->dev_private; struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; int stats_n; + unsigned int i; + unsigned int n = xstats_n; + uint64_t counters[n]; + int ret; - priv_lock(priv); - stats_n = priv_ethtool_get_stats_n(priv); - if (stats_n < 0) - goto unlock; + stats_n = mlx5_ethtool_get_stats_n(dev); + if (stats_n < 0) { + DRV_LOG(ERR, "port %u cannot get stats: %s", dev->data->port_id, + strerror(-stats_n)); + return; + } if (xstats_ctrl->stats_n != stats_n) - priv_xstats_init(priv); - priv_xstats_reset(priv); -unlock: - priv_unlock(priv); + mlx5_xstats_init(dev); + ret = mlx5_read_dev_counters(dev, counters); + if (ret) { + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); + return; + } + for (i = 0; i != n; ++i) + xstats_ctrl->base[i] = counters[i]; } /** @@ -472,21 +477,18 @@ unlock: * Number of xstats names. */ int -mlx5_xstats_get_names(struct rte_eth_dev *dev, - struct rte_eth_xstat_name *xstats_names, unsigned int n) +mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_xstat_name *xstats_names, unsigned int n) { - struct priv *priv = dev->data->dev_private; unsigned int i; if (n >= xstats_n && xstats_names) { - priv_lock(priv); for (i = 0; i != xstats_n; ++i) { strncpy(xstats_names[i].name, mlx5_counters_init[i].dpdk_name, RTE_ETH_XSTATS_NAME_SIZE); xstats_names[i].name[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0; } - priv_unlock(priv); } return xstats_n; } diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c index f5711a99..3e7c0a90 100644 --- a/drivers/net/mlx5/mlx5_trigger.c +++ b/drivers/net/mlx5/mlx5_trigger.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <unistd.h> @@ -14,83 +14,125 @@ #include "mlx5_rxtx.h" #include "mlx5_utils.h" +/** + * Stop traffic on Tx queues. + * + * @param dev + * Pointer to Ethernet device structure. + */ static void -priv_txq_stop(struct priv *priv) +mlx5_txq_stop(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; for (i = 0; i != priv->txqs_n; ++i) - mlx5_priv_txq_release(priv, i); + mlx5_txq_release(dev, i); } +/** + * Start traffic on Tx queues. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ static int -priv_txq_start(struct priv *priv) +mlx5_txq_start(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; - int ret = 0; + int ret; /* Add memory regions to Tx queues. */ for (i = 0; i != priv->txqs_n; ++i) { - unsigned int idx = 0; - struct mlx5_mr *mr; - struct mlx5_txq_ctrl *txq_ctrl = mlx5_priv_txq_get(priv, i); + struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i); if (!txq_ctrl) continue; - LIST_FOREACH(mr, &priv->mr, next) { - priv_txq_mp2mr_reg(priv, &txq_ctrl->txq, mr->mp, idx++); - if (idx == MLX5_PMD_TX_MP_CACHE) - break; - } txq_alloc_elts(txq_ctrl); - txq_ctrl->ibv = mlx5_priv_txq_ibv_new(priv, i); + txq_ctrl->ibv = mlx5_txq_ibv_new(dev, i); if (!txq_ctrl->ibv) { - ret = ENOMEM; + rte_errno = ENOMEM; goto error; } } - ret = priv_tx_uar_remap(priv, priv->ctx->cmd_fd); + ret = mlx5_tx_uar_remap(dev, priv->ctx->cmd_fd); if (ret) goto error; - return ret; + return 0; error: - priv_txq_stop(priv); - return ret; + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_txq_stop(dev); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } +/** + * Stop traffic on Rx queues. + * + * @param dev + * Pointer to Ethernet device structure. + */ static void -priv_rxq_stop(struct priv *priv) +mlx5_rxq_stop(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; for (i = 0; i != priv->rxqs_n; ++i) - mlx5_priv_rxq_release(priv, i); + mlx5_rxq_release(dev, i); } +/** + * Start traffic on Rx queues. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ static int -priv_rxq_start(struct priv *priv) +mlx5_rxq_start(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; int ret = 0; + /* Allocate/reuse/resize mempool for Multi-Packet RQ. */ + if (mlx5_mprq_alloc_mp(dev)) + goto error; for (i = 0; i != priv->rxqs_n; ++i) { - struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_priv_rxq_get(priv, i); + struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i); + struct rte_mempool *mp; if (!rxq_ctrl) continue; + /* Pre-register Rx mempool. */ + mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + rxq_ctrl->rxq.mprq_mp : rxq_ctrl->rxq.mp; + DRV_LOG(DEBUG, + "port %u Rx queue %u registering" + " mp %s having %u chunks", + dev->data->port_id, rxq_ctrl->idx, + mp->name, mp->nb_mem_chunks); + mlx5_mr_update_mp(dev, &rxq_ctrl->rxq.mr_ctrl, mp); ret = rxq_alloc_elts(rxq_ctrl); if (ret) goto error; - rxq_ctrl->ibv = mlx5_priv_rxq_ibv_new(priv, i); - if (!rxq_ctrl->ibv) { - ret = ENOMEM; + rxq_ctrl->ibv = mlx5_rxq_ibv_new(dev, i); + if (!rxq_ctrl->ibv) goto error; - } } - return -ret; + return 0; error: - priv_rxq_stop(priv); - return -ret; + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_rxq_stop(dev); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** @@ -102,68 +144,64 @@ error: * Pointer to Ethernet device structure. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_start(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; - struct mlx5_mr *mr = NULL; - int err; + int ret; dev->data->dev_started = 1; - priv_lock(priv); - err = priv_flow_create_drop_queue(priv); - if (err) { - ERROR("%p: Drop queue allocation failed: %s", - (void *)dev, strerror(err)); + DRV_LOG(DEBUG, "port %u allocating and configuring hash Rx queues", + dev->data->port_id); + ret = mlx5_txq_start(dev); + if (ret) { + DRV_LOG(ERR, "port %u Tx queue allocation failed: %s", + dev->data->port_id, strerror(rte_errno)); goto error; } - DEBUG("%p: allocating and configuring hash RX queues", (void *)dev); - rte_mempool_walk(mlx5_mp2mr_iter, priv); - err = priv_txq_start(priv); - if (err) { - ERROR("%p: TXQ allocation failed: %s", - (void *)dev, strerror(err)); + ret = mlx5_rxq_start(dev); + if (ret) { + DRV_LOG(ERR, "port %u Rx queue allocation failed: %s", + dev->data->port_id, strerror(rte_errno)); goto error; } - err = priv_rxq_start(priv); - if (err) { - ERROR("%p: RXQ allocation failed: %s", - (void *)dev, strerror(err)); + if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG) + mlx5_mr_dump_dev(dev); + ret = mlx5_rx_intr_vec_enable(dev); + if (ret) { + DRV_LOG(ERR, "port %u Rx interrupt vector creation failed", + dev->data->port_id); goto error; } - err = priv_rx_intr_vec_enable(priv); - if (err) { - ERROR("%p: RX interrupt vector creation failed", - (void *)priv); + mlx5_xstats_init(dev); + ret = mlx5_traffic_enable(dev); + if (ret) { + DRV_LOG(DEBUG, "port %u failed to set defaults flows", + dev->data->port_id); goto error; } - priv_xstats_init(priv); - /* Update link status and Tx/Rx callbacks for the first time. */ - memset(&dev->data->dev_link, 0, sizeof(struct rte_eth_link)); - INFO("Forcing port %u link to be up", dev->data->port_id); - err = priv_force_link_status_change(priv, ETH_LINK_UP); - if (err) { - DEBUG("Failed to set port %u link to be up", - dev->data->port_id); + ret = mlx5_flow_start(dev, &priv->flows); + if (ret) { + DRV_LOG(DEBUG, "port %u failed to set flows", + dev->data->port_id); goto error; } - priv_dev_interrupt_handler_install(priv, dev); - priv_unlock(priv); + dev->tx_pkt_burst = mlx5_select_tx_function(dev); + dev->rx_pkt_burst = mlx5_select_rx_function(dev); + mlx5_dev_interrupt_handler_install(dev); return 0; error: + ret = rte_errno; /* Save rte_errno before cleanup. */ /* Rollback. */ dev->data->dev_started = 0; - for (mr = LIST_FIRST(&priv->mr); mr; mr = LIST_FIRST(&priv->mr)) - priv_mr_release(priv, mr); - priv_flow_stop(priv, &priv->flows); - priv_dev_traffic_disable(priv, dev); - priv_txq_stop(priv); - priv_rxq_stop(priv); - priv_flow_delete_drop_queue(priv); - priv_unlock(priv); - return err; + mlx5_flow_stop(dev, &priv->flows); + mlx5_traffic_disable(dev); + mlx5_txq_stop(dev); + mlx5_rxq_stop(dev); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** @@ -178,42 +216,38 @@ void mlx5_dev_stop(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; - struct mlx5_mr *mr; - priv_lock(priv); dev->data->dev_started = 0; /* Prevent crashes when queues are still in use. */ dev->rx_pkt_burst = removed_rx_burst; dev->tx_pkt_burst = removed_tx_burst; rte_wmb(); usleep(1000 * priv->rxqs_n); - DEBUG("%p: cleaning up and destroying hash RX queues", (void *)dev); - priv_flow_stop(priv, &priv->flows); - priv_dev_traffic_disable(priv, dev); - priv_rx_intr_vec_disable(priv); - priv_dev_interrupt_handler_uninstall(priv, dev); - priv_txq_stop(priv); - priv_rxq_stop(priv); - for (mr = LIST_FIRST(&priv->mr); mr; mr = LIST_FIRST(&priv->mr)) - priv_mr_release(priv, mr); - priv_flow_delete_drop_queue(priv); - priv_unlock(priv); + DRV_LOG(DEBUG, "port %u cleaning up and destroying hash Rx queues", + dev->data->port_id); + mlx5_flow_stop(dev, &priv->flows); + mlx5_traffic_disable(dev); + mlx5_rx_intr_vec_disable(dev); + mlx5_dev_interrupt_handler_uninstall(dev); + mlx5_txq_stop(dev); + mlx5_rxq_stop(dev); } /** * Enable traffic flows configured by control plane * - * @param priv + * @param dev * Pointer to Ethernet device private data. * @param dev * Pointer to Ethernet device structure. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) +mlx5_traffic_enable(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct rte_flow_item_eth bcast = { .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", }; @@ -246,8 +280,9 @@ priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) .type = 0, }; - claim_zero(mlx5_ctrl_flow(dev, &promisc, &promisc)); - return 0; + ret = mlx5_ctrl_flow(dev, &promisc, &promisc); + if (ret) + goto error; } if (dev->data->all_multicast) { struct rte_flow_item_eth multicast = { @@ -256,7 +291,9 @@ priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) .type = 0, }; - claim_zero(mlx5_ctrl_flow(dev, &multicast, &multicast)); + ret = mlx5_ctrl_flow(dev, &multicast, &multicast); + if (ret) + goto error; } else { /* Add broadcast/multicast flows. */ for (i = 0; i != vlan_filter_n; ++i) { @@ -316,74 +353,49 @@ priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) goto error; } if (!vlan_filter_n) { - ret = mlx5_ctrl_flow(dev, &unicast, - &unicast_mask); + ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask); if (ret) goto error; } } return 0; error: - return rte_errno; + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_flow_list_flush(dev, &priv->ctrl_flows); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** * Disable traffic flows configured by control plane * - * @param priv - * Pointer to Ethernet device private data. * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success. - */ -int -priv_dev_traffic_disable(struct priv *priv, struct rte_eth_dev *dev) -{ - (void)dev; - priv_flow_flush(priv, &priv->ctrl_flows); - return 0; -} - -/** - * Restart traffic flows configured by control plane - * - * @param priv * Pointer to Ethernet device private data. - * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success. */ -int -priv_dev_traffic_restart(struct priv *priv, struct rte_eth_dev *dev) +void +mlx5_traffic_disable(struct rte_eth_dev *dev) { - if (dev->data->dev_started) { - priv_dev_traffic_disable(priv, dev); - priv_dev_traffic_enable(priv, dev); - } - return 0; + struct priv *priv = dev->data->dev_private; + + mlx5_flow_list_flush(dev, &priv->ctrl_flows); } /** * Restart traffic flows configured by control plane * * @param dev - * Pointer to Ethernet device structure. + * Pointer to Ethernet device private data. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_traffic_restart(struct rte_eth_dev *dev) { - struct priv *priv = dev->data->dev_private; - - priv_lock(priv); - priv_dev_traffic_restart(priv, dev); - priv_unlock(priv); + if (dev->data->dev_started) { + mlx5_traffic_disable(dev); + return mlx5_traffic_enable(dev); + } return 0; } diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index ed1c713e..691ea071 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -47,7 +47,8 @@ txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl) for (i = 0; (i != elts_n); ++i) (*txq_ctrl->txq.elts)[i] = NULL; - DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n); + DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs", + PORT_ID(txq_ctrl->priv), txq_ctrl->idx, elts_n); txq_ctrl->txq.elts_head = 0; txq_ctrl->txq.elts_tail = 0; txq_ctrl->txq.elts_comp = 0; @@ -68,7 +69,8 @@ txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl) uint16_t elts_tail = txq_ctrl->txq.elts_tail; struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts; - DEBUG("%p: freeing WRs", (void *)txq_ctrl); + DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs", + PORT_ID(txq_ctrl->priv), txq_ctrl->idx); txq_ctrl->txq.elts_head = 0; txq_ctrl->txq.elts_tail = 0; txq_ctrl->txq.elts_comp = 0; @@ -91,15 +93,16 @@ txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl) /** * Returns the per-port supported offloads. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return * Supported Tx offloads. */ uint64_t -mlx5_priv_get_tx_port_offloads(struct priv *priv) +mlx5_get_tx_port_offloads(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; uint64_t offloads = (DEV_TX_OFFLOAD_MULTI_SEGS | DEV_TX_OFFLOAD_VLAN_INSERT); struct mlx5_dev_config *config = &priv->config; @@ -116,36 +119,14 @@ mlx5_priv_get_tx_port_offloads(struct priv *priv) if (config->tso) offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO | DEV_TX_OFFLOAD_GRE_TNL_TSO); + if (config->swp) + offloads |= (DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO); } return offloads; } /** - * Checks if the per-queue offload configuration is valid. - * - * @param priv - * Pointer to private structure. - * @param offloads - * Per-queue offloads configuration. - * - * @return - * 1 if the configuration is valid, 0 otherwise. - */ -static int -priv_is_tx_queue_offloads_allowed(struct priv *priv, uint64_t offloads) -{ - uint64_t port_offloads = priv->dev->data->dev_conf.txmode.offloads; - uint64_t port_supp_offloads = mlx5_priv_get_tx_port_offloads(priv); - - /* There are no Tx offloads which are per queue. */ - if ((offloads & port_supp_offloads) != offloads) - return 0; - if ((port_offloads ^ offloads) & port_supp_offloads) - return 0; - return 1; -} - -/** * DPDK callback to configure a TX queue. * * @param dev @@ -160,7 +141,7 @@ priv_is_tx_queue_offloads_allowed(struct priv *priv, uint64_t offloads) * Thresholds parameters. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, @@ -170,64 +151,47 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct mlx5_txq_data *txq = (*priv->txqs)[idx]; struct mlx5_txq_ctrl *txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); - int ret = 0; - priv_lock(priv); - /* - * Don't verify port offloads for application which - * use the old API. - */ - if (!!(conf->txq_flags & ETH_TXQ_FLAGS_IGNORE) && - !priv_is_tx_queue_offloads_allowed(priv, conf->offloads)) { - ret = ENOTSUP; - ERROR("%p: Tx queue offloads 0x%" PRIx64 " don't match port " - "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64, - (void *)dev, conf->offloads, - dev->data->dev_conf.txmode.offloads, - mlx5_priv_get_tx_port_offloads(priv)); - goto out; - } if (desc <= MLX5_TX_COMP_THRESH) { - WARN("%p: number of descriptors requested for TX queue %u" - " must be higher than MLX5_TX_COMP_THRESH, using" - " %u instead of %u", - (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc); + DRV_LOG(WARNING, + "port %u number of descriptors requested for Tx queue" + " %u must be higher than MLX5_TX_COMP_THRESH, using %u" + " instead of %u", + dev->data->port_id, idx, MLX5_TX_COMP_THRESH + 1, desc); desc = MLX5_TX_COMP_THRESH + 1; } if (!rte_is_power_of_2(desc)) { desc = 1 << log2above(desc); - WARN("%p: increased number of descriptors in TX queue %u" - " to the next power of two (%d)", - (void *)dev, idx, desc); + DRV_LOG(WARNING, + "port %u increased number of descriptors in Tx queue" + " %u to the next power of two (%d)", + dev->data->port_id, idx, desc); } - DEBUG("%p: configuring queue %u for %u descriptors", - (void *)dev, idx, desc); + DRV_LOG(DEBUG, "port %u configuring queue %u for %u descriptors", + dev->data->port_id, idx, desc); if (idx >= priv->txqs_n) { - ERROR("%p: queue index out of range (%u >= %u)", - (void *)dev, idx, priv->txqs_n); - priv_unlock(priv); - return -EOVERFLOW; + DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)", + dev->data->port_id, idx, priv->txqs_n); + rte_errno = EOVERFLOW; + return -rte_errno; } - if (!mlx5_priv_txq_releasable(priv, idx)) { - ret = EBUSY; - ERROR("%p: unable to release queue index %u", - (void *)dev, idx); - goto out; + if (!mlx5_txq_releasable(dev, idx)) { + rte_errno = EBUSY; + DRV_LOG(ERR, "port %u unable to release queue index %u", + dev->data->port_id, idx); + return -rte_errno; } - mlx5_priv_txq_release(priv, idx); - txq_ctrl = mlx5_priv_txq_new(priv, idx, desc, socket, conf); + mlx5_txq_release(dev, idx); + txq_ctrl = mlx5_txq_new(dev, idx, desc, socket, conf); if (!txq_ctrl) { - ERROR("%p: unable to allocate queue index %u", - (void *)dev, idx); - ret = ENOMEM; - goto out; + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + return -rte_errno; } - DEBUG("%p: adding TX queue %p to list", - (void *)dev, (void *)txq_ctrl); + DRV_LOG(DEBUG, "port %u adding Tx queue %u to list", + dev->data->port_id, idx); (*priv->txqs)[idx] = &txq_ctrl->txq; -out: - priv_unlock(priv); - return -ret; + return 0; } /** @@ -248,15 +212,13 @@ mlx5_tx_queue_release(void *dpdk_txq) return; txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); priv = txq_ctrl->priv; - priv_lock(priv); for (i = 0; (i != priv->txqs_n); ++i) if ((*priv->txqs)[i] == txq) { - DEBUG("%p: removing TX queue %p from list", - (void *)priv->dev, (void *)txq_ctrl); - mlx5_priv_txq_release(priv, i); + mlx5_txq_release(ETH_DEV(priv), i); + DRV_LOG(DEBUG, "port %u removing Tx queue %u from list", + PORT_ID(priv), txq_ctrl->idx); break; } - priv_unlock(priv); } @@ -265,17 +227,18 @@ mlx5_tx_queue_release(void *dpdk_txq) * Both primary and secondary process do mmap to make UAR address * aligned. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * @param fd * Verbs file descriptor to map UAR pages. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_tx_uar_remap(struct priv *priv, int fd) +mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd) { + struct priv *priv = dev->data->dev_private; unsigned int i, j; uintptr_t pages[priv->txqs_n]; unsigned int pages_n = 0; @@ -287,7 +250,6 @@ priv_tx_uar_remap(struct priv *priv, int fd) struct mlx5_txq_ctrl *txq_ctrl; int already_mapped; size_t page_size = sysconf(_SC_PAGESIZE); - int r; memset(pages, 0, priv->txqs_n * sizeof(uintptr_t)); /* @@ -300,6 +262,7 @@ priv_tx_uar_remap(struct priv *priv, int fd) continue; txq = (*priv->txqs)[i]; txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); + assert(txq_ctrl->idx == (uint16_t)i); /* UAR addr form verbs used to find dup and offset in page. */ uar_va = (uintptr_t)txq_ctrl->bf_reg_orig; off = uar_va & (page_size - 1); /* offset in page. */ @@ -324,10 +287,12 @@ priv_tx_uar_remap(struct priv *priv, int fd) txq_ctrl->uar_mmap_offset); if (ret != addr) { /* fixed mmap have to return same address */ - ERROR("call to mmap failed on UAR for txq %d\n", - i); - r = ENXIO; - return r; + DRV_LOG(ERR, + "port %u call to mmap failed on UAR" + " for txq %u", + dev->data->port_id, txq_ctrl->idx); + rte_errno = ENXIO; + return -rte_errno; } } if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once */ @@ -361,17 +326,18 @@ is_empw_burst_func(eth_tx_burst_t tx_pkt_burst) /** * Create the Tx queue Verbs object. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * Queue index in DPDK Rx queue array * * @return - * The Verbs object initialised if it can be created. + * The Verbs object initialised, NULL otherwise and rte_errno is set. */ -struct mlx5_txq_ibv* -mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) +struct mlx5_txq_ibv * +mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_data *txq_data = (*priv->txqs)[idx]; struct mlx5_txq_ctrl *txq_ctrl = container_of(txq_data, struct mlx5_txq_ctrl, txq); @@ -388,18 +354,20 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) struct mlx5dv_cq cq_info; struct mlx5dv_obj obj; const int desc = 1 << txq_data->elts_n; - eth_tx_burst_t tx_pkt_burst = priv_select_tx_function(priv, priv->dev); + eth_tx_burst_t tx_pkt_burst = mlx5_select_tx_function(dev); int ret = 0; assert(txq_data); priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_TX_QUEUE; priv->verbs_alloc_ctx.obj = txq_ctrl; if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) { - ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set"); - goto error; + DRV_LOG(ERR, + "port %u MLX5_ENABLE_CQE_COMPRESSION must never be set", + dev->data->port_id); + rte_errno = EINVAL; + return NULL; } memset(&tmpl, 0, sizeof(struct mlx5_txq_ibv)); - /* MRs will be registered in mp2mr[] later. */ attr.cq = (struct ibv_cq_init_attr_ex){ .comp_mask = 0, }; @@ -409,7 +377,9 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV; tmpl.cq = mlx5_glue->create_cq(priv->ctx, cqe_n, NULL, NULL, 0); if (tmpl.cq == NULL) { - ERROR("%p: CQ creation failure", (void *)txq_ctrl); + DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure", + dev->data->port_id, idx); + rte_errno = errno; goto error; } attr.init = (struct ibv_qp_init_attr_ex){ @@ -450,7 +420,9 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) } tmpl.qp = mlx5_glue->create_qp_ex(priv->ctx, &attr.init); if (tmpl.qp == NULL) { - ERROR("%p: QP creation failure", (void *)txq_ctrl); + DRV_LOG(ERR, "port %u Tx queue %u QP creation failure", + dev->data->port_id, idx); + rte_errno = errno; goto error; } attr.mod = (struct ibv_qp_attr){ @@ -462,7 +434,10 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, (IBV_QP_STATE | IBV_QP_PORT)); if (ret) { - ERROR("%p: QP state to IBV_QPS_INIT failed", (void *)txq_ctrl); + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_INIT failed", + dev->data->port_id, idx); + rte_errno = errno; goto error; } attr.mod = (struct ibv_qp_attr){ @@ -470,19 +445,27 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) }; ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE); if (ret) { - ERROR("%p: QP state to IBV_QPS_RTR failed", (void *)txq_ctrl); + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_RTR failed", + dev->data->port_id, idx); + rte_errno = errno; goto error; } attr.mod.qp_state = IBV_QPS_RTS; ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE); if (ret) { - ERROR("%p: QP state to IBV_QPS_RTS failed", (void *)txq_ctrl); + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_RTS failed", + dev->data->port_id, idx); + rte_errno = errno; goto error; } txq_ibv = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_txq_ibv), 0, txq_ctrl->socket); if (!txq_ibv) { - ERROR("%p: cannot allocate memory", (void *)txq_ctrl); + DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory", + dev->data->port_id, idx); + rte_errno = ENOMEM; goto error; } obj.cq.in = tmpl.cq; @@ -490,11 +473,16 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) obj.qp.in = tmpl.qp; obj.qp.out = &qp; ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP); - if (ret != 0) + if (ret != 0) { + rte_errno = errno; goto error; + } if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) { - ERROR("Wrong MLX5_CQE_SIZE environment variable value: " - "it should be set to %u", RTE_CACHE_LINE_SIZE); + DRV_LOG(ERR, + "port %u wrong MLX5_CQE_SIZE environment variable" + " value: it should be set to %u", + dev->data->port_id, RTE_CACHE_LINE_SIZE); + rte_errno = EINVAL; goto error; } txq_data->cqe_n = log2above(cq_info.cqe_cnt); @@ -519,37 +507,45 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) { txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset; } else { - ERROR("Failed to retrieve UAR info, invalid libmlx5.so version"); + DRV_LOG(ERR, + "port %u failed to retrieve UAR info, invalid" + " libmlx5.so", + dev->data->port_id); + rte_errno = EINVAL; goto error; } - DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv, - (void *)txq_ibv, rte_atomic32_read(&txq_ibv->refcnt)); + DRV_LOG(DEBUG, "port %u Verbs Tx queue %u: refcnt %d", + dev->data->port_id, idx, rte_atomic32_read(&txq_ibv->refcnt)); LIST_INSERT_HEAD(&priv->txqsibv, txq_ibv, next); + txq_ibv->txq_ctrl = txq_ctrl; priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; return txq_ibv; error: + ret = rte_errno; /* Save rte_errno before cleanup. */ if (tmpl.cq) claim_zero(mlx5_glue->destroy_cq(tmpl.cq)); if (tmpl.qp) claim_zero(mlx5_glue->destroy_qp(tmpl.qp)); priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + rte_errno = ret; /* Restore rte_errno. */ return NULL; } /** * Get an Tx queue Verbs object. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * Queue index in DPDK Rx queue array * * @return * The Verbs object if it exists. */ -struct mlx5_txq_ibv* -mlx5_priv_txq_ibv_get(struct priv *priv, uint16_t idx) +struct mlx5_txq_ibv * +mlx5_txq_ibv_get(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *txq_ctrl; if (idx >= priv->txqs_n) @@ -559,8 +555,8 @@ mlx5_priv_txq_ibv_get(struct priv *priv, uint16_t idx) txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); if (txq_ctrl->ibv) { rte_atomic32_inc(&txq_ctrl->ibv->refcnt); - DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv, - (void *)txq_ctrl->ibv, + DRV_LOG(DEBUG, "port %u Verbs Tx queue %u: refcnt %d", + dev->data->port_id, txq_ctrl->idx, rte_atomic32_read(&txq_ctrl->ibv->refcnt)); } return txq_ctrl->ibv; @@ -569,21 +565,19 @@ mlx5_priv_txq_ibv_get(struct priv *priv, uint16_t idx) /** * Release an Tx verbs queue object. * - * @param priv - * Pointer to private structure. * @param txq_ibv * Verbs Tx queue object. * * @return - * 0 on success, errno on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_txq_ibv_release(struct priv *priv, struct mlx5_txq_ibv *txq_ibv) +mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv) { - (void)priv; assert(txq_ibv); - DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv, - (void *)txq_ibv, rte_atomic32_read(&txq_ibv->refcnt)); + DRV_LOG(DEBUG, "port %u Verbs Tx queue %u: refcnt %d", + PORT_ID(txq_ibv->txq_ctrl->priv), + txq_ibv->txq_ctrl->idx, rte_atomic32_read(&txq_ibv->refcnt)); if (rte_atomic32_dec_and_test(&txq_ibv->refcnt)) { claim_zero(mlx5_glue->destroy_qp(txq_ibv->qp)); claim_zero(mlx5_glue->destroy_cq(txq_ibv->cq)); @@ -591,21 +585,18 @@ mlx5_priv_txq_ibv_release(struct priv *priv, struct mlx5_txq_ibv *txq_ibv) rte_free(txq_ibv); return 0; } - return EBUSY; + return 1; } /** * Return true if a single reference exists on the object. * - * @param priv - * Pointer to private structure. * @param txq_ibv * Verbs Tx queue object. */ int -mlx5_priv_txq_ibv_releasable(struct priv *priv, struct mlx5_txq_ibv *txq_ibv) +mlx5_txq_ibv_releasable(struct mlx5_txq_ibv *txq_ibv) { - (void)priv; assert(txq_ibv); return (rte_atomic32_read(&txq_ibv->refcnt) == 1); } @@ -613,20 +604,22 @@ mlx5_priv_txq_ibv_releasable(struct priv *priv, struct mlx5_txq_ibv *txq_ibv) /** * Verify the Verbs Tx queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_txq_ibv_verify(struct priv *priv) +mlx5_txq_ibv_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; int ret = 0; struct mlx5_txq_ibv *txq_ibv; LIST_FOREACH(txq_ibv, &priv->txqsibv, next) { - DEBUG("%p: Verbs Tx queue %p still referenced", (void *)priv, - (void *)txq_ibv); + DRV_LOG(DEBUG, "port %u Verbs Tx queue %u still referenced", + dev->data->port_id, txq_ibv->txq_ctrl->idx); ++ret; } return ret; @@ -649,9 +642,14 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) unsigned int txq_inline; unsigned int txqs_inline; unsigned int inline_max_packet_sz; - eth_tx_burst_t tx_pkt_burst = priv_select_tx_function(priv, priv->dev); + eth_tx_burst_t tx_pkt_burst = + mlx5_select_tx_function(ETH_DEV(priv)); int is_empw_func = is_empw_burst_func(tx_pkt_burst); - int tso = !!(txq_ctrl->txq.offloads & DEV_TX_OFFLOAD_TCP_TSO); + int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO | + DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO)); txq_inline = (config->txq_inline == MLX5_ARG_UNSET) ? 0 : config->txq_inline; @@ -685,18 +683,6 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) inline_max_packet_sz) + (RTE_CACHE_LINE_SIZE - 1)) / RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE; - } else if (tso) { - int inline_diff = txq_ctrl->txq.max_inline - - max_tso_inline; - - /* - * Adjust inline value as Verbs aggregates - * tso_inline and txq_inline fields. - */ - txq_ctrl->max_inline_data = inline_diff > 0 ? - inline_diff * - RTE_CACHE_LINE_SIZE : - 0; } else { txq_ctrl->max_inline_data = txq_ctrl->txq.max_inline * RTE_CACHE_LINE_SIZE; @@ -716,9 +702,10 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) max_inline = max_inline - (max_inline % RTE_CACHE_LINE_SIZE); - WARN("txq inline is too large (%d) setting it to " - "the maximum possible: %d\n", - txq_inline, max_inline); + DRV_LOG(WARNING, + "port %u txq inline is too large (%d) setting" + " it to the maximum possible: %d\n", + PORT_ID(priv), txq_inline, max_inline); txq_ctrl->txq.max_inline = max_inline / RTE_CACHE_LINE_SIZE; } @@ -730,13 +717,17 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) txq_ctrl->txq.tso_en = 1; } txq_ctrl->txq.tunnel_en = config->tunnel_en; + txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) & + txq_ctrl->txq.offloads) && config->swp; } /** * Create a DPDK Tx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * @param desc @@ -747,76 +738,80 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) * Thresholds parameters. * * @return - * A DPDK queue object on success. + * A DPDK queue object on success, NULL otherwise and rte_errno is set. */ -struct mlx5_txq_ctrl* -mlx5_priv_txq_new(struct priv *priv, uint16_t idx, uint16_t desc, - unsigned int socket, - const struct rte_eth_txconf *conf) +struct mlx5_txq_ctrl * +mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *tmpl; tmpl = rte_calloc_socket("TXQ", 1, sizeof(*tmpl) + desc * sizeof(struct rte_mbuf *), 0, socket); - if (!tmpl) + if (!tmpl) { + rte_errno = ENOMEM; return NULL; + } + if (mlx5_mr_btree_init(&tmpl->txq.mr_ctrl.cache_bh, + MLX5_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } + /* Save pointer of global generation number to check memory event. */ + tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen; assert(desc > MLX5_TX_COMP_THRESH); - tmpl->txq.offloads = conf->offloads; + tmpl->txq.offloads = conf->offloads | + dev->data->dev_conf.txmode.offloads; tmpl->priv = priv; tmpl->socket = socket; tmpl->txq.elts_n = log2above(desc); + tmpl->idx = idx; txq_set_params(tmpl); - /* MRs will be registered in mp2mr[] later. */ - DEBUG("priv->device_attr.max_qp_wr is %d", - priv->device_attr.orig_attr.max_qp_wr); - DEBUG("priv->device_attr.max_sge is %d", - priv->device_attr.orig_attr.max_sge); + DRV_LOG(DEBUG, "port %u priv->device_attr.max_qp_wr is %d", + dev->data->port_id, priv->device_attr.orig_attr.max_qp_wr); + DRV_LOG(DEBUG, "port %u priv->device_attr.max_sge is %d", + dev->data->port_id, priv->device_attr.orig_attr.max_sge); tmpl->txq.elts = (struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])(tmpl + 1); tmpl->txq.stats.idx = idx; rte_atomic32_inc(&tmpl->refcnt); - DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv, - (void *)tmpl, rte_atomic32_read(&tmpl->refcnt)); + DRV_LOG(DEBUG, "port %u Tx queue %u: refcnt %d", dev->data->port_id, + idx, rte_atomic32_read(&tmpl->refcnt)); LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next); return tmpl; +error: + rte_free(tmpl); + return NULL; } /** * Get a Tx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return * A pointer to the queue if it exists. */ -struct mlx5_txq_ctrl* -mlx5_priv_txq_get(struct priv *priv, uint16_t idx) +struct mlx5_txq_ctrl * +mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *ctrl = NULL; if ((*priv->txqs)[idx]) { ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); - unsigned int i; - - mlx5_priv_txq_ibv_get(priv, idx); - for (i = 0; i != MLX5_PMD_TX_MP_CACHE; ++i) { - struct mlx5_mr *mr = NULL; - - (void)mr; - if (ctrl->txq.mp2mr[i]) { - mr = priv_mr_get(priv, ctrl->txq.mp2mr[i]->mp); - assert(mr); - } - } + mlx5_txq_ibv_get(dev, idx); rte_atomic32_inc(&ctrl->refcnt); - DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv, - (void *)ctrl, rte_atomic32_read(&ctrl->refcnt)); + DRV_LOG(DEBUG, "port %u Tx queue %u refcnt %d", + dev->data->port_id, + ctrl->idx, rte_atomic32_read(&ctrl->refcnt)); } return ctrl; } @@ -824,57 +819,47 @@ mlx5_priv_txq_get(struct priv *priv, uint16_t idx) /** * Release a Tx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return - * 0 on success, errno on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_txq_release(struct priv *priv, uint16_t idx) +mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx) { - unsigned int i; + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *txq; size_t page_size = sysconf(_SC_PAGESIZE); if (!(*priv->txqs)[idx]) return 0; txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); - DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv, - (void *)txq, rte_atomic32_read(&txq->refcnt)); - if (txq->ibv) { - int ret; - - ret = mlx5_priv_txq_ibv_release(priv, txq->ibv); - if (!ret) - txq->ibv = NULL; - } - for (i = 0; i != MLX5_PMD_TX_MP_CACHE; ++i) { - if (txq->txq.mp2mr[i]) { - priv_mr_release(priv, txq->txq.mp2mr[i]); - txq->txq.mp2mr[i] = NULL; - } - } + DRV_LOG(DEBUG, "port %u Tx queue %u: refcnt %d", dev->data->port_id, + txq->idx, rte_atomic32_read(&txq->refcnt)); + if (txq->ibv && !mlx5_txq_ibv_release(txq->ibv)) + txq->ibv = NULL; if (priv->uar_base) munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg, page_size), page_size); if (rte_atomic32_dec_and_test(&txq->refcnt)) { txq_free_elts(txq); + mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh); LIST_REMOVE(txq, next); rte_free(txq); (*priv->txqs)[idx] = NULL; return 0; } - return EBUSY; + return 1; } /** * Verify if the queue can be released. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * @@ -882,8 +867,9 @@ mlx5_priv_txq_release(struct priv *priv, uint16_t idx) * 1 if the queue can be released. */ int -mlx5_priv_txq_releasable(struct priv *priv, uint16_t idx) +mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *txq; if (!(*priv->txqs)[idx]) @@ -895,20 +881,22 @@ mlx5_priv_txq_releasable(struct priv *priv, uint16_t idx) /** * Verify the Tx Queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_txq_verify(struct priv *priv) +mlx5_txq_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *txq; int ret = 0; LIST_FOREACH(txq, &priv->txqsctrl, next) { - DEBUG("%p: Tx Queue %p still referenced", (void *)priv, - (void *)txq); + DRV_LOG(DEBUG, "port %u Tx queue %u still referenced", + dev->data->port_id, txq->idx); ++ret; } return ret; diff --git a/drivers/net/mlx5/mlx5_utils.h b/drivers/net/mlx5/mlx5_utils.h index e1bfb9cd..886f60e6 100644 --- a/drivers/net/mlx5/mlx5_utils.h +++ b/drivers/net/mlx5/mlx5_utils.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_UTILS_H_ @@ -61,14 +61,21 @@ pmd_drv_log_basename(const char *s) return s; } +extern int mlx5_logtype; + +#define PMD_DRV_LOG___(level, ...) \ + rte_log(RTE_LOG_ ## level, \ + mlx5_logtype, \ + RTE_FMT(MLX5_DRIVER_NAME ": " \ + RTE_FMT_HEAD(__VA_ARGS__,), \ + RTE_FMT_TAIL(__VA_ARGS__,))) + /* * When debugging is enabled (NDEBUG not defined), file, line and function * information replace the driver name (MLX5_DRIVER_NAME) in log messages. */ #ifndef NDEBUG -#define PMD_DRV_LOG___(level, ...) \ - ERRNO_SAFE(RTE_LOG(level, PMD, __VA_ARGS__)) #define PMD_DRV_LOG__(level, ...) \ PMD_DRV_LOG___(level, "%s:%u: %s(): " __VA_ARGS__) #define PMD_DRV_LOG_(level, s, ...) \ @@ -80,9 +87,6 @@ pmd_drv_log_basename(const char *s) __VA_ARGS__) #else /* NDEBUG */ - -#define PMD_DRV_LOG___(level, ...) \ - ERRNO_SAFE(RTE_LOG(level, PMD, MLX5_DRIVER_NAME ": " __VA_ARGS__)) #define PMD_DRV_LOG__(level, ...) \ PMD_DRV_LOG___(level, __VA_ARGS__) #define PMD_DRV_LOG_(level, s, ...) \ @@ -91,18 +95,15 @@ pmd_drv_log_basename(const char *s) #endif /* NDEBUG */ /* Generic printf()-like logging macro with automatic line feed. */ -#define PMD_DRV_LOG(level, ...) \ +#define DRV_LOG(level, ...) \ PMD_DRV_LOG_(level, \ __VA_ARGS__ PMD_DRV_LOG_STRIP PMD_DRV_LOG_OPAREN, \ PMD_DRV_LOG_CPAREN) -/* - * Like assert(), DEBUG() becomes a no-op and claim_zero() does not perform - * any check when debugging is disabled. - */ +/* claim_zero() does not perform any check when debugging is disabled. */ #ifndef NDEBUG -#define DEBUG(...) PMD_DRV_LOG(DEBUG, __VA_ARGS__) +#define DEBUG(...) DRV_LOG(DEBUG, __VA_ARGS__) #define claim_zero(...) assert((__VA_ARGS__) == 0) #define claim_nonzero(...) assert((__VA_ARGS__) != 0) @@ -114,9 +115,9 @@ pmd_drv_log_basename(const char *s) #endif /* NDEBUG */ -#define INFO(...) PMD_DRV_LOG(INFO, __VA_ARGS__) -#define WARN(...) PMD_DRV_LOG(WARNING, __VA_ARGS__) -#define ERROR(...) PMD_DRV_LOG(ERR, __VA_ARGS__) +#define INFO(...) DRV_LOG(INFO, __VA_ARGS__) +#define WARN(...) DRV_LOG(WARNING, __VA_ARGS__) +#define ERROR(...) DRV_LOG(ERR, __VA_ARGS__) /* Convenience macros for accessing mbuf fields. */ #define NEXT(m) ((m)->next) diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c index 75c34562..c91d08be 100644 --- a/drivers/net/mlx5/mlx5_vlan.c +++ b/drivers/net/mlx5/mlx5_vlan.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -8,6 +8,12 @@ #include <assert.h> #include <stdint.h> +/* + * Not needed by this file; included to work around the lack of off_t + * definition for mlx5dv.h with unpatched rdma-core versions. + */ +#include <sys/types.h> + /* Verbs headers do not support -pedantic. */ #ifdef PEDANTIC #pragma GCC diagnostic ignored "-Wpedantic" @@ -37,26 +43,24 @@ * Toggle filter. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) { struct priv *priv = dev->data->dev_private; unsigned int i; - int ret = 0; - priv_lock(priv); - DEBUG("%p: %s VLAN filter ID %" PRIu16, - (void *)dev, (on ? "enable" : "disable"), vlan_id); + DRV_LOG(DEBUG, "port %u %s VLAN filter ID %" PRIu16, + dev->data->port_id, (on ? "enable" : "disable"), vlan_id); assert(priv->vlan_filter_n <= RTE_DIM(priv->vlan_filter)); for (i = 0; (i != priv->vlan_filter_n); ++i) if (priv->vlan_filter[i] == vlan_id) break; /* Check if there's room for another VLAN filter. */ if (i == RTE_DIM(priv->vlan_filter)) { - ret = -ENOMEM; - goto out; + rte_errno = ENOMEM; + return -rte_errno; } if (i < priv->vlan_filter_n) { assert(priv->vlan_filter_n != 0); @@ -79,37 +83,49 @@ mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) priv->vlan_filter[priv->vlan_filter_n] = vlan_id; ++priv->vlan_filter_n; } - if (dev->data->dev_started) - priv_dev_traffic_restart(priv, dev); out: - priv_unlock(priv); - return ret; + if (dev->data->dev_started) + return mlx5_traffic_restart(dev); + return 0; } /** - * Set/reset VLAN stripping for a specific queue. + * Callback to set/reset VLAN stripping for a specific queue. * - * @param priv - * Pointer to private structure. - * @param idx + * @param dev + * Pointer to Ethernet device structure. + * @param queue * RX queue index. * @param on * Enable/disable VLAN stripping. */ -static void -priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on) +void +mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on) { - struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx]; + struct priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq = (*priv->rxqs)[queue]; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); struct ibv_wq_attr mod; uint16_t vlan_offloads = (on ? IBV_WQ_FLAGS_CVLAN_STRIPPING : 0) | 0; - int err; + int ret; - DEBUG("set VLAN offloads 0x%x for port %d queue %d", - vlan_offloads, rxq->port_id, idx); + /* Validate hw support */ + if (!priv->config.hw_vlan_strip) { + DRV_LOG(ERR, "port %u VLAN stripping is not supported", + dev->data->port_id); + return; + } + /* Validate queue number */ + if (queue >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u VLAN stripping, invalid queue number %d", + dev->data->port_id, queue); + return; + } + DRV_LOG(DEBUG, "port %u set VLAN offloads 0x%x for port %uqueue %d", + dev->data->port_id, vlan_offloads, rxq->port_id, queue); if (!rxq_ctrl->ibv) { /* Update related bits in RX queue. */ rxq->vlan_strip = !!on; @@ -120,57 +136,26 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on) .flags_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING, .flags = vlan_offloads, }; - - err = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod); - if (err) { - ERROR("%p: failed to modified stripping mode: %s", - (void *)priv, strerror(err)); + ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod); + if (ret) { + DRV_LOG(ERR, "port %u failed to modified stripping mode: %s", + dev->data->port_id, strerror(rte_errno)); return; } - /* Update related bits in RX queue. */ rxq->vlan_strip = !!on; } /** - * Callback to set/reset VLAN stripping for a specific queue. - * - * @param dev - * Pointer to Ethernet device structure. - * @param queue - * RX queue index. - * @param on - * Enable/disable VLAN stripping. - */ -void -mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on) -{ - struct priv *priv = dev->data->dev_private; - - /* Validate hw support */ - if (!priv->config.hw_vlan_strip) { - ERROR("VLAN stripping is not supported"); - return; - } - - /* Validate queue number */ - if (queue >= priv->rxqs_n) { - ERROR("VLAN stripping, invalid queue number %d", queue); - return; - } - - priv_lock(priv); - priv_vlan_strip_queue_set(priv, queue, on); - priv_unlock(priv); -} - -/** * Callback to set/reset VLAN offloads for a port. * * @param dev * Pointer to Ethernet device structure. * @param mask * VLAN offload bit mask. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask) @@ -183,16 +168,13 @@ mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask) DEV_RX_OFFLOAD_VLAN_STRIP); if (!priv->config.hw_vlan_strip) { - ERROR("VLAN stripping is not supported"); + DRV_LOG(ERR, "port %u VLAN stripping is not supported", + dev->data->port_id); return 0; } - /* Run on every RX queue and set/reset VLAN stripping. */ - priv_lock(priv); for (i = 0; (i != priv->rxqs_n); i++) - priv_vlan_strip_queue_set(priv, i, hw_vlan_strip); - priv_unlock(priv); + mlx5_vlan_strip_queue_set(dev, i, hw_vlan_strip); } - return 0; } |