diff options
Diffstat (limited to 'drivers/net/mlx5')
29 files changed, 11512 insertions, 5410 deletions
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index 3bc9736c..2e70dec5 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -1,33 +1,6 @@ -# BSD LICENSE -# +# SPDX-License-Identifier: BSD-3-Clause # Copyright 2015 6WIND S.A. -# Copyright 2015 Mellanox. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of 6WIND S.A. nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright 2015 Mellanox Technologies, Ltd include $(RTE_SDK)/mk/rte.vars.mk @@ -35,7 +8,7 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_pmd_mlx5.a LIB_GLUE = $(LIB_GLUE_BASE).$(LIB_GLUE_VERSION) LIB_GLUE_BASE = librte_pmd_mlx5_glue.so -LIB_GLUE_VERSION = 18.02.0 +LIB_GLUE_VERSION = 18.05.0 # Sources. SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c @@ -59,6 +32,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl_flow.c ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y) INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE) @@ -82,6 +57,7 @@ LDLIBS += -ldl else LDLIBS += -libverbs -lmlx5 endif +LDLIBS += -lmnl LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs LDLIBS += -lrte_bus_pci @@ -92,6 +68,9 @@ CFLAGS += -Wno-error=cast-qual EXPORT_MAP := rte_pmd_mlx5_version.map LIBABIVER := 1 +# memseg walk is not part of stable API +CFLAGS += -DALLOW_EXPERIMENTAL_API + # DEBUG which is usually provided on the command-line may enable # CONFIG_RTE_LIBRTE_MLX5_DEBUG. ifeq ($(DEBUG),1) @@ -105,10 +84,6 @@ else CFLAGS += -DNDEBUG -UPEDANTIC endif -ifdef CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE -CFLAGS += -DMLX5_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE) -endif - include $(RTE_SDK)/mk/rte.lib.mk # Generate and clean-up mlx5_autoconf.h. @@ -125,9 +100,19 @@ mlx5_autoconf.h.new: FORCE mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh $Q $(RM) -f -- '$@' $Q sh -- '$<' '$@' \ - HAVE_IBV_DEVICE_VXLAN_SUPPORT \ + HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT \ + infiniband/mlx5dv.h \ + enum MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_IBV_DEVICE_TUNNEL_SUPPORT \ + infiniband/mlx5dv.h \ + enum MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_IBV_DEVICE_MPLS_SUPPORT \ infiniband/verbs.h \ - enum IBV_DEVICE_VXLAN_SUPPORT \ + enum IBV_FLOW_SPEC_MPLS \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ HAVE_IBV_WQ_FLAG_RX_END_PADDING \ @@ -135,6 +120,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum IBV_WQ_FLAG_RX_END_PADDING \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_IBV_MLX5_MOD_SWP \ + infiniband/mlx5dv.h \ + type 'struct mlx5dv_sw_parsing_caps' \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_IBV_MLX5_MOD_MPW \ infiniband/mlx5dv.h \ enum MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED \ @@ -162,7 +152,237 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh $Q sh -- '$<' '$@' \ HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT \ infiniband/verbs.h \ - enum IBV_FLOW_SPEC_ACTION_COUNT \ + type 'struct ibv_counter_set_init_attr' \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_RDMA_NL_NLDEV \ + rdma/rdma_netlink.h \ + enum RDMA_NL_NLDEV \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_RDMA_NLDEV_CMD_GET \ + rdma/rdma_netlink.h \ + enum RDMA_NLDEV_CMD_GET \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_RDMA_NLDEV_CMD_PORT_GET \ + rdma/rdma_netlink.h \ + enum RDMA_NLDEV_CMD_PORT_GET \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_RDMA_NLDEV_ATTR_DEV_INDEX \ + rdma/rdma_netlink.h \ + enum RDMA_NLDEV_ATTR_DEV_INDEX \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_RDMA_NLDEV_ATTR_DEV_NAME \ + rdma/rdma_netlink.h \ + enum RDMA_NLDEV_ATTR_DEV_NAME \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_RDMA_NLDEV_ATTR_PORT_INDEX \ + rdma/rdma_netlink.h \ + enum RDMA_NLDEV_ATTR_PORT_INDEX \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX \ + rdma/rdma_netlink.h \ + enum RDMA_NLDEV_ATTR_NDEV_INDEX \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_IFLA_PHYS_SWITCH_ID \ + linux/if_link.h \ + enum IFLA_PHYS_SWITCH_ID \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_IFLA_PHYS_PORT_NAME \ + linux/if_link.h \ + enum IFLA_PHYS_PORT_NAME \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_ACT \ + linux/pkt_cls.h \ + enum TCA_FLOWER_ACT \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_FLAGS \ + linux/pkt_cls.h \ + enum TCA_FLOWER_FLAGS \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ETH_TYPE \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ETH_TYPE \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ETH_DST \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ETH_DST \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ETH_DST_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ETH_DST_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ETH_SRC \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ETH_SRC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ETH_SRC_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IP_PROTO \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IP_PROTO \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IPV4_SRC \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IPV4_SRC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IPV4_SRC_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IPV4_DST \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IPV4_DST \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IPV4_DST_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IPV6_SRC \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IPV6_SRC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IPV6_SRC_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IPV6_DST \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IPV6_DST \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_IPV6_DST_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_TCP_SRC \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_TCP_SRC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_TCP_SRC_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_TCP_DST \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_TCP_DST \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_TCP_DST_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_TCP_DST_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_UDP_SRC \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_UDP_SRC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_UDP_SRC_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_UDP_DST \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_UDP_DST \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_UDP_DST_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_UDP_DST_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_VLAN_ID \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_VLAN_ID \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_VLAN_PRIO \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_VLAN_PRIO \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_VLAN_ETH_TYPE \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_ACT_VLAN \ + linux/tc_act/tc_vlan.h \ + enum TCA_VLAN_PUSH_VLAN_PRIORITY \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_SUPPORTED_40000baseKR4_Full \ + /usr/include/linux/ethtool.h \ + define SUPPORTED_40000baseKR4_Full \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_SUPPORTED_40000baseCR4_Full \ + /usr/include/linux/ethtool.h \ + define SUPPORTED_40000baseCR4_Full \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_SUPPORTED_40000baseSR4_Full \ + /usr/include/linux/ethtool.h \ + define SUPPORTED_40000baseSR4_Full \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_SUPPORTED_40000baseLR4_Full \ + /usr/include/linux/ethtool.h \ + define SUPPORTED_40000baseLR4_Full \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_SUPPORTED_56000baseKR4_Full \ + /usr/include/linux/ethtool.h \ + define SUPPORTED_56000baseKR4_Full \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_SUPPORTED_56000baseCR4_Full \ + /usr/include/linux/ethtool.h \ + define SUPPORTED_56000baseCR4_Full \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_SUPPORTED_56000baseSR4_Full \ + /usr/include/linux/ethtool.h \ + define SUPPORTED_56000baseSR4_Full \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_SUPPORTED_56000baseLR4_Full \ + /usr/include/linux/ethtool.h \ + define SUPPORTED_56000baseLR4_Full \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_STATIC_ASSERT \ + /usr/include/assert.h \ + define static_assert \ $(AUTOCONF_OUTPUT) # Create mlx5_autoconf.h or update it in case it differs from the new one. @@ -181,10 +401,15 @@ ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y) $(LIB): $(LIB_GLUE) +ifeq ($(LINK_USING_CC),1) +GLUE_LDFLAGS := $(call linkerprefix,$(LDFLAGS)) +else +GLUE_LDFLAGS := $(LDFLAGS) +endif $(LIB_GLUE): mlx5_glue.o - $Q $(LD) $(LDFLAGS) $(EXTRA_LDFLAGS) \ + $Q $(LD) $(GLUE_LDFLAGS) $(EXTRA_LDFLAGS) \ -Wl,-h,$(LIB_GLUE) \ - -s -shared -o $@ $< -libverbs -lmlx5 + -shared -o $@ $< -libverbs -lmlx5 mlx5_glue.o: mlx5_autoconf.h diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 6c0985bd..ec63bc6e 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -13,6 +13,8 @@ #include <errno.h> #include <net/if.h> #include <sys/mman.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> /* Verbs header. */ /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ @@ -33,6 +35,9 @@ #include <rte_config.h> #include <rte_eal_memconfig.h> #include <rte_kvargs.h> +#include <rte_rwlock.h> +#include <rte_spinlock.h> +#include <rte_string_fns.h> #include "mlx5.h" #include "mlx5_utils.h" @@ -40,10 +45,23 @@ #include "mlx5_autoconf.h" #include "mlx5_defs.h" #include "mlx5_glue.h" +#include "mlx5_mr.h" /* Device parameter to enable RX completion queue compression. */ #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" +/* Device parameter to enable Multi-Packet Rx queue. */ +#define MLX5_RX_MPRQ_EN "mprq_en" + +/* Device parameter to configure log 2 of the number of strides for MPRQ. */ +#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num" + +/* Device parameter to limit the size of memcpy'd packet for MPRQ. */ +#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len" + +/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */ +#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq" + /* Device parameter to configure inline send. */ #define MLX5_TXQ_INLINE "txq_inline" @@ -68,6 +86,15 @@ /* Device parameter to enable hardware Rx vector. */ #define MLX5_RX_VEC_EN "rx_vec_en" +/* Allow L3 VXLAN flow creation. */ +#define MLX5_L3_VXLAN_EN "l3_vxlan_en" + +/* Activate Netlink support in VF mode. */ +#define MLX5_VF_NL_EN "vf_nl_en" + +/* Select port representors to instantiate. */ +#define MLX5_REPRESENTOR "representor" + #ifndef HAVE_IBV_MLX5_MOD_MPW #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) @@ -77,6 +104,50 @@ #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4) #endif +static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data"; + +/* Shared memory between primary and secondary processes. */ +struct mlx5_shared_data *mlx5_shared_data; + +/* Spinlock for mlx5_shared_data allocation. */ +static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER; + +/** Driver-specific log messages type. */ +int mlx5_logtype; + +/** + * Prepare shared data between primary and secondary process. + */ +static void +mlx5_prepare_shared_data(void) +{ + const struct rte_memzone *mz; + + rte_spinlock_lock(&mlx5_shared_data_lock); + if (mlx5_shared_data == NULL) { + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* Allocate shared memory. */ + mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA, + sizeof(*mlx5_shared_data), + SOCKET_ID_ANY, 0); + } else { + /* Lookup allocated shared memory. */ + mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA); + } + if (mz == NULL) + rte_panic("Cannot allocate mlx5 shared data\n"); + mlx5_shared_data = mz->addr; + /* Initialize shared data. */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + LIST_INIT(&mlx5_shared_data->mem_event_cb_list); + rte_rwlock_init(&mlx5_shared_data->mem_event_rwlock); + } + rte_mem_event_callback_register("MLX5_MEM_EVENT_CB", + mlx5_mr_mem_event_cb, NULL); + } + rte_spinlock_unlock(&mlx5_shared_data_lock); +} + /** * Retrieve integer value from environment variable. * @@ -108,7 +179,7 @@ mlx5_getenv_int(const char *name) * A pointer to the callback data. * * @return - * a pointer to the allocate space. + * Allocated buffer, NULL otherwise and rte_errno is set. */ static void * mlx5_alloc_verbs_buf(size_t size, void *data) @@ -130,7 +201,8 @@ mlx5_alloc_verbs_buf(size_t size, void *data) } assert(data != NULL); ret = rte_malloc_socket(__func__, size, alignment, socket); - DEBUG("Extern alloc size: %lu, align: %lu: %p", size, alignment, ret); + if (!ret && size) + rte_errno = ENOMEM; return ret; } @@ -146,7 +218,6 @@ static void mlx5_free_verbs_buf(void *ptr, void *data __rte_unused) { assert(data != NULL); - DEBUG("Extern free request: %p", ptr); rte_free(ptr); } @@ -165,13 +236,13 @@ mlx5_dev_close(struct rte_eth_dev *dev) unsigned int i; int ret; - priv_lock(priv); - DEBUG("%p: closing device \"%s\"", - (void *)dev, - ((priv->ctx != NULL) ? priv->ctx->device->name : "")); + DRV_LOG(DEBUG, "port %u closing device \"%s\"", + dev->data->port_id, + ((priv->ctx != NULL) ? priv->ctx->device->name : "")); /* In case mlx5_dev_stop() has not been called. */ - priv_dev_interrupt_handler_uninstall(priv, dev); - priv_dev_traffic_disable(priv, dev); + mlx5_dev_interrupt_handler_uninstall(dev); + mlx5_traffic_disable(dev); + mlx5_flow_flush(dev, NULL); /* Prevent crashes when queues are still in use. */ dev->rx_pkt_burst = removed_rx_burst; dev->tx_pkt_burst = removed_tx_burst; @@ -179,7 +250,7 @@ mlx5_dev_close(struct rte_eth_dev *dev) /* XXX race condition if mlx5_rx_burst() is still running. */ usleep(1000); for (i = 0; (i != priv->rxqs_n); ++i) - mlx5_priv_rxq_release(priv, i); + mlx5_rxq_release(dev, i); priv->rxqs_n = 0; priv->rxqs = NULL; } @@ -187,10 +258,12 @@ mlx5_dev_close(struct rte_eth_dev *dev) /* XXX race condition if mlx5_tx_burst() is still running. */ usleep(1000); for (i = 0; (i != priv->txqs_n); ++i) - mlx5_priv_txq_release(priv, i); + mlx5_txq_release(dev, i); priv->txqs_n = 0; priv->txqs = NULL; } + mlx5_mprq_free_mp(dev); + mlx5_mr_release(dev); if (priv->pd != NULL) { assert(priv->ctx != NULL); claim_zero(mlx5_glue->dealloc_pd(priv->pd)); @@ -202,33 +275,64 @@ mlx5_dev_close(struct rte_eth_dev *dev) if (priv->reta_idx != NULL) rte_free(priv->reta_idx); if (priv->primary_socket) - priv_socket_uninit(priv); - ret = mlx5_priv_hrxq_ibv_verify(priv); + mlx5_socket_uninit(dev); + if (priv->config.vf) + mlx5_nl_mac_addr_flush(dev); + if (priv->nl_socket_route >= 0) + close(priv->nl_socket_route); + if (priv->nl_socket_rdma >= 0) + close(priv->nl_socket_rdma); + if (priv->mnl_socket) + mlx5_nl_flow_socket_destroy(priv->mnl_socket); + ret = mlx5_hrxq_ibv_verify(dev); if (ret) - WARN("%p: some Hash Rx queue still remain", (void *)priv); - ret = mlx5_priv_ind_table_ibv_verify(priv); + DRV_LOG(WARNING, "port %u some hash Rx queue still remain", + dev->data->port_id); + ret = mlx5_ind_table_ibv_verify(dev); if (ret) - WARN("%p: some Indirection table still remain", (void *)priv); - ret = mlx5_priv_rxq_ibv_verify(priv); + DRV_LOG(WARNING, "port %u some indirection table still remain", + dev->data->port_id); + ret = mlx5_rxq_ibv_verify(dev); if (ret) - WARN("%p: some Verbs Rx queue still remain", (void *)priv); - ret = mlx5_priv_rxq_verify(priv); + DRV_LOG(WARNING, "port %u some Verbs Rx queue still remain", + dev->data->port_id); + ret = mlx5_rxq_verify(dev); if (ret) - WARN("%p: some Rx Queues still remain", (void *)priv); - ret = mlx5_priv_txq_ibv_verify(priv); + DRV_LOG(WARNING, "port %u some Rx queues still remain", + dev->data->port_id); + ret = mlx5_txq_ibv_verify(dev); if (ret) - WARN("%p: some Verbs Tx queue still remain", (void *)priv); - ret = mlx5_priv_txq_verify(priv); + DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain", + dev->data->port_id); + ret = mlx5_txq_verify(dev); if (ret) - WARN("%p: some Tx Queues still remain", (void *)priv); - ret = priv_flow_verify(priv); + DRV_LOG(WARNING, "port %u some Tx queues still remain", + dev->data->port_id); + ret = mlx5_flow_verify(dev); if (ret) - WARN("%p: some flows still remain", (void *)priv); - ret = priv_mr_verify(priv); - if (ret) - WARN("%p: some Memory Region still remain", (void *)priv); - priv_unlock(priv); + DRV_LOG(WARNING, "port %u some flows still remain", + dev->data->port_id); + if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { + unsigned int c = 0; + unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0); + uint16_t port_id[i]; + + i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i); + while (i--) { + struct priv *opriv = + rte_eth_devices[port_id[i]].data->dev_private; + + if (!opriv || + opriv->domain_id != priv->domain_id || + &rte_eth_devices[port_id[i]] == dev) + continue; + ++c; + } + if (!c) + claim_zero(rte_eth_switch_domain_free(priv->domain_id)); + } memset(priv, 0, sizeof(*priv)); + priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; } const struct eth_dev_ops mlx5_dev_ops = { @@ -260,6 +364,7 @@ const struct eth_dev_ops mlx5_dev_ops = { .mac_addr_remove = mlx5_mac_addr_remove, .mac_addr_add = mlx5_mac_addr_add, .mac_addr_set = mlx5_mac_addr_set, + .set_mc_addr_list = mlx5_set_mc_addr_list, .mtu_set = mlx5_dev_set_mtu, .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, .vlan_offload_set = mlx5_vlan_offload_set, @@ -294,6 +399,10 @@ const struct eth_dev_ops mlx5_dev_ops_isolate = { .dev_set_link_down = mlx5_set_link_down, .dev_set_link_up = mlx5_set_link_up, .dev_close = mlx5_dev_close, + .promiscuous_enable = mlx5_promiscuous_enable, + .promiscuous_disable = mlx5_promiscuous_disable, + .allmulticast_enable = mlx5_allmulticast_enable, + .allmulticast_disable = mlx5_allmulticast_disable, .link_update = mlx5_link_update, .stats_get = mlx5_stats_get, .stats_reset = mlx5_stats_reset, @@ -312,6 +421,7 @@ const struct eth_dev_ops mlx5_dev_ops_isolate = { .mac_addr_remove = mlx5_mac_addr_remove, .mac_addr_add = mlx5_mac_addr_add, .mac_addr_set = mlx5_mac_addr_set, + .set_mc_addr_list = mlx5_set_mc_addr_list, .mtu_set = mlx5_dev_set_mtu, .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, .vlan_offload_set = mlx5_vlan_offload_set, @@ -323,39 +433,6 @@ const struct eth_dev_ops mlx5_dev_ops_isolate = { .is_removed = mlx5_is_removed, }; -static struct { - struct rte_pci_addr pci_addr; /* associated PCI address */ - uint32_t ports; /* physical ports bitfield. */ -} mlx5_dev[32]; - -/** - * Get device index in mlx5_dev[] from PCI bus address. - * - * @param[in] pci_addr - * PCI bus address to look for. - * - * @return - * mlx5_dev[] index on success, -1 on failure. - */ -static int -mlx5_dev_idx(struct rte_pci_addr *pci_addr) -{ - unsigned int i; - int ret = -1; - - assert(pci_addr != NULL); - for (i = 0; (i != RTE_DIM(mlx5_dev)); ++i) { - if ((mlx5_dev[i].pci_addr.domain == pci_addr->domain) && - (mlx5_dev[i].pci_addr.bus == pci_addr->bus) && - (mlx5_dev[i].pci_addr.devid == pci_addr->devid) && - (mlx5_dev[i].pci_addr.function == pci_addr->function)) - return i; - if ((mlx5_dev[i].ports == 0) && (ret == -1)) - ret = i; - } - return ret; -} - /** * Verify and store value for device argument. * @@ -367,7 +444,7 @@ mlx5_dev_idx(struct rte_pci_addr *pci_addr) * User data. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_args_check(const char *key, const char *val, void *opaque) @@ -375,14 +452,26 @@ mlx5_args_check(const char *key, const char *val, void *opaque) struct mlx5_dev_config *config = opaque; unsigned long tmp; + /* No-op, port representors are processed in mlx5_dev_spawn(). */ + if (!strcmp(MLX5_REPRESENTOR, key)) + return 0; errno = 0; tmp = strtoul(val, NULL, 0); if (errno) { - WARN("%s: \"%s\" is not a valid integer", key, val); - return errno; + rte_errno = errno; + DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val); + return -rte_errno; } if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { config->cqe_comp = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) { + config->mprq.enabled = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) { + config->mprq.stride_num_n = tmp; + } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) { + config->mprq.max_memcpy_len = tmp; + } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) { + config->mprq.min_rxqs_num = tmp; } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { config->txq_inline = tmp; } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { @@ -397,9 +486,14 @@ mlx5_args_check(const char *key, const char *val, void *opaque) config->tx_vec_en = !!tmp; } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { config->rx_vec_en = !!tmp; + } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) { + config->l3_vxlan_en = !!tmp; + } else if (strcmp(MLX5_VF_NL_EN, key) == 0) { + config->vf_nl_en = !!tmp; } else { - WARN("%s: unknown parameter", key); - return -EINVAL; + DRV_LOG(WARNING, "%s: unknown parameter", key); + rte_errno = EINVAL; + return -rte_errno; } return 0; } @@ -413,13 +507,17 @@ mlx5_args_check(const char *key, const char *val, void *opaque) * Device arguments structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) { const char **params = (const char *[]){ MLX5_RXQ_CQE_COMP_EN, + MLX5_RX_MPRQ_EN, + MLX5_RX_MPRQ_LOG_STRIDE_NUM, + MLX5_RX_MPRQ_MAX_MEMCPY_LEN, + MLX5_RXQS_MIN_MPRQ, MLX5_TXQ_INLINE, MLX5_TXQS_MIN_INLINE, MLX5_TXQ_MPW_EN, @@ -427,6 +525,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) MLX5_TXQ_MAX_INLINE_LEN, MLX5_TX_VEC_EN, MLX5_RX_VEC_EN, + MLX5_L3_VXLAN_EN, + MLX5_VF_NL_EN, + MLX5_REPRESENTOR, NULL, }; struct rte_kvargs *kvlist; @@ -444,9 +545,10 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) if (rte_kvargs_count(kvlist, params[i])) { ret = rte_kvargs_process(kvlist, params[i], mlx5_args_check, config); - if (ret != 0) { + if (ret) { + rte_errno = EINVAL; rte_kvargs_free(kvlist); - return ret; + return -rte_errno; } } } @@ -465,50 +567,60 @@ static struct rte_pci_driver mlx5_driver; */ static void *uar_base; +static int +find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + void **addr = arg; + + if (*addr == NULL) + *addr = ms->addr; + else + *addr = RTE_MIN(*addr, ms->addr); + + return 0; +} + /** * Reserve UAR address space for primary process. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_uar_init_primary(struct priv *priv) +mlx5_uar_init_primary(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; void *addr = (void *)0; - int i; - const struct rte_mem_config *mcfg; - int ret; if (uar_base) { /* UAR address space mapped. */ priv->uar_base = uar_base; return 0; } /* find out lower bound of hugepage segments */ - mcfg = rte_eal_get_configuration()->mem_config; - for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) { - if (addr) - addr = RTE_MIN(addr, mcfg->memseg[i].addr); - else - addr = mcfg->memseg[i].addr; - } + rte_memseg_walk(find_lower_va_bound, &addr); + /* keep distance to hugepages to minimize potential conflicts. */ - addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE); + addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX5_UAR_OFFSET + MLX5_UAR_SIZE)); /* anonymous mmap, no real memory consumption. */ addr = mmap(addr, MLX5_UAR_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { - ERROR("Failed to reserve UAR address space, please adjust " - "MLX5_UAR_SIZE or try --base-virtaddr"); - ret = ENOMEM; - return ret; + DRV_LOG(ERR, + "port %u failed to reserve UAR address space, please" + " adjust MLX5_UAR_SIZE or try --base-virtaddr", + dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; } /* Accept either same addr or a new addr returned from mmap if target * range occupied. */ - INFO("Reserved UAR address space: %p", addr); + DRV_LOG(INFO, "port %u reserved UAR address space: %p", + dev->data->port_id, addr); priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */ uar_base = addr; /* process local, don't reserve again. */ return 0; @@ -518,17 +630,17 @@ priv_uar_init_primary(struct priv *priv) * Reserve UAR address space for secondary process, align with * primary process. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_uar_init_secondary(struct priv *priv) +mlx5_uar_init_secondary(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; void *addr; - int ret; assert(priv->uar_base); if (uar_base) { /* already reserved. */ @@ -539,467 +651,802 @@ priv_uar_init_secondary(struct priv *priv) addr = mmap(priv->uar_base, MLX5_UAR_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) { - ERROR("UAR mmap failed: %p size: %llu", - priv->uar_base, MLX5_UAR_SIZE); - ret = ENXIO; - return ret; + DRV_LOG(ERR, "port %u UAR mmap failed: %p size: %llu", + dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE); + rte_errno = ENXIO; + return -rte_errno; } if (priv->uar_base != addr) { - ERROR("UAR address %p size %llu occupied, please adjust " - "MLX5_UAR_OFFSET or try EAL parameter --base-virtaddr", - priv->uar_base, MLX5_UAR_SIZE); - ret = ENXIO; - return ret; + DRV_LOG(ERR, + "port %u UAR address %p size %llu occupied, please" + " adjust MLX5_UAR_OFFSET or try EAL parameter" + " --base-virtaddr", + dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE); + rte_errno = ENXIO; + return -rte_errno; } uar_base = addr; /* process local, don't reserve again */ - INFO("Reserved UAR address space: %p", addr); + DRV_LOG(INFO, "port %u reserved UAR address space: %p", + dev->data->port_id, addr); return 0; } /** - * DPDK callback to register a PCI device. + * Spawn an Ethernet device from Verbs information. * - * This function creates an Ethernet device for each port of a given - * PCI device. - * - * @param[in] pci_drv - * PCI driver structure (mlx5_driver). - * @param[in] pci_dev - * PCI device information. + * @param dpdk_dev + * Backing DPDK device. + * @param ibv_dev + * Verbs device. + * @param vf + * If nonzero, enable VF-specific features. + * @param[in] switch_info + * Switch properties of Ethernet device. * * @return - * 0 on success, negative errno value on failure. + * A valid Ethernet device object on success, NULL otherwise and rte_errno + * is set. The following error is defined: + * + * EBUSY: device is not supposed to be spawned. */ -static int -mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) +static struct rte_eth_dev * +mlx5_dev_spawn(struct rte_device *dpdk_dev, + struct ibv_device *ibv_dev, + int vf, + const struct mlx5_switch_info *switch_info) { - struct ibv_device **list; - struct ibv_device *ibv_dev; + struct ibv_context *ctx; + struct ibv_device_attr_ex attr; + struct ibv_port_attr port_attr; + struct ibv_pd *pd = NULL; + struct mlx5dv_context dv_attr = { .comp_mask = 0 }; + struct mlx5_dev_config config = { + .vf = !!vf, + .tx_vec_en = 1, + .rx_vec_en = 1, + .mpw_hdr_dseg = 0, + .txq_inline = MLX5_ARG_UNSET, + .txqs_inline = MLX5_ARG_UNSET, + .inline_max_packet_sz = MLX5_ARG_UNSET, + .vf_nl_en = 1, + .mprq = { + .enabled = 0, + .stride_num_n = MLX5_MPRQ_STRIDE_NUM_N, + .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, + .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, + }, + }; + struct rte_eth_dev *eth_dev = NULL; + struct priv *priv = NULL; int err = 0; - struct ibv_context *attr_ctx = NULL; - struct ibv_device_attr_ex device_attr; - unsigned int sriov; unsigned int mps; unsigned int cqe_comp; unsigned int tunnel_en = 0; - int idx; - int i; - struct mlx5dv_context attrs_out; + unsigned int mpls_en = 0; + unsigned int swp = 0; + unsigned int mprq = 0; + unsigned int mprq_min_stride_size_n = 0; + unsigned int mprq_max_stride_size_n = 0; + unsigned int mprq_min_stride_num_n = 0; + unsigned int mprq_max_stride_num_n = 0; #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - struct ibv_counter_set_description cs_desc; + struct ibv_counter_set_description cs_desc = { .counter_type = 0 }; #endif + struct ether_addr mac; + char name[RTE_ETH_NAME_MAX_LEN]; + int own_domain_id = 0; + unsigned int i; - (void)pci_drv; - assert(pci_drv == &mlx5_driver); - /* Get mlx5_dev[] index. */ - idx = mlx5_dev_idx(&pci_dev->addr); - if (idx == -1) { - ERROR("this driver cannot support any more adapters"); - return -ENOMEM; - } - DEBUG("using driver device index %d", idx); - - /* Save PCI address. */ - mlx5_dev[idx].pci_addr = pci_dev->addr; - list = mlx5_glue->get_device_list(&i); - if (list == NULL) { - assert(errno); - if (errno == ENOSYS) - ERROR("cannot list devices, is ib_uverbs loaded?"); - return -errno; - } - assert(i >= 0); - /* - * For each listed device, check related sysfs entry against - * the provided PCI ID. - */ - while (i != 0) { - struct rte_pci_addr pci_addr; + /* Determine if this port representor is supposed to be spawned. */ + if (switch_info->representor && dpdk_dev->devargs) { + struct rte_eth_devargs eth_da; - --i; - DEBUG("checking device \"%s\"", list[i]->name); - if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr)) - continue; - if ((pci_dev->addr.domain != pci_addr.domain) || - (pci_dev->addr.bus != pci_addr.bus) || - (pci_dev->addr.devid != pci_addr.devid) || - (pci_dev->addr.function != pci_addr.function)) - continue; - sriov = ((pci_dev->id.device_id == - PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || - (pci_dev->id.device_id == - PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) || - (pci_dev->id.device_id == - PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) || - (pci_dev->id.device_id == - PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)); - switch (pci_dev->id.device_id) { - case PCI_DEVICE_ID_MELLANOX_CONNECTX4: - tunnel_en = 1; - break; - case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX: - case PCI_DEVICE_ID_MELLANOX_CONNECTX5: - case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: - case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX: - case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: - tunnel_en = 1; - break; - default: - break; + err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); + if (err) { + rte_errno = -err; + DRV_LOG(ERR, "failed to process device arguments: %s", + strerror(rte_errno)); + return NULL; } - INFO("PCI information matches, using device \"%s\"" - " (SR-IOV: %s)", - list[i]->name, - sriov ? "true" : "false"); - attr_ctx = mlx5_glue->open_device(list[i]); - err = errno; - break; - } - if (attr_ctx == NULL) { - mlx5_glue->free_device_list(list); - switch (err) { - case 0: - ERROR("cannot access device, is mlx5_ib loaded?"); - return -ENODEV; - case EINVAL: - ERROR("cannot use device, are drivers up to date?"); - return -EINVAL; + for (i = 0; i < eth_da.nb_representor_ports; ++i) + if (eth_da.representor_ports[i] == + (uint16_t)switch_info->port_name) + break; + if (i == eth_da.nb_representor_ports) { + rte_errno = EBUSY; + return NULL; } - assert(err > 0); - return -err; } - ibv_dev = list[i]; - - DEBUG("device opened"); + /* Prepare shared data between primary and secondary process. */ + mlx5_prepare_shared_data(); + errno = 0; + ctx = mlx5_glue->open_device(ibv_dev); + if (!ctx) { + rte_errno = errno ? errno : ENODEV; + return NULL; + } +#ifdef HAVE_IBV_MLX5_MOD_SWP + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; +#endif /* * Multi-packet send is supported by ConnectX-4 Lx PF as well * as all ConnectX-5 devices. */ - mlx5_glue->dv_query_device(attr_ctx, &attrs_out); - if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { - if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { - DEBUG("Enhanced MPW is supported"); +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; +#endif +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; +#endif + mlx5_glue->dv_query_device(ctx, &dv_attr); + if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { + if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { + DRV_LOG(DEBUG, "enhanced MPW is supported"); mps = MLX5_MPW_ENHANCED; } else { - DEBUG("MPW is supported"); + DRV_LOG(DEBUG, "MPW is supported"); mps = MLX5_MPW; } } else { - DEBUG("MPW isn't supported"); + DRV_LOG(DEBUG, "MPW isn't supported"); mps = MLX5_MPW_DISABLED; } + config.mps = mps; +#ifdef HAVE_IBV_MLX5_MOD_SWP + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) + swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; + DRV_LOG(DEBUG, "SWP support: %u", swp); +#endif + config.swp = !!swp; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { + struct mlx5dv_striding_rq_caps mprq_caps = + dv_attr.striding_rq_caps; + + DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", + mprq_caps.min_single_stride_log_num_of_bytes); + DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", + mprq_caps.max_single_stride_log_num_of_bytes); + DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", + mprq_caps.min_single_wqe_log_num_of_strides); + DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", + mprq_caps.max_single_wqe_log_num_of_strides); + DRV_LOG(DEBUG, "\tsupported_qpts: %d", + mprq_caps.supported_qpts); + DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); + mprq = 1; + mprq_min_stride_size_n = + mprq_caps.min_single_stride_log_num_of_bytes; + mprq_max_stride_size_n = + mprq_caps.max_single_stride_log_num_of_bytes; + mprq_min_stride_num_n = + mprq_caps.min_single_wqe_log_num_of_strides; + mprq_max_stride_num_n = + mprq_caps.max_single_wqe_log_num_of_strides; + config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, + mprq_min_stride_num_n); + } +#endif if (RTE_CACHE_LINE_SIZE == 128 && - !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) + !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) cqe_comp = 0; else cqe_comp = 1; - if (mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr)) + config.cqe_comp = cqe_comp; +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { + tunnel_en = ((dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && + (dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE)); + } + DRV_LOG(DEBUG, "tunnel offloading is %ssupported", + tunnel_en ? "" : "not "); +#else + DRV_LOG(WARNING, + "tunnel offloading disabled due to old OFED/rdma-core version"); +#endif + config.tunnel_en = tunnel_en; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + mpls_en = ((dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && + (dv_attr.tunnel_offloads_caps & + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); + DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", + mpls_en ? "" : "not "); +#else + DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" + " old OFED/rdma-core version or firmware configuration"); +#endif + config.mpls_en = mpls_en; + err = mlx5_glue->query_device_ex(ctx, NULL, &attr); + if (err) { + DEBUG("ibv_query_device_ex() failed"); goto error; - INFO("%u port(s) detected", device_attr.orig_attr.phys_port_cnt); - - for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) { - char name[RTE_ETH_NAME_MAX_LEN]; - int len; - uint32_t port = i + 1; /* ports are indexed from one */ - uint32_t test = (1 << i); - struct ibv_context *ctx = NULL; - struct ibv_port_attr port_attr; - struct ibv_pd *pd = NULL; - struct priv *priv = NULL; - struct rte_eth_dev *eth_dev; - struct ibv_device_attr_ex device_attr_ex; - struct ether_addr mac; - uint16_t num_vfs = 0; - struct ibv_device_attr_ex device_attr; - struct mlx5_dev_config config = { - .cqe_comp = cqe_comp, - .mps = mps, - .tunnel_en = tunnel_en, - .tx_vec_en = 1, - .rx_vec_en = 1, - .mpw_hdr_dseg = 0, - .txq_inline = MLX5_ARG_UNSET, - .txqs_inline = MLX5_ARG_UNSET, - .inline_max_packet_sz = MLX5_ARG_UNSET, - }; - - len = snprintf(name, sizeof(name), PCI_PRI_FMT, - pci_dev->addr.domain, pci_dev->addr.bus, - pci_dev->addr.devid, pci_dev->addr.function); - if (device_attr.orig_attr.phys_port_cnt > 1) - snprintf(name + len, sizeof(name), " port %u", i); - - mlx5_dev[idx].ports |= test; - - if (rte_eal_process_type() == RTE_PROC_SECONDARY) { - eth_dev = rte_eth_dev_attach_secondary(name); - if (eth_dev == NULL) { - ERROR("can not attach rte ethdev"); - err = ENOMEM; - goto error; - } - eth_dev->device = &pci_dev->device; - eth_dev->dev_ops = &mlx5_dev_sec_ops; - priv = eth_dev->data->dev_private; - err = priv_uar_init_secondary(priv); - if (err < 0) { - err = -err; - goto error; - } - /* Receive command fd from primary process */ - err = priv_socket_connect(priv); - if (err < 0) { - err = -err; - goto error; - } - /* Remap UAR for Tx queues. */ - err = priv_tx_uar_remap(priv, err); - if (err) - goto error; - /* - * Ethdev pointer is still required as input since - * the primary device is not accessible from the - * secondary process. - */ - eth_dev->rx_pkt_burst = - priv_select_rx_function(priv, eth_dev); - eth_dev->tx_pkt_burst = - priv_select_tx_function(priv, eth_dev); - continue; - } - - DEBUG("using port %u (%08" PRIx32 ")", port, test); - - ctx = mlx5_glue->open_device(ibv_dev); - if (ctx == NULL) { - err = ENODEV; - goto port_error; + } + if (!switch_info->representor) + rte_strlcpy(name, dpdk_dev->name, sizeof(name)); + else + snprintf(name, sizeof(name), "%s_representor_%u", + dpdk_dev->name, switch_info->port_name); + DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + eth_dev = rte_eth_dev_attach_secondary(name); + if (eth_dev == NULL) { + DRV_LOG(ERR, "can not attach rte ethdev"); + rte_errno = ENOMEM; + err = rte_errno; + goto error; } - - mlx5_glue->query_device_ex(ctx, NULL, &device_attr); - /* Check port status. */ - err = mlx5_glue->query_port(ctx, port, &port_attr); + eth_dev->device = dpdk_dev; + eth_dev->dev_ops = &mlx5_dev_sec_ops; + err = mlx5_uar_init_secondary(eth_dev); if (err) { - ERROR("port query failed: %s", strerror(err)); - goto port_error; + err = rte_errno; + goto error; } - - if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { - ERROR("port %d is not configured in Ethernet mode", - port); - err = EINVAL; - goto port_error; + /* Receive command fd from primary process */ + err = mlx5_socket_connect(eth_dev); + if (err < 0) { + err = rte_errno; + goto error; } - - if (port_attr.state != IBV_PORT_ACTIVE) - DEBUG("port %d is not active: \"%s\" (%d)", - port, mlx5_glue->port_state_str(port_attr.state), - port_attr.state); - - /* Allocate protection domain. */ - pd = mlx5_glue->alloc_pd(ctx); - if (pd == NULL) { - ERROR("PD allocation failure"); - err = ENOMEM; - goto port_error; + /* Remap UAR for Tx queues. */ + err = mlx5_tx_uar_remap(eth_dev, err); + if (err) { + err = rte_errno; + goto error; } - - mlx5_dev[idx].ports |= test; - - /* from rte_ethdev.c */ - priv = rte_zmalloc("ethdev private structure", - sizeof(*priv), - RTE_CACHE_LINE_SIZE); - if (priv == NULL) { - ERROR("priv allocation failure"); - err = ENOMEM; - goto port_error; + /* + * Ethdev pointer is still required as input since + * the primary device is not accessible from the + * secondary process. + */ + eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); + eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); + claim_zero(mlx5_glue->close_device(ctx)); + return eth_dev; + } + /* Check port status. */ + err = mlx5_glue->query_port(ctx, 1, &port_attr); + if (err) { + DRV_LOG(ERR, "port query failed: %s", strerror(err)); + goto error; + } + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + DRV_LOG(ERR, "port is not configured in Ethernet mode"); + err = EINVAL; + goto error; + } + if (port_attr.state != IBV_PORT_ACTIVE) + DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", + mlx5_glue->port_state_str(port_attr.state), + port_attr.state); + /* Allocate protection domain. */ + pd = mlx5_glue->alloc_pd(ctx); + if (pd == NULL) { + DRV_LOG(ERR, "PD allocation failure"); + err = ENOMEM; + goto error; + } + priv = rte_zmalloc("ethdev private structure", + sizeof(*priv), + RTE_CACHE_LINE_SIZE); + if (priv == NULL) { + DRV_LOG(ERR, "priv allocation failure"); + err = ENOMEM; + goto error; + } + priv->ctx = ctx; + strncpy(priv->ibdev_name, priv->ctx->device->name, + sizeof(priv->ibdev_name)); + strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path, + sizeof(priv->ibdev_path)); + priv->device_attr = attr; + priv->pd = pd; + priv->mtu = ETHER_MTU; +#ifndef RTE_ARCH_64 + /* Initialize UAR access locks for 32bit implementations. */ + rte_spinlock_init(&priv->uar_lock_cq); + for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) + rte_spinlock_init(&priv->uar_lock[i]); +#endif + /* Some internal functions rely on Netlink sockets, open them now. */ + priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); + priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); + priv->nl_sn = 0; + priv->representor = !!switch_info->representor; + priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + priv->representor_id = + switch_info->representor ? switch_info->port_name : -1; + /* + * Look for sibling devices in order to reuse their switch domain + * if any, otherwise allocate one. + */ + i = mlx5_dev_to_port_id(dpdk_dev, NULL, 0); + if (i > 0) { + uint16_t port_id[i]; + + i = RTE_MIN(mlx5_dev_to_port_id(dpdk_dev, port_id, i), i); + while (i--) { + const struct priv *opriv = + rte_eth_devices[port_id[i]].data->dev_private; + + if (!opriv || + opriv->domain_id == + RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) + continue; + priv->domain_id = opriv->domain_id; + break; } - - priv->ctx = ctx; - strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path, - sizeof(priv->ibdev_path)); - priv->device_attr = device_attr; - priv->port = port; - priv->pd = pd; - priv->mtu = ETHER_MTU; - err = mlx5_args(&config, pci_dev->device.devargs); + } + if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { + err = rte_eth_switch_domain_alloc(&priv->domain_id); if (err) { - ERROR("failed to process device arguments: %s", - strerror(err)); - goto port_error; + err = rte_errno; + DRV_LOG(ERR, "unable to allocate switch domain: %s", + strerror(rte_errno)); + goto error; } - if (mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex)) { - ERROR("ibv_query_device_ex() failed"); - goto port_error; - } - - config.hw_csum = !!(device_attr_ex.device_cap_flags_ex & - IBV_DEVICE_RAW_IP_CSUM); - DEBUG("checksum offloading is %ssupported", - (config.hw_csum ? "" : "not ")); - -#ifdef HAVE_IBV_DEVICE_VXLAN_SUPPORT - config.hw_csum_l2tun = - !!(exp_device_attr.exp_device_cap_flags & - IBV_DEVICE_VXLAN_SUPPORT); -#endif - DEBUG("Rx L2 tunnel checksum offloads are %ssupported", - (config.hw_csum_l2tun ? "" : "not ")); - + own_domain_id = 1; + } + err = mlx5_args(&config, dpdk_dev->devargs); + if (err) { + err = rte_errno; + DRV_LOG(ERR, "failed to process device arguments: %s", + strerror(rte_errno)); + goto error; + } + config.hw_csum = !!(attr.device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM); + DRV_LOG(DEBUG, "checksum offloading is %ssupported", + (config.hw_csum ? "" : "not ")); #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - config.flow_counter_en = !!(device_attr.max_counter_sets); - mlx5_glue->describe_counter_set(ctx, 0, &cs_desc); - DEBUG("counter type = %d, num of cs = %ld, attributes = %d", - cs_desc.counter_type, cs_desc.num_of_cs, - cs_desc.attributes); + config.flow_counter_en = !!attr.max_counter_sets; + mlx5_glue->describe_counter_set(ctx, 0, &cs_desc); + DRV_LOG(DEBUG, "counter type = %d, num of cs = %ld, attributes = %d", + cs_desc.counter_type, cs_desc.num_of_cs, + cs_desc.attributes); #endif - config.ind_table_max_size = - device_attr_ex.rss_caps.max_rwq_indirection_table_size; - /* Remove this check once DPDK supports larger/variable - * indirection tables. */ - if (config.ind_table_max_size > - (unsigned int)ETH_RSS_RETA_SIZE_512) - config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; - DEBUG("maximum RX indirection table size is %u", - config.ind_table_max_size); - config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps & - IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); - DEBUG("VLAN stripping is %ssupported", - (config.hw_vlan_strip ? "" : "not ")); - - config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps & - IBV_RAW_PACKET_CAP_SCATTER_FCS); - DEBUG("FCS stripping configuration is %ssupported", - (config.hw_fcs_strip ? "" : "not ")); - + config.ind_table_max_size = + attr.rss_caps.max_rwq_indirection_table_size; + /* + * Remove this check once DPDK supports larger/variable + * indirection tables. + */ + if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) + config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; + DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", + config.ind_table_max_size); + config.hw_vlan_strip = !!(attr.raw_packet_caps & + IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); + DRV_LOG(DEBUG, "VLAN stripping is %ssupported", + (config.hw_vlan_strip ? "" : "not ")); + config.hw_fcs_strip = !!(attr.raw_packet_caps & + IBV_RAW_PACKET_CAP_SCATTER_FCS); + DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", + (config.hw_fcs_strip ? "" : "not ")); #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING - config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align; + config.hw_padding = !!attr.rx_pad_end_addr_align; #endif - DEBUG("hardware RX end alignment padding is %ssupported", - (config.hw_padding ? "" : "not ")); - - priv_get_num_vfs(priv, &num_vfs); - config.sriov = (num_vfs || sriov); - config.tso = ((device_attr_ex.tso_caps.max_tso > 0) && - (device_attr_ex.tso_caps.supported_qpts & - (1 << IBV_QPT_RAW_PACKET))); - if (config.tso) - config.tso_max_payload_sz = - device_attr_ex.tso_caps.max_tso; - if (config.mps && !mps) { - ERROR("multi-packet send not supported on this device" - " (" MLX5_TXQ_MPW_EN ")"); - err = ENOTSUP; - goto port_error; - } - INFO("%sMPS is %s", - config.mps == MLX5_MPW_ENHANCED ? "Enhanced " : "", - config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); - if (config.cqe_comp && !cqe_comp) { - WARN("Rx CQE compression isn't supported"); - config.cqe_comp = 0; - } - err = priv_uar_init_primary(priv); - if (err) - goto port_error; - /* Configure the first MAC address by default. */ - if (priv_get_mac(priv, &mac.addr_bytes)) { - ERROR("cannot get MAC address, is mlx5_en loaded?" - " (errno: %s)", strerror(errno)); - err = ENODEV; - goto port_error; + DRV_LOG(DEBUG, "hardware Rx end alignment padding is %ssupported", + (config.hw_padding ? "" : "not ")); + config.tso = (attr.tso_caps.max_tso > 0 && + (attr.tso_caps.supported_qpts & + (1 << IBV_QPT_RAW_PACKET))); + if (config.tso) + config.tso_max_payload_sz = attr.tso_caps.max_tso; + if (config.mps && !mps) { + DRV_LOG(ERR, + "multi-packet send not supported on this device" + " (" MLX5_TXQ_MPW_EN ")"); + err = ENOTSUP; + goto error; + } + DRV_LOG(INFO, "%sMPS is %s", + config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "", + config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); + if (config.cqe_comp && !cqe_comp) { + DRV_LOG(WARNING, "Rx CQE compression isn't supported"); + config.cqe_comp = 0; + } + if (config.mprq.enabled && mprq) { + if (config.mprq.stride_num_n > mprq_max_stride_num_n || + config.mprq.stride_num_n < mprq_min_stride_num_n) { + config.mprq.stride_num_n = + RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, + mprq_min_stride_num_n); + DRV_LOG(WARNING, + "the number of strides" + " for Multi-Packet RQ is out of range," + " setting default value (%u)", + 1 << config.mprq.stride_num_n); } - INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", - priv->port, - mac.addr_bytes[0], mac.addr_bytes[1], - mac.addr_bytes[2], mac.addr_bytes[3], - mac.addr_bytes[4], mac.addr_bytes[5]); + config.mprq.min_stride_size_n = mprq_min_stride_size_n; + config.mprq.max_stride_size_n = mprq_max_stride_size_n; + } else if (config.mprq.enabled && !mprq) { + DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); + config.mprq.enabled = 0; + } + eth_dev = rte_eth_dev_allocate(name); + if (eth_dev == NULL) { + DRV_LOG(ERR, "can not allocate rte ethdev"); + err = ENOMEM; + goto error; + } + if (priv->representor) + eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; + eth_dev->data->dev_private = priv; + priv->dev_data = eth_dev->data; + eth_dev->data->mac_addrs = priv->mac; + eth_dev->device = dpdk_dev; + eth_dev->device->driver = &mlx5_driver.driver; + err = mlx5_uar_init_primary(eth_dev); + if (err) { + err = rte_errno; + goto error; + } + /* Configure the first MAC address by default. */ + if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { + DRV_LOG(ERR, + "port %u cannot get MAC address, is mlx5_en" + " loaded? (errno: %s)", + eth_dev->data->port_id, strerror(rte_errno)); + err = ENODEV; + goto error; + } + DRV_LOG(INFO, + "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", + eth_dev->data->port_id, + mac.addr_bytes[0], mac.addr_bytes[1], + mac.addr_bytes[2], mac.addr_bytes[3], + mac.addr_bytes[4], mac.addr_bytes[5]); #ifndef NDEBUG - { - char ifname[IF_NAMESIZE]; - - if (priv_get_ifname(priv, &ifname) == 0) - DEBUG("port %u ifname is \"%s\"", - priv->port, ifname); - else - DEBUG("port %u ifname is unknown", priv->port); - } + { + char ifname[IF_NAMESIZE]; + + if (mlx5_get_ifname(eth_dev, &ifname) == 0) + DRV_LOG(DEBUG, "port %u ifname is \"%s\"", + eth_dev->data->port_id, ifname); + else + DRV_LOG(DEBUG, "port %u ifname is unknown", + eth_dev->data->port_id); + } #endif - /* Get actual MTU if possible. */ - priv_get_mtu(priv, &priv->mtu); - DEBUG("port %u MTU is %u", priv->port, priv->mtu); + /* Get actual MTU if possible. */ + err = mlx5_get_mtu(eth_dev, &priv->mtu); + if (err) { + err = rte_errno; + goto error; + } + DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, + priv->mtu); + /* Initialize burst functions to prevent crashes before link-up. */ + eth_dev->rx_pkt_burst = removed_rx_burst; + eth_dev->tx_pkt_burst = removed_tx_burst; + eth_dev->dev_ops = &mlx5_dev_ops; + /* Register MAC address. */ + claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); + if (vf && config.vf_nl_en) + mlx5_nl_mac_addr_sync(eth_dev); + priv->mnl_socket = mlx5_nl_flow_socket_create(); + if (!priv->mnl_socket) { + err = -rte_errno; + DRV_LOG(WARNING, + "flow rules relying on switch offloads will not be" + " supported: cannot open libmnl socket: %s", + strerror(rte_errno)); + } else { + struct rte_flow_error error; + unsigned int ifindex = mlx5_ifindex(eth_dev); - eth_dev = rte_eth_dev_allocate(name); - if (eth_dev == NULL) { - ERROR("can not allocate rte ethdev"); - err = ENOMEM; - goto port_error; + if (!ifindex) { + err = -rte_errno; + error.message = + "cannot retrieve network interface index"; + } else { + err = mlx5_nl_flow_init(priv->mnl_socket, ifindex, + &error); + } + if (err) { + DRV_LOG(WARNING, + "flow rules relying on switch offloads will" + " not be supported: %s: %s", + error.message, strerror(rte_errno)); + mlx5_nl_flow_socket_destroy(priv->mnl_socket); + priv->mnl_socket = NULL; } - eth_dev->data->dev_private = priv; - eth_dev->data->mac_addrs = priv->mac; - eth_dev->device = &pci_dev->device; - rte_eth_copy_pci_info(eth_dev, pci_dev); - eth_dev->device->driver = &mlx5_driver.driver; - /* - * Initialize burst functions to prevent crashes before link-up. - */ - eth_dev->rx_pkt_burst = removed_rx_burst; - eth_dev->tx_pkt_burst = removed_tx_burst; - priv->dev = eth_dev; - eth_dev->dev_ops = &mlx5_dev_ops; - /* Register MAC address. */ - claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); - TAILQ_INIT(&priv->flows); - TAILQ_INIT(&priv->ctrl_flows); - - /* Hint libmlx5 to use PMD allocator for data plane resources */ - struct mlx5dv_ctx_allocators alctr = { - .alloc = &mlx5_alloc_verbs_buf, - .free = &mlx5_free_verbs_buf, - .data = priv, - }; - mlx5_glue->dv_set_context_attr(ctx, - MLX5DV_CTX_ATTR_BUF_ALLOCATORS, - (void *)((uintptr_t)&alctr)); - - /* Bring Ethernet device up. */ - DEBUG("forcing Ethernet interface up"); - priv_set_flags(priv, ~IFF_UP, IFF_UP); - /* Store device configuration on private structure. */ - priv->config = config; - continue; - -port_error: - if (priv) - rte_free(priv); - if (pd) - claim_zero(mlx5_glue->dealloc_pd(pd)); - if (ctx) - claim_zero(mlx5_glue->close_device(ctx)); - break; } - + TAILQ_INIT(&priv->flows); + TAILQ_INIT(&priv->ctrl_flows); + /* Hint libmlx5 to use PMD allocator for data plane resources */ + struct mlx5dv_ctx_allocators alctr = { + .alloc = &mlx5_alloc_verbs_buf, + .free = &mlx5_free_verbs_buf, + .data = priv, + }; + mlx5_glue->dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS, + (void *)((uintptr_t)&alctr)); + /* Bring Ethernet device up. */ + DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", + eth_dev->data->port_id); + mlx5_set_link_up(eth_dev); /* - * XXX if something went wrong in the loop above, there is a resource - * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as - * long as the dpdk does not provide a way to deallocate a ethdev and a - * way to enumerate the registered ethdevs to free the previous ones. + * Even though the interrupt handler is not installed yet, + * interrupts will still trigger on the asyn_fd from + * Verbs context returned by ibv_open_device(). */ - - /* no port found, complain */ - if (!mlx5_dev[idx].ports) { - err = ENODEV; + mlx5_link_update(eth_dev, 0); + /* Store device configuration on private structure. */ + priv->config = config; + /* Supported Verbs flow priority number detection. */ + err = mlx5_flow_discover_priorities(eth_dev); + if (err < 0) + goto error; + priv->config.flow_prio = err; + /* + * Once the device is added to the list of memory event + * callback, its global MR cache table cannot be expanded + * on the fly because of deadlock. If it overflows, lookup + * should be done by searching MR list linearly, which is slow. + */ + err = mlx5_mr_btree_init(&priv->mr.cache, + MLX5_MR_BTREE_CACHE_N * 2, + eth_dev->device->numa_node); + if (err) { + err = rte_errno; goto error; } - + /* Add device to memory callback list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, + priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + return eth_dev; error: - if (attr_ctx) - claim_zero(mlx5_glue->close_device(attr_ctx)); - if (list) - mlx5_glue->free_device_list(list); - assert(err >= 0); - return -err; + if (priv) { + if (priv->nl_socket_route >= 0) + close(priv->nl_socket_route); + if (priv->nl_socket_rdma >= 0) + close(priv->nl_socket_rdma); + if (priv->mnl_socket) + mlx5_nl_flow_socket_destroy(priv->mnl_socket); + if (own_domain_id) + claim_zero(rte_eth_switch_domain_free(priv->domain_id)); + rte_free(priv); + } + if (pd) + claim_zero(mlx5_glue->dealloc_pd(pd)); + if (eth_dev) + rte_eth_dev_release_port(eth_dev); + if (ctx) + claim_zero(mlx5_glue->close_device(ctx)); + assert(err > 0); + rte_errno = err; + return NULL; +} + +/** Data associated with devices to spawn. */ +struct mlx5_dev_spawn_data { + unsigned int ifindex; /**< Network interface index. */ + struct mlx5_switch_info info; /**< Switch information. */ + struct ibv_device *ibv_dev; /**< Associated IB device. */ + struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */ +}; + +/** + * Comparison callback to sort device data. + * + * This is meant to be used with qsort(). + * + * @param a[in] + * Pointer to pointer to first data object. + * @param b[in] + * Pointer to pointer to second data object. + * + * @return + * 0 if both objects are equal, less than 0 if the first argument is less + * than the second, greater than 0 otherwise. + */ +static int +mlx5_dev_spawn_data_cmp(const void *a, const void *b) +{ + const struct mlx5_switch_info *si_a = + &((const struct mlx5_dev_spawn_data *)a)->info; + const struct mlx5_switch_info *si_b = + &((const struct mlx5_dev_spawn_data *)b)->info; + int ret; + + /* Master device first. */ + ret = si_b->master - si_a->master; + if (ret) + return ret; + /* Then representor devices. */ + ret = si_b->representor - si_a->representor; + if (ret) + return ret; + /* Unidentified devices come last in no specific order. */ + if (!si_a->representor) + return 0; + /* Order representors by name. */ + return si_a->port_name - si_b->port_name; +} + +/** + * DPDK callback to register a PCI device. + * + * This function spawns Ethernet devices out of a given PCI device. + * + * @param[in] pci_drv + * PCI driver structure (mlx5_driver). + * @param[in] pci_dev + * PCI device information. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev) +{ + struct ibv_device **ibv_list; + unsigned int n = 0; + int vf; + int ret; + + assert(pci_drv == &mlx5_driver); + errno = 0; + ibv_list = mlx5_glue->get_device_list(&ret); + if (!ibv_list) { + rte_errno = errno ? errno : ENOSYS; + DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); + return -rte_errno; + } + + struct ibv_device *ibv_match[ret + 1]; + + while (ret-- > 0) { + struct rte_pci_addr pci_addr; + + DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); + if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr)) + continue; + if (pci_dev->addr.domain != pci_addr.domain || + pci_dev->addr.bus != pci_addr.bus || + pci_dev->addr.devid != pci_addr.devid || + pci_dev->addr.function != pci_addr.function) + continue; + DRV_LOG(INFO, "PCI information matches for device \"%s\"", + ibv_list[ret]->name); + ibv_match[n++] = ibv_list[ret]; + } + ibv_match[n] = NULL; + + struct mlx5_dev_spawn_data list[n]; + int nl_route = n ? mlx5_nl_init(NETLINK_ROUTE) : -1; + int nl_rdma = n ? mlx5_nl_init(NETLINK_RDMA) : -1; + unsigned int i; + unsigned int u; + + /* + * The existence of several matching entries (n > 1) means port + * representors have been instantiated. No existing Verbs call nor + * /sys entries can tell them apart, this can only be done through + * Netlink calls assuming kernel drivers are recent enough to + * support them. + * + * In the event of identification failure through Netlink, try again + * through sysfs, then either: + * + * 1. No device matches (n == 0), complain and bail out. + * 2. A single IB device matches (n == 1) and is not a representor, + * assume no switch support. + * 3. Otherwise no safe assumptions can be made; complain louder and + * bail out. + */ + for (i = 0; i != n; ++i) { + list[i].ibv_dev = ibv_match[i]; + list[i].eth_dev = NULL; + if (nl_rdma < 0) + list[i].ifindex = 0; + else + list[i].ifindex = mlx5_nl_ifindex + (nl_rdma, list[i].ibv_dev->name); + if (nl_route < 0 || + !list[i].ifindex || + mlx5_nl_switch_info(nl_route, list[i].ifindex, + &list[i].info) || + ((!list[i].info.representor && !list[i].info.master) && + mlx5_sysfs_switch_info(list[i].ifindex, &list[i].info))) { + list[i].ifindex = 0; + memset(&list[i].info, 0, sizeof(list[i].info)); + continue; + } + } + if (nl_rdma >= 0) + close(nl_rdma); + if (nl_route >= 0) + close(nl_route); + /* Count unidentified devices. */ + for (u = 0, i = 0; i != n; ++i) + if (!list[i].info.master && !list[i].info.representor) + ++u; + if (u) { + if (n == 1 && u == 1) { + /* Case #2. */ + DRV_LOG(INFO, "no switch support detected"); + } else { + /* Case #3. */ + DRV_LOG(ERR, + "unable to tell which of the matching devices" + " is the master (lack of kernel support?)"); + n = 0; + } + } + /* + * Sort list to probe devices in natural order for users convenience + * (i.e. master first, then representors from lowest to highest ID). + */ + if (n) + qsort(list, n, sizeof(*list), mlx5_dev_spawn_data_cmp); + switch (pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: + vf = 1; + break; + default: + vf = 0; + } + for (i = 0; i != n; ++i) { + uint32_t restore; + + list[i].eth_dev = mlx5_dev_spawn + (&pci_dev->device, list[i].ibv_dev, vf, &list[i].info); + if (!list[i].eth_dev) { + if (rte_errno != EBUSY) + break; + /* Device is disabled, ignore it. */ + continue; + } + restore = list[i].eth_dev->data->dev_flags; + rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); + /* Restore non-PCI flags cleared by the above call. */ + list[i].eth_dev->data->dev_flags |= restore; + rte_eth_dev_probing_finish(list[i].eth_dev); + } + mlx5_glue->free_device_list(ibv_list); + if (!n) { + DRV_LOG(WARNING, + "no Verbs device matches PCI device " PCI_PRI_FMT "," + " are kernel drivers loaded?", + pci_dev->addr.domain, pci_dev->addr.bus, + pci_dev->addr.devid, pci_dev->addr.function); + rte_errno = ENOENT; + ret = -rte_errno; + } else if (i != n) { + DRV_LOG(ERR, + "probe of PCI device " PCI_PRI_FMT " aborted after" + " encountering an error: %s", + pci_dev->addr.domain, pci_dev->addr.bus, + pci_dev->addr.devid, pci_dev->addr.function, + strerror(rte_errno)); + ret = -rte_errno; + /* Roll back. */ + while (i--) { + if (!list[i].eth_dev) + continue; + mlx5_dev_close(list[i].eth_dev); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_free(list[i].eth_dev->data->dev_private); + claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); + } + /* Restore original error. */ + rte_errno = -ret; + } else { + ret = 0; + } + return ret; } static const struct rte_pci_id mlx5_pci_id_map[] = { @@ -1036,6 +1483,10 @@ static const struct rte_pci_id mlx5_pci_id_map[] = { PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) }, { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) + }, + { .vendor_id = 0 } }; @@ -1052,11 +1503,54 @@ static struct rte_pci_driver mlx5_driver = { #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS /** + * Suffix RTE_EAL_PMD_PATH with "-glue". + * + * This function performs a sanity check on RTE_EAL_PMD_PATH before + * suffixing its last component. + * + * @param buf[out] + * Output buffer, should be large enough otherwise NULL is returned. + * @param size + * Size of @p out. + * + * @return + * Pointer to @p buf or @p NULL in case suffix cannot be appended. + */ +static char * +mlx5_glue_path(char *buf, size_t size) +{ + static const char *const bad[] = { "/", ".", "..", NULL }; + const char *path = RTE_EAL_PMD_PATH; + size_t len = strlen(path); + size_t off; + int i; + + while (len && path[len - 1] == '/') + --len; + for (off = len; off && path[off - 1] != '/'; --off) + ; + for (i = 0; bad[i]; ++i) + if (!strncmp(path + off, bad[i], (int)(len - off))) + goto error; + i = snprintf(buf, size, "%.*s-glue", (int)len, path); + if (i == -1 || (size_t)i >= size) + goto error; + return buf; +error: + DRV_LOG(ERR, + "unable to append \"-glue\" to last component of" + " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\")," + " please re-configure DPDK"); + return NULL; +} + +/** * Initialization routine for run-time dependency on rdma-core. */ static int mlx5_glue_init(void) { + char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; const char *path[] = { /* * A basic security check is necessary before trusting @@ -1064,7 +1558,13 @@ mlx5_glue_init(void) */ (geteuid() == getuid() && getegid() == getgid() ? getenv("MLX5_GLUE_PATH") : NULL), - RTE_EAL_PMD_PATH, + /* + * When RTE_EAL_PMD_PATH is set, use its glue-suffixed + * variant, otherwise let dlopen() look up libraries on its + * own. + */ + (*RTE_EAL_PMD_PATH ? + mlx5_glue_path(glue_path, sizeof(glue_path)) : ""), }; unsigned int i = 0; void *handle = NULL; @@ -1095,7 +1595,8 @@ mlx5_glue_init(void) break; if (sizeof(name) != (size_t)ret + 1) continue; - DEBUG("looking for rdma-core glue as \"%s\"", name); + DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"", + name); handle = dlopen(name, RTLD_LAZY); break; } while (1); @@ -1107,7 +1608,7 @@ mlx5_glue_init(void) rte_errno = EINVAL; dlmsg = dlerror(); if (dlmsg) - WARN("cannot load glue library: %s", dlmsg); + DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg); goto glue_error; } sym = dlsym(handle, "mlx5_glue"); @@ -1115,7 +1616,7 @@ mlx5_glue_init(void) rte_errno = EINVAL; dlmsg = dlerror(); if (dlmsg) - ERROR("cannot resolve glue symbol: %s", dlmsg); + DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg); goto glue_error; } mlx5_glue = *sym; @@ -1123,9 +1624,9 @@ mlx5_glue_init(void) glue_error: if (handle) dlclose(handle); - WARN("cannot initialize PMD due to missing run-time" - " dependency on rdma-core libraries (libibverbs," - " libmlx5)"); + DRV_LOG(WARNING, + "cannot initialize PMD due to missing run-time dependency on" + " rdma-core libraries (libibverbs, libmlx5)"); return -rte_errno; } @@ -1134,12 +1635,17 @@ glue_error: /** * Driver initialization routine. */ -RTE_INIT(rte_mlx5_pmd_init); -static void -rte_mlx5_pmd_init(void) +RTE_INIT(rte_mlx5_pmd_init) { - /* Build the static table for ptype conversion. */ + /* Initialize driver log type. */ + mlx5_logtype = rte_log_register("pmd.net.mlx5"); + if (mlx5_logtype >= 0) + rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE); + + /* Build the static tables for Verbs conversion. */ mlx5_set_ptype_table(); + mlx5_set_cksum_table(); + mlx5_set_swp_types_table(); /* * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use * huge pages. Calling ibv_fork_init() during init allows @@ -1150,6 +1656,11 @@ rte_mlx5_pmd_init(void) /* Match the size of Rx completion entry to the size of a cacheline. */ if (RTE_CACHE_LINE_SIZE == 128) setenv("MLX5_CQE_SIZE", "128", 0); + /* + * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to + * cleanup all the Verbs resources even when the device was removed. + */ + setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1); #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS if (mlx5_glue_init()) return; @@ -1165,8 +1676,9 @@ rte_mlx5_pmd_init(void) } #endif if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) { - ERROR("rdma-core glue \"%s\" mismatch: \"%s\" is required", - mlx5_glue->version, MLX5_GLUE_VERSION); + DRV_LOG(ERR, + "rdma-core glue \"%s\" mismatch: \"%s\" is required", + mlx5_glue->version, MLX5_GLUE_VERSION); return; } mlx5_glue->fork_init(); diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 965c19f2..a3a34cff 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_H_ @@ -26,12 +26,13 @@ #include <rte_pci.h> #include <rte_ether.h> #include <rte_ethdev_driver.h> -#include <rte_spinlock.h> +#include <rte_rwlock.h> #include <rte_interrupts.h> #include <rte_errno.h> #include <rte_flow.h> #include "mlx5_utils.h" +#include "mlx5_mr.h" #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" @@ -49,8 +50,27 @@ enum { PCI_DEVICE_ID_MELLANOX_CONNECTX5VF = 0x1018, PCI_DEVICE_ID_MELLANOX_CONNECTX5EX = 0x1019, PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF = 0x101a, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BF = 0xa2d2, }; +/** Switch information returned by mlx5_nl_switch_info(). */ +struct mlx5_switch_info { + uint32_t master:1; /**< Master device. */ + uint32_t representor:1; /**< Representor device. */ + int32_t port_name; /**< Representor port name. */ + uint64_t switch_id; /**< Switch identifier. */ +}; + +LIST_HEAD(mlx5_dev_list, priv); + +/* Shared memory between primary and secondary processes. */ +struct mlx5_shared_data { + struct mlx5_dev_list mem_event_cb_list; + rte_rwlock_t mem_event_rwlock; +}; + +extern struct mlx5_shared_data *mlx5_shared_data; + struct mlx5_xstats_ctrl { /* Number of device stats. */ uint16_t stats_n; @@ -75,19 +95,34 @@ TAILQ_HEAD(mlx5_flows, rte_flow); */ struct mlx5_dev_config { unsigned int hw_csum:1; /* Checksum offload is supported. */ - unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */ unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */ unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */ unsigned int hw_padding:1; /* End alignment padding is supported. */ - unsigned int sriov:1; /* This is a VF or PF with VF devices. */ + unsigned int vf:1; /* This is a VF. */ unsigned int mps:2; /* Multi-packet send supported mode. */ - unsigned int tunnel_en:1; /* Whether tunnel is supported. */ + unsigned int tunnel_en:1; + /* Whether tunnel stateless offloads are supported. */ + unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */ unsigned int flow_counter_en:1; /* Whether flow counter is supported. */ unsigned int cqe_comp:1; /* CQE compression is enabled. */ unsigned int tso:1; /* Whether TSO is supported. */ unsigned int tx_vec_en:1; /* Tx vector is enabled. */ unsigned int rx_vec_en:1; /* Rx vector is enabled. */ unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ + unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */ + unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */ + unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */ + struct { + unsigned int enabled:1; /* Whether MPRQ is enabled. */ + unsigned int stride_num_n; /* Number of strides. */ + unsigned int min_stride_size_n; /* Min size of a stride. */ + unsigned int max_stride_size_n; /* Max size of a stride. */ + unsigned int max_memcpy_len; + /* Maximum packet size to memcpy Rx packets. */ + unsigned int min_rxqs_num; + /* Rx queue count threshold to enable MPRQ. */ + } mprq; /* Configurations for Multi-Packet RQ. */ + unsigned int flow_prio; /* Number of flow priorities. */ unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. */ unsigned int ind_table_max_size; /* Maximum indirection table size. */ int txq_inline; /* Maximum packet size for inlining. */ @@ -113,33 +148,63 @@ struct mlx5_verbs_alloc_ctx { const void *obj; /* Pointer to the DPDK object. */ }; +LIST_HEAD(mlx5_mr_list, mlx5_mr); + +/* Flow drop context necessary due to Verbs API. */ +struct mlx5_drop { + struct mlx5_hrxq *hrxq; /* Hash Rx queue queue. */ + struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */ +}; + +/** DPDK port to network interface index (ifindex) conversion. */ +struct mlx5_nl_flow_ptoi { + uint16_t port_id; /**< DPDK port ID. */ + unsigned int ifindex; /**< Network interface index. */ +}; + +struct mnl_socket; + struct priv { - struct rte_eth_dev *dev; /* Ethernet device of master process. */ + LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */ + struct rte_eth_dev_data *dev_data; /* Pointer to device data. */ struct ibv_context *ctx; /* Verbs context. */ struct ibv_device_attr_ex device_attr; /* Device properties. */ struct ibv_pd *pd; /* Protection Domain. */ + char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */ char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */ struct ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */ + BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES); + /* Bit-field of MAC addresses owned by the PMD. */ uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */ unsigned int vlan_filter_n; /* Number of configured VLAN filters. */ /* Device properties. */ uint16_t mtu; /* Configured MTU. */ - uint8_t port; /* Physical port number. */ - unsigned int pending_alarm:1; /* An alarm is pending. */ unsigned int isolated:1; /* Whether isolated mode is enabled. */ + unsigned int representor:1; /* Device is a port representor. */ + uint16_t domain_id; /* Switch domain identifier. */ + int32_t representor_id; /* Port representor identifier. */ /* RX/TX queues. */ unsigned int rxqs_n; /* RX queues array size. */ unsigned int txqs_n; /* TX queues array size. */ struct mlx5_rxq_data *(*rxqs)[]; /* RX queues. */ struct mlx5_txq_data *(*txqs)[]; /* TX queues. */ + struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ struct rte_eth_rss_conf rss_conf; /* RSS configuration. */ struct rte_intr_handle intr_handle; /* Interrupt handler. */ unsigned int (*reta_idx)[]; /* RETA index table. */ unsigned int reta_idx_n; /* RETA index size. */ - struct mlx5_hrxq_drop *flow_drop_queue; /* Flow drop queue. */ + struct mlx5_drop drop_queue; /* Flow drop queues. */ struct mlx5_flows flows; /* RTE Flow rules. */ struct mlx5_flows ctrl_flows; /* Control flow rules. */ - LIST_HEAD(mr, mlx5_mr) mr; /* Memory region. */ + LIST_HEAD(counters, mlx5_flow_counter) flow_counters; + /* Flow counters. */ + struct { + uint32_t dev_gen; /* Generation number to flush local caches. */ + rte_rwlock_t rwlock; /* MR Lock. */ + struct mlx5_mr_btree cache; /* Global MR cache table. */ + struct mlx5_mr_list mr_list; /* Registered MR list. */ + struct mlx5_mr_list mr_free_list; /* Freed MR list. */ + } mr; LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */ LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */ LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */ @@ -149,55 +214,25 @@ struct priv { LIST_HEAD(ind_tables, mlx5_ind_table_ibv) ind_tbls; uint32_t link_speed_capa; /* Link speed capabilities. */ struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */ - rte_spinlock_t lock; /* Lock for control functions. */ int primary_socket; /* Unix socket for primary process. */ void *uar_base; /* Reserved address space for UAR mapping */ struct rte_intr_handle intr_handle_socket; /* Interrupt handler. */ struct mlx5_dev_config config; /* Device configuration. */ struct mlx5_verbs_alloc_ctx verbs_alloc_ctx; /* Context for Verbs allocator. */ + int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */ + int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */ + uint32_t nl_sn; /* Netlink message sequence number. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */ + rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX]; + /* UAR same-page access control required in 32bit implementations. */ +#endif + struct mnl_socket *mnl_socket; /* Libmnl socket. */ }; -/** - * Lock private structure to protect it from concurrent access in the - * control path. - * - * @param priv - * Pointer to private structure. - */ -static inline void -priv_lock(struct priv *priv) -{ - rte_spinlock_lock(&priv->lock); -} - -/** - * Try to lock private structure to protect it from concurrent access in the - * control path. - * - * @param priv - * Pointer to private structure. - * - * @return - * 1 if the lock is successfully taken; 0 otherwise. - */ -static inline int -priv_trylock(struct priv *priv) -{ - return rte_spinlock_trylock(&priv->lock); -} - -/** - * Unlock private structure. - * - * @param priv - * Pointer to private structure. - */ -static inline void -priv_unlock(struct priv *priv) -{ - rte_spinlock_unlock(&priv->lock); -} +#define PORT_ID(priv) ((priv)->dev_data->port_id) +#define ETH_DEV(priv) (&rte_eth_devices[PORT_ID(priv)]) /* mlx5.c */ @@ -205,131 +240,179 @@ int mlx5_getenv_int(const char *); /* mlx5_ethdev.c */ -struct priv *mlx5_get_priv(struct rte_eth_dev *dev); -int mlx5_is_secondary(void); -int priv_get_ifname(const struct priv *, char (*)[IF_NAMESIZE]); -int priv_ifreq(const struct priv *, int req, struct ifreq *); -int priv_is_ib_cntr(const char *); -int priv_get_cntr_sysfs(struct priv *, const char *, uint64_t *); -int priv_get_num_vfs(struct priv *, uint16_t *); -int priv_get_mtu(struct priv *, uint16_t *); -int priv_set_flags(struct priv *, unsigned int, unsigned int); -int mlx5_dev_configure(struct rte_eth_dev *); -void mlx5_dev_infos_get(struct rte_eth_dev *, struct rte_eth_dev_info *); +int mlx5_get_master_ifname(const struct rte_eth_dev *dev, + char (*ifname)[IF_NAMESIZE]); +int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]); +unsigned int mlx5_ifindex(const struct rte_eth_dev *dev); +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr, + int master); +int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu); +int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, + unsigned int flags); +int mlx5_dev_configure(struct rte_eth_dev *dev); +void mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info); const uint32_t *mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev); -int priv_link_update(struct priv *, int); -int priv_force_link_status_change(struct priv *, int); -int mlx5_link_update(struct rte_eth_dev *, int); -int mlx5_dev_set_mtu(struct rte_eth_dev *, uint16_t); -int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *, struct rte_eth_fc_conf *); -int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *, struct rte_eth_fc_conf *); -int mlx5_ibv_device_to_pci_addr(const struct ibv_device *, - struct rte_pci_addr *); -void mlx5_dev_link_status_handler(void *); -void mlx5_dev_interrupt_handler(void *); -void priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *); -void priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *); +int mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete); +int mlx5_force_link_status_change(struct rte_eth_dev *dev, int status); +int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu); +int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +int mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, + struct rte_pci_addr *pci_addr); +void mlx5_dev_link_status_handler(void *arg); +void mlx5_dev_interrupt_handler(void *arg); +void mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev); +void mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev); int mlx5_set_link_down(struct rte_eth_dev *dev); int mlx5_set_link_up(struct rte_eth_dev *dev); int mlx5_is_removed(struct rte_eth_dev *dev); -eth_tx_burst_t priv_select_tx_function(struct priv *, struct rte_eth_dev *); -eth_rx_burst_t priv_select_rx_function(struct priv *, struct rte_eth_dev *); +eth_tx_burst_t mlx5_select_tx_function(struct rte_eth_dev *dev); +eth_rx_burst_t mlx5_select_rx_function(struct rte_eth_dev *dev); +unsigned int mlx5_dev_to_port_id(const struct rte_device *dev, + uint16_t *port_list, + unsigned int port_list_n); +int mlx5_sysfs_switch_info(unsigned int ifindex, + struct mlx5_switch_info *info); /* mlx5_mac.c */ -int priv_get_mac(struct priv *, uint8_t (*)[ETHER_ADDR_LEN]); -void mlx5_mac_addr_remove(struct rte_eth_dev *, uint32_t); -int mlx5_mac_addr_add(struct rte_eth_dev *, struct ether_addr *, uint32_t, - uint32_t); -void mlx5_mac_addr_set(struct rte_eth_dev *, struct ether_addr *); +int mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN]); +void mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index); +int mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index, uint32_t vmdq); +int mlx5_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr); +int mlx5_set_mc_addr_list(struct rte_eth_dev *dev, + struct ether_addr *mc_addr_set, uint32_t nb_mc_addr); /* mlx5_rss.c */ -int mlx5_rss_hash_update(struct rte_eth_dev *, struct rte_eth_rss_conf *); -int mlx5_rss_hash_conf_get(struct rte_eth_dev *, struct rte_eth_rss_conf *); -int priv_rss_reta_index_resize(struct priv *, unsigned int); -int mlx5_dev_rss_reta_query(struct rte_eth_dev *, - struct rte_eth_rss_reta_entry64 *, uint16_t); -int mlx5_dev_rss_reta_update(struct rte_eth_dev *, - struct rte_eth_rss_reta_entry64 *, uint16_t); +int mlx5_rss_hash_update(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +int mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf); +int mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size); +int mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); +int mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, + uint16_t reta_size); /* mlx5_rxmode.c */ -void mlx5_promiscuous_enable(struct rte_eth_dev *); -void mlx5_promiscuous_disable(struct rte_eth_dev *); -void mlx5_allmulticast_enable(struct rte_eth_dev *); -void mlx5_allmulticast_disable(struct rte_eth_dev *); +void mlx5_promiscuous_enable(struct rte_eth_dev *dev); +void mlx5_promiscuous_disable(struct rte_eth_dev *dev); +void mlx5_allmulticast_enable(struct rte_eth_dev *dev); +void mlx5_allmulticast_disable(struct rte_eth_dev *dev); /* mlx5_stats.c */ -void priv_xstats_init(struct priv *); -int mlx5_stats_get(struct rte_eth_dev *, struct rte_eth_stats *); -void mlx5_stats_reset(struct rte_eth_dev *); -int mlx5_xstats_get(struct rte_eth_dev *, - struct rte_eth_xstat *, unsigned int); -void mlx5_xstats_reset(struct rte_eth_dev *); -int mlx5_xstats_get_names(struct rte_eth_dev *, - struct rte_eth_xstat_name *, unsigned int); +void mlx5_xstats_init(struct rte_eth_dev *dev); +int mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats); +void mlx5_stats_reset(struct rte_eth_dev *dev); +int mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + unsigned int n); +void mlx5_xstats_reset(struct rte_eth_dev *dev); +int mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_xstat_name *xstats_names, + unsigned int n); /* mlx5_vlan.c */ -int mlx5_vlan_filter_set(struct rte_eth_dev *, uint16_t, int); -int mlx5_vlan_offload_set(struct rte_eth_dev *, int); -void mlx5_vlan_strip_queue_set(struct rte_eth_dev *, uint16_t, int); +int mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on); +void mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on); +int mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask); /* mlx5_trigger.c */ -int mlx5_dev_start(struct rte_eth_dev *); -void mlx5_dev_stop(struct rte_eth_dev *); -int priv_dev_traffic_enable(struct priv *, struct rte_eth_dev *); -int priv_dev_traffic_disable(struct priv *, struct rte_eth_dev *); -int priv_dev_traffic_restart(struct priv *, struct rte_eth_dev *); -int mlx5_traffic_restart(struct rte_eth_dev *); +int mlx5_dev_start(struct rte_eth_dev *dev); +void mlx5_dev_stop(struct rte_eth_dev *dev); +int mlx5_traffic_enable(struct rte_eth_dev *dev); +void mlx5_traffic_disable(struct rte_eth_dev *dev); +int mlx5_traffic_restart(struct rte_eth_dev *dev); /* mlx5_flow.c */ -int mlx5_dev_filter_ctrl(struct rte_eth_dev *, enum rte_filter_type, - enum rte_filter_op, void *); -int mlx5_flow_validate(struct rte_eth_dev *, const struct rte_flow_attr *, - const struct rte_flow_item [], - const struct rte_flow_action [], - struct rte_flow_error *); -struct rte_flow *mlx5_flow_create(struct rte_eth_dev *, - const struct rte_flow_attr *, - const struct rte_flow_item [], - const struct rte_flow_action [], - struct rte_flow_error *); -int mlx5_flow_destroy(struct rte_eth_dev *, struct rte_flow *, - struct rte_flow_error *); -void priv_flow_flush(struct priv *, struct mlx5_flows *); -int mlx5_flow_flush(struct rte_eth_dev *, struct rte_flow_error *); -int mlx5_flow_query(struct rte_eth_dev *, struct rte_flow *, - enum rte_flow_action_type, void *, - struct rte_flow_error *); -int mlx5_flow_isolate(struct rte_eth_dev *, int, struct rte_flow_error *); -int priv_flow_start(struct priv *, struct mlx5_flows *); -void priv_flow_stop(struct priv *, struct mlx5_flows *); -int priv_flow_verify(struct priv *); -int mlx5_ctrl_flow_vlan(struct rte_eth_dev *, struct rte_flow_item_eth *, - struct rte_flow_item_eth *, struct rte_flow_item_vlan *, - struct rte_flow_item_vlan *); -int mlx5_ctrl_flow(struct rte_eth_dev *, struct rte_flow_item_eth *, - struct rte_flow_item_eth *); -int priv_flow_create_drop_queue(struct priv *); -void priv_flow_delete_drop_queue(struct priv *); +int mlx5_flow_discover_priorities(struct rte_eth_dev *dev); +void mlx5_flow_print(struct rte_flow *flow); +int mlx5_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +struct rte_flow *mlx5_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +int mlx5_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error); +void mlx5_flow_list_flush(struct rte_eth_dev *dev, struct mlx5_flows *list); +int mlx5_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error); +int mlx5_flow_query(struct rte_eth_dev *dev, struct rte_flow *flow, + const struct rte_flow_action *action, void *data, + struct rte_flow_error *error); +int mlx5_flow_isolate(struct rte_eth_dev *dev, int enable, + struct rte_flow_error *error); +int mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg); +int mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list); +void mlx5_flow_stop(struct rte_eth_dev *dev, struct mlx5_flows *list); +int mlx5_flow_verify(struct rte_eth_dev *dev); +int mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask, + struct rte_flow_item_vlan *vlan_spec, + struct rte_flow_item_vlan *vlan_mask); +int mlx5_ctrl_flow(struct rte_eth_dev *dev, + struct rte_flow_item_eth *eth_spec, + struct rte_flow_item_eth *eth_mask); +int mlx5_flow_create_drop_queue(struct rte_eth_dev *dev); +void mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev); /* mlx5_socket.c */ -int priv_socket_init(struct priv *priv); -int priv_socket_uninit(struct priv *priv); -void priv_socket_handle(struct priv *priv); -int priv_socket_connect(struct priv *priv); - -/* mlx5_mr.c */ - -struct mlx5_mr *priv_mr_new(struct priv *, struct rte_mempool *); -struct mlx5_mr *priv_mr_get(struct priv *, struct rte_mempool *); -int priv_mr_release(struct priv *, struct mlx5_mr *); -int priv_mr_verify(struct priv *); +int mlx5_socket_init(struct rte_eth_dev *priv); +void mlx5_socket_uninit(struct rte_eth_dev *priv); +void mlx5_socket_handle(struct rte_eth_dev *priv); +int mlx5_socket_connect(struct rte_eth_dev *priv); + +/* mlx5_nl.c */ + +int mlx5_nl_init(int protocol); +int mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index); +int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index); +void mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev); +void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev); +int mlx5_nl_promisc(struct rte_eth_dev *dev, int enable); +int mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable); +unsigned int mlx5_nl_ifindex(int nl, const char *name); +int mlx5_nl_switch_info(int nl, unsigned int ifindex, + struct mlx5_switch_info *info); + +/* mlx5_nl_flow.c */ + +int mlx5_nl_flow_transpose(void *buf, + size_t size, + const struct mlx5_nl_flow_ptoi *ptoi, + const struct rte_flow_attr *attr, + const struct rte_flow_item *pattern, + const struct rte_flow_action *actions, + struct rte_flow_error *error); +void mlx5_nl_flow_brand(void *buf, uint32_t handle); +int mlx5_nl_flow_create(struct mnl_socket *nl, void *buf, + struct rte_flow_error *error); +int mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf, + struct rte_flow_error *error); +int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex, + struct rte_flow_error *error); +struct mnl_socket *mlx5_nl_flow_socket_create(void); +void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl); #endif /* RTE_PMD_MLX5_H_ */ diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index c3334ca3..f2a16795 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_DEFS_H_ @@ -13,8 +13,13 @@ /* Reported driver name. */ #define MLX5_DRIVER_NAME "net_mlx5" +/* Maximum number of simultaneous unicast MAC addresses. */ +#define MLX5_MAX_UC_MAC_ADDRESSES 128 +/* Maximum number of simultaneous Multicast MAC addresses. */ +#define MLX5_MAX_MC_MAC_ADDRESSES 128 /* Maximum number of simultaneous MAC addresses. */ -#define MLX5_MAX_MAC_ADDRESSES 128 +#define MLX5_MAX_MAC_ADDRESSES \ + (MLX5_MAX_UC_MAC_ADDRESSES + MLX5_MAX_MC_MAC_ADDRESSES) /* Maximum number of simultaneous VLAN filters. */ #define MLX5_MAX_VLAN_IDS 128 @@ -32,16 +37,11 @@ */ #define MLX5_TX_COMP_THRESH_INLINE_DIV (1 << 3) -/* - * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP - * from which buffers are to be transmitted will have to be mapped by this - * driver to their own Memory Region (MR). This is a slow operation. - * - * This value is always 1 for RX queues. - */ -#ifndef MLX5_PMD_TX_MP_CACHE -#define MLX5_PMD_TX_MP_CACHE 8 -#endif +/* Size of per-queue MR cache array for linear search. */ +#define MLX5_MR_CACHE_N 8 + +/* Size of MR cache table for binary search. */ +#define MLX5_MR_BTREE_CACHE_N 256 /* * If defined, only use software counters. The PMD will never ask the hardware @@ -58,16 +58,17 @@ #define MLX5_MAX_XSTATS 32 /* Maximum Packet headers size (L2+L3+L4) for TSO. */ -#define MLX5_MAX_TSO_HEADER 128 +#define MLX5_MAX_TSO_HEADER 192 /* Default minimum number of Tx queues for vectorized Tx. */ #define MLX5_VPMD_MIN_TXQS 4 /* Threshold of buffer replenishment for vectorized Rx. */ -#define MLX5_VPMD_RXQ_RPLNSH_THRESH 64U +#define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \ + (RTE_MIN(MLX5_VPMD_RX_MAX_BURST, (unsigned int)(n) >> 2)) /* Maximum size of burst for vectorized Rx. */ -#define MLX5_VPMD_RX_MAX_BURST MLX5_VPMD_RXQ_RPLNSH_THRESH +#define MLX5_VPMD_RX_MAX_BURST 64U /* * Maximum size of burst for vectorized Tx. This is related to the maximum size @@ -82,17 +83,54 @@ /* Supported RSS */ #define MLX5_RSS_HF_MASK (~(ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP)) -/* Maximum number of attempts to query link status before giving up. */ -#define MLX5_MAX_LINK_QUERY_ATTEMPTS 5 +/* Timeout in seconds to get a valid link status. */ +#define MLX5_LINK_STATUS_TIMEOUT 10 /* Reserved address space for UAR mapping. */ -#define MLX5_UAR_SIZE (1ULL << 32) +#define MLX5_UAR_SIZE (1ULL << (sizeof(uintptr_t) * 4)) /* Offset of reserved UAR address space to hugepage memory. Offset is used here * to minimize possibility of address next to hugepage being used by other code * in either primary or secondary process, failing to map TX UAR would make TX * packets invisible to HW. */ -#define MLX5_UAR_OFFSET (1ULL << 32) +#define MLX5_UAR_OFFSET (1ULL << (sizeof(uintptr_t) * 4)) + +/* Maximum number of UAR pages used by a port, + * These are the size and mask for an array of mutexes used to synchronize + * the access to port's UARs on platforms that do not support 64 bit writes. + * In such systems it is possible to issue the 64 bits DoorBells through two + * consecutive writes, each write 32 bits. The access to a UAR page (which can + * be accessible by all threads in the process) must be synchronized + * (for example, using a semaphore). Such a synchronization is not required + * when ringing DoorBells on different UAR pages. + * A port with 512 Tx queues uses 8, 4kBytes, UAR pages which are shared + * among the ports. + */ +#define MLX5_UAR_PAGE_NUM_MAX 64 +#define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) - 1) + +/* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */ +#define MLX5_MPRQ_STRIDE_NUM_N 6U + +/* Two-byte shift is disabled for Multi-Packet RQ. */ +#define MLX5_MPRQ_TWO_BYTE_SHIFT 0 + +/* + * Minimum size of packet to be memcpy'd instead of being attached as an + * external buffer. + */ +#define MLX5_MPRQ_MEMCPY_DEFAULT_LEN 128 + +/* Minimum number Rx queues to enable Multi-Packet RQ. */ +#define MLX5_MPRQ_MIN_RXQS 12 + +/* Cache size of mempool for Multi-Packet RQ. */ +#define MLX5_MPRQ_MP_CACHE_SZ 32U + +/* Definition of static_assert found in /usr/include/assert.h */ +#ifndef HAVE_STATIC_ASSERT +#define static_assert _Static_assert +#endif #endif /* RTE_PMD_MLX5_DEFS_H_ */ diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 66650769..34c5b95e 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -1,12 +1,13 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #define _GNU_SOURCE #include <stddef.h> #include <assert.h> +#include <inttypes.h> #include <unistd.h> #include <stdint.h> #include <stdio.h> @@ -17,14 +18,13 @@ #include <net/if.h> #include <sys/ioctl.h> #include <sys/socket.h> -#include <sys/utsname.h> #include <netinet/in.h> #include <linux/ethtool.h> #include <linux/sockios.h> -#include <linux/version.h> #include <fcntl.h> #include <stdalign.h> #include <sys/un.h> +#include <time.h> #include <rte_atomic.h> #include <rte_ethdev_driver.h> @@ -32,14 +32,41 @@ #include <rte_mbuf.h> #include <rte_common.h> #include <rte_interrupts.h> -#include <rte_alarm.h> #include <rte_malloc.h> +#include <rte_string_fns.h> +#include <rte_rwlock.h> #include "mlx5.h" #include "mlx5_glue.h" #include "mlx5_rxtx.h" #include "mlx5_utils.h" +/* Supported speed values found in /usr/include/linux/ethtool.h */ +#ifndef HAVE_SUPPORTED_40000baseKR4_Full +#define SUPPORTED_40000baseKR4_Full (1 << 23) +#endif +#ifndef HAVE_SUPPORTED_40000baseCR4_Full +#define SUPPORTED_40000baseCR4_Full (1 << 24) +#endif +#ifndef HAVE_SUPPORTED_40000baseSR4_Full +#define SUPPORTED_40000baseSR4_Full (1 << 25) +#endif +#ifndef HAVE_SUPPORTED_40000baseLR4_Full +#define SUPPORTED_40000baseLR4_Full (1 << 26) +#endif +#ifndef HAVE_SUPPORTED_56000baseKR4_Full +#define SUPPORTED_56000baseKR4_Full (1 << 27) +#endif +#ifndef HAVE_SUPPORTED_56000baseCR4_Full +#define SUPPORTED_56000baseCR4_Full (1 << 28) +#endif +#ifndef HAVE_SUPPORTED_56000baseSR4_Full +#define SUPPORTED_56000baseSR4_Full (1 << 29) +#endif +#ifndef HAVE_SUPPORTED_56000baseLR4_Full +#define SUPPORTED_56000baseLR4_Full (1 << 30) +#endif + /* Add defines in case the running kernel is not the same as user headers. */ #ifndef ETHTOOL_GLINKSETTINGS struct ethtool_link_settings { @@ -92,19 +119,21 @@ struct ethtool_link_settings { #endif /** - * Get interface name from private structure. + * Get master interface name from private structure. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * @param[out] ifname * Interface name output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) +mlx5_get_master_ifname(const struct rte_eth_dev *dev, + char (*ifname)[IF_NAMESIZE]) { + struct priv *priv = dev->data->dev_private; DIR *dir; struct dirent *dent; unsigned int dev_type = 0; @@ -115,8 +144,10 @@ priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) MKSTR(path, "%s/device/net", priv->ibdev_path); dir = opendir(path); - if (dir == NULL) - return -1; + if (dir == NULL) { + rte_errno = errno; + return -rte_errno; + } } while ((dent = readdir(dir)) != NULL) { char *name = dent->d_name; @@ -162,359 +193,198 @@ try_dev_id: if (dev_port == dev_port_prev) goto try_dev_id; dev_port_prev = dev_port; - if (dev_port == (priv->port - 1u)) - snprintf(match, sizeof(match), "%s", name); + if (dev_port == 0) + strlcpy(match, name, sizeof(match)); } closedir(dir); - if (match[0] == '\0') - return -1; + if (match[0] == '\0') { + rte_errno = ENOENT; + return -rte_errno; + } strncpy(*ifname, match, sizeof(*ifname)); return 0; } /** - * Check if the counter is located on ib counters file. + * Get interface name from private structure. * - * @param[in] cntr - * Counter name. + * This is a port representor-aware version of mlx5_get_master_ifname(). * - * @return - * 1 if counter is located on ib counters file , 0 otherwise. - */ -int -priv_is_ib_cntr(const char *cntr) -{ - if (!strcmp(cntr, "out_of_buffer")) - return 1; - return 0; -} - -/** - * Read from sysfs entry. - * - * @param[in] priv - * Pointer to private structure. - * @param[in] entry - * Entry name relative to sysfs path. - * @param[out] buf - * Data output buffer. - * @param size - * Buffer size. + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] ifname + * Interface name output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -priv_sysfs_read(const struct priv *priv, const char *entry, - char *buf, size_t size) +int +mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) { - char ifname[IF_NAMESIZE]; - FILE *file; - int ret; - int err; - - if (priv_get_ifname(priv, &ifname)) - return -1; - - if (priv_is_ib_cntr(entry)) { - MKSTR(path, "%s/ports/1/hw_counters/%s", - priv->ibdev_path, entry); - file = fopen(path, "rb"); - } else { - MKSTR(path, "%s/device/net/%s/%s", - priv->ibdev_path, ifname, entry); - file = fopen(path, "rb"); + struct priv *priv = dev->data->dev_private; + unsigned int ifindex = + priv->nl_socket_rdma >= 0 ? + mlx5_nl_ifindex(priv->nl_socket_rdma, priv->ibdev_name) : 0; + + if (!ifindex) { + if (!priv->representor) + return mlx5_get_master_ifname(dev, ifname); + rte_errno = ENXIO; + return -rte_errno; } - if (file == NULL) - return -1; - ret = fread(buf, 1, size, file); - err = errno; - if (((size_t)ret < size) && (ferror(file))) - ret = -1; - else - ret = size; - fclose(file); - errno = err; - return ret; + if (if_indextoname(ifindex, &(*ifname)[0])) + return 0; + rte_errno = errno; + return -rte_errno; } /** - * Write to sysfs entry. + * Get the interface index from device name. * - * @param[in] priv - * Pointer to private structure. - * @param[in] entry - * Entry name relative to sysfs path. - * @param[in] buf - * Data buffer. - * @param size - * Buffer size. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 0 on success, -1 on failure and errno is set. + * Nonzero interface index on success, zero otherwise and rte_errno is set. */ -static int -priv_sysfs_write(const struct priv *priv, const char *entry, - char *buf, size_t size) +unsigned int +mlx5_ifindex(const struct rte_eth_dev *dev) { char ifname[IF_NAMESIZE]; - FILE *file; - int ret; - int err; - - if (priv_get_ifname(priv, &ifname)) - return -1; - - MKSTR(path, "%s/device/net/%s/%s", priv->ibdev_path, ifname, entry); - - file = fopen(path, "wb"); - if (file == NULL) - return -1; - ret = fwrite(buf, 1, size, file); - err = errno; - if (((size_t)ret < size) || (ferror(file))) - ret = -1; - else - ret = size; - fclose(file); - errno = err; - return ret; -} - -/** - * Get unsigned long sysfs property. - * - * @param priv - * Pointer to private structure. - * @param[in] name - * Entry name relative to sysfs path. - * @param[out] value - * Value output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) -{ - int ret; - unsigned long value_ret; - char value_str[32]; + unsigned int ifindex; - ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); - if (ret == -1) { - DEBUG("cannot read %s value from sysfs: %s", - name, strerror(errno)); - return -1; - } - value_str[ret] = '\0'; - errno = 0; - value_ret = strtoul(value_str, NULL, 0); - if (errno) { - DEBUG("invalid %s value `%s': %s", name, value_str, - strerror(errno)); - return -1; - } - *value = value_ret; - return 0; -} - -/** - * Set unsigned long sysfs property. - * - * @param priv - * Pointer to private structure. - * @param[in] name - * Entry name relative to sysfs path. - * @param value - * Value to set. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) -{ - int ret; - MKSTR(value_str, "%lu", value); - - ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); - if (ret == -1) { - DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", - name, value_str, value, strerror(errno)); - return -1; - } - return 0; + if (mlx5_get_ifname(dev, &ifname)) + return 0; + ifindex = if_nametoindex(ifname); + if (!ifindex) + rte_errno = errno; + return ifindex; } /** * Perform ifreq ioctl() on associated Ethernet device. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * @param req * Request number to pass to ioctl(). * @param[out] ifr * Interface request structure output buffer. + * @param master + * When device is a port representor, perform request on master device + * instead. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr, + int master) { int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); - int ret = -1; + int ret = 0; - if (sock == -1) - return ret; - if (priv_get_ifname(priv, &ifr->ifr_name) == 0) - ret = ioctl(sock, req, ifr); + if (sock == -1) { + rte_errno = errno; + return -rte_errno; + } + if (master) + ret = mlx5_get_master_ifname(dev, &ifr->ifr_name); + else + ret = mlx5_get_ifname(dev, &ifr->ifr_name); + if (ret) + goto error; + ret = ioctl(sock, req, ifr); + if (ret == -1) { + rte_errno = errno; + goto error; + } close(sock); - return ret; -} - -/** - * Return the number of active VFs for the current device. - * - * @param[in] priv - * Pointer to private structure. - * @param[out] num_vfs - * Number of active VFs. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -int -priv_get_num_vfs(struct priv *priv, uint16_t *num_vfs) -{ - /* The sysfs entry name depends on the operating system. */ - const char **name = (const char *[]){ - "device/sriov_numvfs", - "device/mlx5_num_vfs", - NULL, - }; - int ret; - - do { - unsigned long ulong_num_vfs; - - ret = priv_get_sysfs_ulong(priv, *name, &ulong_num_vfs); - if (!ret) - *num_vfs = ulong_num_vfs; - } while (*(++name) && ret); - return ret; + return 0; +error: + close(sock); + return -rte_errno; } /** * Get device MTU. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[out] mtu * MTU value output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_get_mtu(struct priv *priv, uint16_t *mtu) +mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) { - unsigned long ulong_mtu; + struct ifreq request; + int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0); - if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) - return -1; - *mtu = ulong_mtu; - return 0; -} - -/** - * Read device counter from sysfs. - * - * @param priv - * Pointer to private structure. - * @param name - * Counter name. - * @param[out] cntr - * Counter output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -int -priv_get_cntr_sysfs(struct priv *priv, const char *name, uint64_t *cntr) -{ - unsigned long ulong_ctr; - - if (priv_get_sysfs_ulong(priv, name, &ulong_ctr) == -1) - return -1; - *cntr = ulong_ctr; + if (ret) + return ret; + *mtu = request.ifr_mtu; return 0; } /** * Set device MTU. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param mtu * MTU value to set. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_set_mtu(struct priv *priv, uint16_t mtu) +mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) { - uint16_t new_mtu; + struct ifreq request = { .ifr_mtu = mtu, }; - if (priv_set_sysfs_ulong(priv, "mtu", mtu) || - priv_get_mtu(priv, &new_mtu)) - return -1; - if (new_mtu == mtu) - return 0; - errno = EINVAL; - return -1; + return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0); } /** * Set device flags. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param keep * Bitmask for flags that must remain untouched. * @param flags * Bitmask for flags to modify. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) +mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) { - unsigned long tmp; + struct ifreq request; + int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0); - if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) - return -1; - tmp &= keep; - tmp |= (flags & (~keep)); - return priv_set_sysfs_ulong(priv, "flags", tmp); + if (ret) + return ret; + request.ifr_flags &= keep; + request.ifr_flags |= flags & ~keep; + return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0); } /** - * Ethernet device configuration. - * - * Prepare the driver for a given number of TX and RX queues. + * DPDK callback for Ethernet device configuration. * * @param dev * Pointer to Ethernet device structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -dev_configure(struct rte_eth_dev *dev) +int +mlx5_dev_configure(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; unsigned int rxqs_n = dev->data->nb_rx_queues; @@ -524,60 +394,49 @@ dev_configure(struct rte_eth_dev *dev) unsigned int reta_idx_n; const uint8_t use_app_rss_key = !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; - uint64_t supp_tx_offloads = mlx5_priv_get_tx_port_offloads(priv); - uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; - uint64_t supp_rx_offloads = - (mlx5_priv_get_rx_port_offloads(priv) | - mlx5_priv_get_rx_queue_offloads(priv)); - uint64_t rx_offloads = dev->data->dev_conf.rxmode.offloads; - - if ((tx_offloads & supp_tx_offloads) != tx_offloads) { - ERROR("Some Tx offloads are not supported " - "requested 0x%" PRIx64 " supported 0x%" PRIx64, - tx_offloads, supp_tx_offloads); - return ENOTSUP; - } - if ((rx_offloads & supp_rx_offloads) != rx_offloads) { - ERROR("Some Rx offloads are not supported " - "requested 0x%" PRIx64 " supported 0x%" PRIx64, - rx_offloads, supp_rx_offloads); - return ENOTSUP; - } + int ret = 0; + if (use_app_rss_key && (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len != - rss_hash_default_key_len)) { - /* MLX5 RSS only support 40bytes key. */ - return EINVAL; + MLX5_RSS_HASH_KEY_LEN)) { + DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long", + dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN)); + rte_errno = EINVAL; + return -rte_errno; } priv->rss_conf.rss_key = rte_realloc(priv->rss_conf.rss_key, - rss_hash_default_key_len, 0); + MLX5_RSS_HASH_KEY_LEN, 0); if (!priv->rss_conf.rss_key) { - ERROR("cannot allocate RSS hash key memory (%u)", rxqs_n); - return ENOMEM; + DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)", + dev->data->port_id, rxqs_n); + rte_errno = ENOMEM; + return -rte_errno; } memcpy(priv->rss_conf.rss_key, use_app_rss_key ? dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key : rss_hash_default_key, - rss_hash_default_key_len); - priv->rss_conf.rss_key_len = rss_hash_default_key_len; + MLX5_RSS_HASH_KEY_LEN); + priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN; priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; priv->rxqs = (void *)dev->data->rx_queues; priv->txqs = (void *)dev->data->tx_queues; if (txqs_n != priv->txqs_n) { - INFO("%p: TX queues number update: %u -> %u", - (void *)dev, priv->txqs_n, txqs_n); + DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u", + dev->data->port_id, priv->txqs_n, txqs_n); priv->txqs_n = txqs_n; } if (rxqs_n > priv->config.ind_table_max_size) { - ERROR("cannot handle this many RX queues (%u)", rxqs_n); - return EINVAL; + DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)", + dev->data->port_id, rxqs_n); + rte_errno = EINVAL; + return -rte_errno; } if (rxqs_n == priv->rxqs_n) return 0; - INFO("%p: RX queues number update: %u -> %u", - (void *)dev, priv->rxqs_n, rxqs_n); + DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u", + dev->data->port_id, priv->rxqs_n, rxqs_n); priv->rxqs_n = rxqs_n; /* If the requested number of RX queues is not a power of two, use the * maximum indirection table size for better balancing. @@ -585,8 +444,9 @@ dev_configure(struct rte_eth_dev *dev) reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ? priv->config.ind_table_max_size : rxqs_n)); - if (priv_rss_reta_index_resize(priv, reta_idx_n)) - return ENOMEM; + ret = mlx5_rss_reta_index_resize(dev, reta_idx_n); + if (ret) + return ret; /* When the number of RX queues is not a power of two, the remaining * table entries are padded with reused WQs and hashes are not spread * uniformly. */ @@ -599,25 +459,42 @@ dev_configure(struct rte_eth_dev *dev) } /** - * DPDK callback for Ethernet device configuration. + * Sets default tuning parameters. * * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success, negative errno value on failure. + * Pointer to Ethernet device. + * @param[out] info + * Info structure output buffer. */ -int -mlx5_dev_configure(struct rte_eth_dev *dev) +static void +mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) { struct priv *priv = dev->data->dev_private; - int ret; - priv_lock(priv); - ret = dev_configure(dev); - assert(ret >= 0); - priv_unlock(priv); - return -ret; + /* Minimum CPU utilization. */ + info->default_rxportconf.ring_size = 256; + info->default_txportconf.ring_size = 256; + info->default_rxportconf.burst_size = 64; + info->default_txportconf.burst_size = 64; + if (priv->link_speed_capa & ETH_LINK_SPEED_100G) { + info->default_rxportconf.nb_queues = 16; + info->default_txportconf.nb_queues = 16; + if (dev->data->nb_rx_queues > 2 || + dev->data->nb_tx_queues > 2) { + /* Max Throughput. */ + info->default_rxportconf.ring_size = 2048; + info->default_txportconf.ring_size = 2048; + } + } else { + info->default_rxportconf.nb_queues = 8; + info->default_txportconf.nb_queues = 8; + if (dev->data->nb_rx_queues > 2 || + dev->data->nb_tx_queues > 2) { + /* Max Throughput. */ + info->default_rxportconf.ring_size = 4096; + info->default_txportconf.ring_size = 4096; + } + } } /** @@ -636,9 +513,6 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) unsigned int max; char ifname[IF_NAMESIZE]; - info->pci_dev = RTE_ETH_DEV_TO_PCI(dev); - - priv_lock(priv); /* FIXME: we should ask the device for these values. */ info->min_rx_bufsize = 32; info->max_rx_pktlen = 65536; @@ -653,22 +527,54 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) max = 65535; info->max_rx_queues = max; info->max_tx_queues = max; - info->max_mac_addrs = RTE_DIM(priv->mac); - info->rx_queue_offload_capa = - mlx5_priv_get_rx_queue_offloads(priv); - info->rx_offload_capa = (mlx5_priv_get_rx_port_offloads(priv) | + info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES; + info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev); + info->rx_offload_capa = (mlx5_get_rx_port_offloads() | info->rx_queue_offload_capa); - info->tx_offload_capa = mlx5_priv_get_tx_port_offloads(priv); - if (priv_get_ifname(priv, &ifname) == 0) + info->tx_offload_capa = mlx5_get_tx_port_offloads(dev); + if (mlx5_get_ifname(dev, &ifname) == 0) info->if_index = if_nametoindex(ifname); info->reta_size = priv->reta_idx_n ? priv->reta_idx_n : config->ind_table_max_size; - info->hash_key_size = priv->rss_conf.rss_key_len; + info->hash_key_size = MLX5_RSS_HASH_KEY_LEN; info->speed_capa = priv->link_speed_capa; info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK; - priv_unlock(priv); + mlx5_set_default_params(dev, info); + info->switch_info.name = dev->data->name; + info->switch_info.domain_id = priv->domain_id; + info->switch_info.port_id = priv->representor_id; + if (priv->representor) { + unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0); + uint16_t port_id[i]; + + i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i); + while (i--) { + struct priv *opriv = + rte_eth_devices[port_id[i]].data->dev_private; + + if (!opriv || + opriv->representor || + opriv->domain_id != priv->domain_id) + continue; + /* + * Override switch name with that of the master + * device. + */ + info->switch_info.name = opriv->dev_data->name; + break; + } + } } +/** + * Get supported packet types. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * A pointer to the supported Packet types array. + */ const uint32_t * mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) { @@ -691,6 +597,7 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) }; if (dev->rx_pkt_burst == mlx5_rx_burst || + dev->rx_pkt_burst == mlx5_rx_burst_mprq || dev->rx_pkt_burst == mlx5_rx_burst_vec) return ptypes; return NULL; @@ -701,11 +608,15 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) * * @param dev * Pointer to Ethernet device structure. - * @param wait_to_complete - * Wait for request completion (ignored). + * @param[out] link + * Storage for current link status. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) +mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, + struct rte_eth_link *link) { struct priv *priv = dev->data->dev_private; struct ethtool_cmd edata = { @@ -714,26 +625,28 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) struct ifreq ifr; struct rte_eth_link dev_link; int link_speed = 0; + int ret; - /* priv_lock() is not taken to allow concurrent calls. */ - - (void)wait_to_complete; - if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { - WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } memset(&dev_link, 0, sizeof(dev_link)); dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING)); ifr.ifr_data = (void *)&edata; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", - strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } link_speed = ethtool_cmd_speed(&edata); if (link_speed == -1) - dev_link.link_speed = 0; + dev_link.link_speed = ETH_SPEED_NUM_NONE; else dev_link.link_speed = link_speed; priv->link_speed_capa = 0; @@ -753,13 +666,13 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & ETH_LINK_SPEED_FIXED); - if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { - /* Link status changed. */ - dev->data->dev_link = dev_link; - return 0; + if ((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status)) { + rte_errno = EAGAIN; + return -rte_errno; } - /* Link status is still the same. */ - return -1; + *link = dev_link; + return 0; } /** @@ -767,31 +680,41 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete) * * @param dev * Pointer to Ethernet device structure. - * @param wait_to_complete - * Wait for request completion (ignored). + * @param[out] link + * Storage for current link status. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) +mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, + struct rte_eth_link *link) + { struct priv *priv = dev->data->dev_private; struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS }; struct ifreq ifr; struct rte_eth_link dev_link; uint64_t sc; + int ret; - (void)wait_to_complete; - if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { - WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1); + if (ret) { + DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } memset(&dev_link, 0, sizeof(dev_link)); dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING)); ifr.ifr_data = (void *)&gcmd; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", - strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + if (ret) { + DRV_LOG(DEBUG, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" + " failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords; @@ -802,10 +725,13 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) *ecmd = gcmd; ifr.ifr_data = (void *)ecmd; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", - strerror(errno)); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + if (ret) { + DRV_LOG(DEBUG, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" + " failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } dev_link.link_speed = ecmd->speed; sc = ecmd->link_mode_masks[0] | @@ -849,121 +775,13 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & ETH_LINK_SPEED_FIXED); - if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { - /* Link status changed. */ - dev->data->dev_link = dev_link; - return 0; - } - /* Link status is still the same. */ - return -1; -} - -/** - * Enable receiving and transmitting traffic. - * - * @param priv - * Pointer to private structure. - */ -static void -priv_link_start(struct priv *priv) -{ - struct rte_eth_dev *dev = priv->dev; - int err; - - dev->tx_pkt_burst = priv_select_tx_function(priv, dev); - dev->rx_pkt_burst = priv_select_rx_function(priv, dev); - err = priv_dev_traffic_enable(priv, dev); - if (err) - ERROR("%p: error occurred while configuring control flows: %s", - (void *)priv, strerror(err)); - err = priv_flow_start(priv, &priv->flows); - if (err) - ERROR("%p: error occurred while configuring flows: %s", - (void *)priv, strerror(err)); -} - -/** - * Disable receiving and transmitting traffic. - * - * @param priv - * Pointer to private structure. - */ -static void -priv_link_stop(struct priv *priv) -{ - struct rte_eth_dev *dev = priv->dev; - - priv_flow_stop(priv, &priv->flows); - priv_dev_traffic_disable(priv, dev); - dev->rx_pkt_burst = removed_rx_burst; - dev->tx_pkt_burst = removed_tx_burst; -} - -/** - * Retrieve physical link information and update rx/tx_pkt_burst callbacks - * accordingly. - * - * @param priv - * Pointer to private structure. - * @param wait_to_complete - * Wait for request completion (ignored). - */ -int -priv_link_update(struct priv *priv, int wait_to_complete) -{ - struct rte_eth_dev *dev = priv->dev; - struct utsname utsname; - int ver[3]; - int ret; - struct rte_eth_link dev_link = dev->data->dev_link; - - if (uname(&utsname) == -1 || - sscanf(utsname.release, "%d.%d.%d", - &ver[0], &ver[1], &ver[2]) != 3 || - KERNEL_VERSION(ver[0], ver[1], ver[2]) < KERNEL_VERSION(4, 9, 0)) - ret = mlx5_link_update_unlocked_gset(dev, wait_to_complete); - else - ret = mlx5_link_update_unlocked_gs(dev, wait_to_complete); - /* If lsc interrupt is disabled, should always be ready for traffic. */ - if (!dev->data->dev_conf.intr_conf.lsc) { - priv_link_start(priv); - return ret; - } - /* Re-select burst callbacks only if link status has been changed. */ - if (!ret && dev_link.link_status != dev->data->dev_link.link_status) { - if (dev->data->dev_link.link_status == ETH_LINK_UP) - priv_link_start(priv); - else - priv_link_stop(priv); - } - return ret; -} - -/** - * Querying the link status till it changes to the desired state. - * Number of query attempts is bounded by MLX5_MAX_LINK_QUERY_ATTEMPTS. - * - * @param priv - * Pointer to private structure. - * @param status - * Link desired status. - * - * @return - * 0 on success, negative errno value on failure. - */ -int -priv_force_link_status_change(struct priv *priv, int status) -{ - int try = 0; - - while (try < MLX5_MAX_LINK_QUERY_ATTEMPTS) { - priv_link_update(priv, 0); - if (priv->dev->data->dev_link.link_status == status) - return 0; - try++; - sleep(1); + if ((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status)) { + rte_errno = EAGAIN; + return -rte_errno; } - return -EAGAIN; + *link = dev_link; + return 0; } /** @@ -972,17 +790,42 @@ priv_force_link_status_change(struct priv *priv, int status) * @param dev * Pointer to Ethernet device structure. * @param wait_to_complete - * Wait for request completion (ignored). + * Wait for request completion. + * + * @return + * 0 if link status was not updated, positive if it was, a negative errno + * value otherwise and rte_errno is set. */ int mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) { - struct priv *priv = dev->data->dev_private; int ret; + struct rte_eth_link dev_link; + time_t start_time = time(NULL); - priv_lock(priv); - ret = priv_link_update(priv, wait_to_complete); - priv_unlock(priv); + do { + ret = mlx5_link_update_unlocked_gs(dev, &dev_link); + if (ret) + ret = mlx5_link_update_unlocked_gset(dev, &dev_link); + if (ret == 0) + break; + /* Handle wait to complete situation. */ + if (wait_to_complete && ret == -EAGAIN) { + if (abs((int)difftime(time(NULL), start_time)) < + MLX5_LINK_STATUS_TIMEOUT) { + usleep(0); + continue; + } else { + rte_errno = EBUSY; + return -rte_errno; + } + } else if (ret < 0) { + return ret; + } + } while (wait_to_complete); + ret = !!memcmp(&dev->data->dev_link, &dev_link, + sizeof(struct rte_eth_link)); + dev->data->dev_link = dev_link; return ret; } @@ -995,39 +838,33 @@ mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) * New MTU. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) { struct priv *priv = dev->data->dev_private; - uint16_t kern_mtu; - int ret = 0; + uint16_t kern_mtu = 0; + int ret; - priv_lock(priv); - ret = priv_get_mtu(priv, &kern_mtu); + ret = mlx5_get_mtu(dev, &kern_mtu); if (ret) - goto out; + return ret; /* Set kernel interface MTU first. */ - ret = priv_set_mtu(priv, mtu); + ret = mlx5_set_mtu(dev, mtu); if (ret) - goto out; - ret = priv_get_mtu(priv, &kern_mtu); + return ret; + ret = mlx5_get_mtu(dev, &kern_mtu); if (ret) - goto out; + return ret; if (kern_mtu == mtu) { priv->mtu = mtu; - DEBUG("adapter port %u MTU set to %u", priv->port, mtu); + DRV_LOG(DEBUG, "port %u adapter MTU set to %u", + dev->data->port_id, mtu); + return 0; } - priv_unlock(priv); - return 0; -out: - ret = errno; - WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, - strerror(ret)); - priv_unlock(priv); - assert(ret >= 0); - return -ret; + rte_errno = EAGAIN; + return -rte_errno; } /** @@ -1039,12 +876,11 @@ out: * Flow control output buffer. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) { - struct priv *priv = dev->data->dev_private; struct ifreq ifr; struct ethtool_pauseparam ethpause = { .cmd = ETHTOOL_GPAUSEPARAM @@ -1052,15 +888,14 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) int ret; ifr.ifr_data = (void *)ðpause; - priv_lock(priv); - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - ret = errno; - WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" - " failed: %s", - strerror(ret)); - goto out; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" + " %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } - fc_conf->autoneg = ethpause.autoneg; if (ethpause.rx_pause && ethpause.tx_pause) fc_conf->mode = RTE_FC_FULL; @@ -1070,12 +905,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) fc_conf->mode = RTE_FC_TX_PAUSE; else fc_conf->mode = RTE_FC_NONE; - ret = 0; - -out: - priv_unlock(priv); - assert(ret >= 0); - return -ret; + return 0; } /** @@ -1087,12 +917,11 @@ out: * Flow control parameters. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) { - struct priv *priv = dev->data->dev_private; struct ifreq ifr; struct ethtool_pauseparam ethpause = { .cmd = ETHTOOL_SPAUSEPARAM @@ -1112,21 +941,15 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) ethpause.tx_pause = 1; else ethpause.tx_pause = 0; - - priv_lock(priv); - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - ret = errno; - WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" - " failed: %s", - strerror(ret)); - goto out; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0); + if (ret) { + DRV_LOG(WARNING, + "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" + " failed: %s", + dev->data->port_id, strerror(rte_errno)); + return ret; } - ret = 0; - -out: - priv_unlock(priv); - assert(ret >= 0); - return -ret; + return 0; } /** @@ -1138,7 +961,7 @@ out: * PCI bus address output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, @@ -1149,8 +972,10 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, MKSTR(path, "%s/device/uevent", device->ibdev_path); file = fopen(path, "rb"); - if (file == NULL) - return -1; + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } while (fgets(line, sizeof(line), file) == line) { size_t len = strlen(line); int ret; @@ -1180,46 +1005,10 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, } /** - * Update the link status. - * - * @param priv - * Pointer to private structure. - * - * @return - * Zero if the callback process can be called immediately. - */ -static int -priv_link_status_update(struct priv *priv) -{ - struct rte_eth_link *link = &priv->dev->data->dev_link; - - priv_link_update(priv, 0); - if (((link->link_speed == 0) && link->link_status) || - ((link->link_speed != 0) && !link->link_status)) { - /* - * Inconsistent status. Event likely occurred before the - * kernel netdevice exposes the new status. - */ - if (!priv->pending_alarm) { - priv->pending_alarm = 1; - rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US, - mlx5_dev_link_status_handler, - priv->dev); - } - return 1; - } else if (unlikely(priv->pending_alarm)) { - /* Link interrupt occurred while alarm is already scheduled. */ - priv->pending_alarm = 0; - rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev); - } - return 0; -} - -/** * Device status handler. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param events * Pointer to event flags holder. * @@ -1227,60 +1016,37 @@ priv_link_status_update(struct priv *priv) * Events bitmap of callback process which can be called immediately. */ static uint32_t -priv_dev_status_handler(struct priv *priv) +mlx5_dev_status_handler(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct ibv_async_event event; uint32_t ret = 0; + if (mlx5_link_update(dev, 0) == -EAGAIN) { + usleep(0); + return 0; + } /* Read all message and acknowledge them. */ for (;;) { if (mlx5_glue->get_async_event(priv->ctx, &event)) break; if ((event.event_type == IBV_EVENT_PORT_ACTIVE || event.event_type == IBV_EVENT_PORT_ERR) && - (priv->dev->data->dev_conf.intr_conf.lsc == 1)) + (dev->data->dev_conf.intr_conf.lsc == 1)) ret |= (1 << RTE_ETH_EVENT_INTR_LSC); else if (event.event_type == IBV_EVENT_DEVICE_FATAL && - priv->dev->data->dev_conf.intr_conf.rmv == 1) + dev->data->dev_conf.intr_conf.rmv == 1) ret |= (1 << RTE_ETH_EVENT_INTR_RMV); else - DEBUG("event type %d on port %d not handled", - event.event_type, event.element.port_num); + DRV_LOG(DEBUG, + "port %u event type %d on not handled", + dev->data->port_id, event.event_type); mlx5_glue->ack_async_event(&event); } - if (ret & (1 << RTE_ETH_EVENT_INTR_LSC)) - if (priv_link_status_update(priv)) - ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC); return ret; } /** - * Handle delayed link status event. - * - * @param arg - * Registered argument. - */ -void -mlx5_dev_link_status_handler(void *arg) -{ - struct rte_eth_dev *dev = arg; - struct priv *priv = dev->data->dev_private; - int ret; - - while (!priv_trylock(priv)) { - /* Alarm is being canceled. */ - if (priv->pending_alarm == 0) - return; - rte_pause(); - } - priv->pending_alarm = 0; - ret = priv_link_status_update(priv); - priv_unlock(priv); - if (!ret) - _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); -} - -/** * Handle interrupts from the NIC. * * @param[in] intr_handle @@ -1292,12 +1058,9 @@ void mlx5_dev_interrupt_handler(void *cb_arg) { struct rte_eth_dev *dev = cb_arg; - struct priv *priv = dev->data->dev_private; uint32_t events; - priv_lock(priv); - events = priv_dev_status_handler(priv); - priv_unlock(priv); + events = mlx5_dev_status_handler(dev); if (events & (1 << RTE_ETH_EVENT_INTR_LSC)) _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); if (events & (1 << RTE_ETH_EVENT_INTR_RMV)) @@ -1314,24 +1077,21 @@ static void mlx5_dev_handler_socket(void *cb_arg) { struct rte_eth_dev *dev = cb_arg; - struct priv *priv = dev->data->dev_private; - priv_lock(priv); - priv_socket_handle(priv); - priv_unlock(priv); + mlx5_socket_handle(dev); } /** * Uninstall interrupt handler. * - * @param priv - * Pointer to private structure. * @param dev - * Pointer to the rte_eth_dev structure. + * Pointer to Ethernet device. */ void -priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) +mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; + if (dev->data->dev_conf.intr_conf.lsc || dev->data->dev_conf.intr_conf.rmv) rte_intr_callback_unregister(&priv->intr_handle, @@ -1339,10 +1099,6 @@ priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) if (priv->primary_socket) rte_intr_callback_unregister(&priv->intr_handle_socket, mlx5_dev_handler_socket, dev); - if (priv->pending_alarm) { - priv->pending_alarm = 0; - rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev); - } priv->intr_handle.fd = 0; priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; priv->intr_handle_socket.fd = 0; @@ -1352,21 +1108,24 @@ priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) /** * Install interrupt handler. * - * @param priv - * Pointer to private structure. * @param dev - * Pointer to the rte_eth_dev structure. + * Pointer to Ethernet device. */ void -priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) +mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev) { - int rc, flags; + struct priv *priv = dev->data->dev_private; + int ret; + int flags; assert(priv->ctx->async_fd > 0); flags = fcntl(priv->ctx->async_fd, F_GETFL); - rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); - if (rc < 0) { - INFO("failed to change file descriptor async event queue"); + ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + DRV_LOG(INFO, + "port %u failed to change file descriptor async event" + " queue", + dev->data->port_id); dev->data->dev_conf.intr_conf.lsc = 0; dev->data->dev_conf.intr_conf.rmv = 0; } @@ -1377,9 +1136,11 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) rte_intr_callback_register(&priv->intr_handle, mlx5_dev_interrupt_handler, dev); } - - rc = priv_socket_init(priv); - if (!rc && priv->primary_socket) { + ret = mlx5_socket_init(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot initialise socket: %s", + dev->data->port_id, strerror(rte_errno)); + else if (priv->primary_socket) { priv->intr_handle_socket.fd = priv->primary_socket; priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT; rte_intr_callback_register(&priv->intr_handle_socket, @@ -1388,41 +1149,18 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) } /** - * Change the link state (UP / DOWN). - * - * @param priv - * Pointer to private data structure. - * @param up - * Nonzero for link up, otherwise link down. - * - * @return - * 0 on success, errno value on failure. - */ -static int -priv_dev_set_link(struct priv *priv, int up) -{ - return priv_set_flags(priv, ~IFF_UP, up ? IFF_UP : ~IFF_UP); -} - -/** * DPDK callback to bring the link DOWN. * * @param dev * Pointer to Ethernet device structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_set_link_down(struct rte_eth_dev *dev) { - struct priv *priv = dev->data->dev_private; - int err; - - priv_lock(priv); - err = priv_dev_set_link(priv, 0); - priv_unlock(priv); - return err; + return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP); } /** @@ -1432,63 +1170,68 @@ mlx5_set_link_down(struct rte_eth_dev *dev) * Pointer to Ethernet device structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_set_link_up(struct rte_eth_dev *dev) { - struct priv *priv = dev->data->dev_private; - int err; - - priv_lock(priv); - err = priv_dev_set_link(priv, 1); - priv_unlock(priv); - return err; + return mlx5_set_flags(dev, ~IFF_UP, IFF_UP); } /** * Configure the TX function to use. * - * @param priv - * Pointer to private data structure. * @param dev - * Pointer to rte_eth_dev structure. + * Pointer to private data structure. * * @return * Pointer to selected Tx burst function. */ eth_tx_burst_t -priv_select_tx_function(struct priv *priv, struct rte_eth_dev *dev) +mlx5_select_tx_function(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst; struct mlx5_dev_config *config = &priv->config; uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads; int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO | - DEV_TX_OFFLOAD_GRE_TNL_TSO)); + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO)); + int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM)); int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT); assert(priv != NULL); /* Select appropriate TX function. */ - if (vlan_insert || tso) + if (vlan_insert || tso || swp) return tx_pkt_burst; if (config->mps == MLX5_MPW_ENHANCED) { - if (priv_check_vec_tx_support(priv, dev) > 0) { - if (priv_check_raw_vec_tx_support(priv, dev) > 0) + if (mlx5_check_vec_tx_support(dev) > 0) { + if (mlx5_check_raw_vec_tx_support(dev) > 0) tx_pkt_burst = mlx5_tx_burst_raw_vec; else tx_pkt_burst = mlx5_tx_burst_vec; - DEBUG("selected Enhanced MPW TX vectorized function"); + DRV_LOG(DEBUG, + "port %u selected enhanced MPW Tx vectorized" + " function", + dev->data->port_id); } else { tx_pkt_burst = mlx5_tx_burst_empw; - DEBUG("selected Enhanced MPW TX function"); + DRV_LOG(DEBUG, + "port %u selected enhanced MPW Tx function", + dev->data->port_id); } } else if (config->mps && (config->txq_inline > 0)) { tx_pkt_burst = mlx5_tx_burst_mpw_inline; - DEBUG("selected MPW inline TX function"); + DRV_LOG(DEBUG, "port %u selected MPW inline Tx function", + dev->data->port_id); } else if (config->mps) { tx_pkt_burst = mlx5_tx_burst_mpw; - DEBUG("selected MPW TX function"); + DRV_LOG(DEBUG, "port %u selected MPW Tx function", + dev->data->port_id); } return tx_pkt_burst; } @@ -1496,23 +1239,24 @@ priv_select_tx_function(struct priv *priv, struct rte_eth_dev *dev) /** * Configure the RX function to use. * - * @param priv - * Pointer to private data structure. * @param dev - * Pointer to rte_eth_dev structure. + * Pointer to private data structure. * * @return * Pointer to selected Rx burst function. */ eth_rx_burst_t -priv_select_rx_function(struct priv *priv, __rte_unused struct rte_eth_dev *dev) +mlx5_select_rx_function(struct rte_eth_dev *dev) { eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; - assert(priv != NULL); - if (priv_check_vec_rx_support(priv) > 0) { + assert(dev != NULL); + if (mlx5_check_vec_rx_support(dev) > 0) { rx_pkt_burst = mlx5_rx_burst_vec; - DEBUG("selected RX vectorized function"); + DRV_LOG(DEBUG, "port %u selected Rx vectorized function", + dev->data->port_id); + } else if (mlx5_mprq_enabled(dev)) { + rx_pkt_burst = mlx5_rx_burst_mprq; } return rx_pkt_burst; } @@ -1536,3 +1280,93 @@ mlx5_is_removed(struct rte_eth_dev *dev) return 1; return 0; } + +/** + * Get port ID list of mlx5 instances sharing a common device. + * + * @param[in] dev + * Device to look for. + * @param[out] port_list + * Result buffer for collected port IDs. + * @param port_list_n + * Maximum number of entries in result buffer. If 0, @p port_list can be + * NULL. + * + * @return + * Number of matching instances regardless of the @p port_list_n + * parameter, 0 if none were found. + */ +unsigned int +mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list, + unsigned int port_list_n) +{ + uint16_t id; + unsigned int n = 0; + + RTE_ETH_FOREACH_DEV(id) { + struct rte_eth_dev *ldev = &rte_eth_devices[id]; + + if (!ldev->device || + !ldev->device->driver || + strcmp(ldev->device->driver->name, MLX5_DRIVER_NAME) || + ldev->device != dev) + continue; + if (n < port_list_n) + port_list[n] = id; + n++; + } + return n; +} + +/** + * Get switch information associated with network interface. + * + * @param ifindex + * Network interface index. + * @param[out] info + * Switch information object, populated in case of success. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info) +{ + char ifname[IF_NAMESIZE]; + FILE *file; + struct mlx5_switch_info data = { .master = 0, }; + bool port_name_set = false; + bool port_switch_id_set = false; + char c; + + if (!if_indextoname(ifindex, ifname)) { + rte_errno = errno; + return -rte_errno; + } + + MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name", + ifname); + MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id", + ifname); + + file = fopen(phys_port_name, "rb"); + if (file != NULL) { + port_name_set = + fscanf(file, "%d%c", &data.port_name, &c) == 2 && + c == '\n'; + fclose(file); + } + file = fopen(phys_switch_id, "rb"); + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } + port_switch_id_set = + fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 && + c == '\n'; + fclose(file); + data.master = port_switch_id_set && !port_name_set; + data.representor = port_switch_id_set && port_name_set; + *info = data; + return 0; +} diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index 26002c4b..ca4625b6 100644 --- a/drivers/net/mlx5/mlx5_flow.c +++ b/drivers/net/mlx5/mlx5_flow.c @@ -1,9 +1,11 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2016 6WIND S.A. - * Copyright 2016 Mellanox. + * Copyright 2016 Mellanox Technologies, Ltd */ #include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> #include <string.h> /* Verbs header. */ @@ -16,6 +18,9 @@ #pragma GCC diagnostic error "-Wpedantic" #endif +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_eth_ctrl.h> #include <rte_ethdev_driver.h> #include <rte_flow.h> #include <rte_flow_driver.h> @@ -27,388 +32,289 @@ #include "mlx5_prm.h" #include "mlx5_glue.h" -/* Define minimal priority for control plane flows. */ -#define MLX5_CTRL_FLOW_PRIORITY 4 - -/* Internet Protocol versions. */ -#define MLX5_IPV4 4 -#define MLX5_IPV6 6 - -#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT -struct ibv_flow_spec_counter_action { - int dummy; -}; -#endif - /* Dev ops structure defined in mlx5.c */ extern const struct eth_dev_ops mlx5_dev_ops; extern const struct eth_dev_ops mlx5_dev_ops_isolate; -static int -mlx5_flow_create_eth(const struct rte_flow_item *item, - const void *default_mask, - void *data); - -static int -mlx5_flow_create_vlan(const struct rte_flow_item *item, - const void *default_mask, - void *data); - -static int -mlx5_flow_create_ipv4(const struct rte_flow_item *item, - const void *default_mask, - void *data); - -static int -mlx5_flow_create_ipv6(const struct rte_flow_item *item, - const void *default_mask, - void *data); - -static int -mlx5_flow_create_udp(const struct rte_flow_item *item, - const void *default_mask, - void *data); - -static int -mlx5_flow_create_tcp(const struct rte_flow_item *item, - const void *default_mask, - void *data); - -static int -mlx5_flow_create_vxlan(const struct rte_flow_item *item, - const void *default_mask, - void *data); - -struct mlx5_flow_parse; - -static void -mlx5_flow_create_copy(struct mlx5_flow_parse *parser, void *src, - unsigned int size); - -static int -mlx5_flow_create_flag_mark(struct mlx5_flow_parse *parser, uint32_t mark_id); - -static int -mlx5_flow_create_count(struct priv *priv, struct mlx5_flow_parse *parser); - -/* Hash RX queue types. */ -enum hash_rxq_type { - HASH_RXQ_TCPV4, - HASH_RXQ_UDPV4, - HASH_RXQ_IPV4, - HASH_RXQ_TCPV6, - HASH_RXQ_UDPV6, - HASH_RXQ_IPV6, - HASH_RXQ_ETH, -}; - -/* Initialization data for hash RX queue. */ -struct hash_rxq_init { - uint64_t hash_fields; /* Fields that participate in the hash. */ - uint64_t dpdk_rss_hf; /* Matching DPDK RSS hash fields. */ - unsigned int flow_priority; /* Flow priority to use. */ - unsigned int ip_version; /* Internet protocol. */ +/* Pattern outer Layer bits. */ +#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0) +#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1) +#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2) +#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3) +#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4) +#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5) + +/* Pattern inner Layer bits. */ +#define MLX5_FLOW_LAYER_INNER_L2 (1u << 6) +#define MLX5_FLOW_LAYER_INNER_L3_IPV4 (1u << 7) +#define MLX5_FLOW_LAYER_INNER_L3_IPV6 (1u << 8) +#define MLX5_FLOW_LAYER_INNER_L4_UDP (1u << 9) +#define MLX5_FLOW_LAYER_INNER_L4_TCP (1u << 10) +#define MLX5_FLOW_LAYER_INNER_VLAN (1u << 11) + +/* Pattern tunnel Layer bits. */ +#define MLX5_FLOW_LAYER_VXLAN (1u << 12) +#define MLX5_FLOW_LAYER_VXLAN_GPE (1u << 13) +#define MLX5_FLOW_LAYER_GRE (1u << 14) +#define MLX5_FLOW_LAYER_MPLS (1u << 15) + +/* Outer Masks. */ +#define MLX5_FLOW_LAYER_OUTER_L3 \ + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6) +#define MLX5_FLOW_LAYER_OUTER_L4 \ + (MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP) +#define MLX5_FLOW_LAYER_OUTER \ + (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3 | \ + MLX5_FLOW_LAYER_OUTER_L4) + +/* Tunnel Masks. */ +#define MLX5_FLOW_LAYER_TUNNEL \ + (MLX5_FLOW_LAYER_VXLAN | MLX5_FLOW_LAYER_VXLAN_GPE | \ + MLX5_FLOW_LAYER_GRE | MLX5_FLOW_LAYER_MPLS) + +/* Inner Masks. */ +#define MLX5_FLOW_LAYER_INNER_L3 \ + (MLX5_FLOW_LAYER_INNER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV6) +#define MLX5_FLOW_LAYER_INNER_L4 \ + (MLX5_FLOW_LAYER_INNER_L4_UDP | MLX5_FLOW_LAYER_INNER_L4_TCP) +#define MLX5_FLOW_LAYER_INNER \ + (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3 | \ + MLX5_FLOW_LAYER_INNER_L4) + +/* Actions that modify the fate of matching traffic. */ +#define MLX5_FLOW_FATE_DROP (1u << 0) +#define MLX5_FLOW_FATE_QUEUE (1u << 1) +#define MLX5_FLOW_FATE_RSS (1u << 2) + +/* Modify a packet. */ +#define MLX5_FLOW_MOD_FLAG (1u << 0) +#define MLX5_FLOW_MOD_MARK (1u << 1) +#define MLX5_FLOW_MOD_COUNT (1u << 2) + +/* possible L3 layers protocols filtering. */ +#define MLX5_IP_PROTOCOL_TCP 6 +#define MLX5_IP_PROTOCOL_UDP 17 +#define MLX5_IP_PROTOCOL_GRE 47 +#define MLX5_IP_PROTOCOL_MPLS 147 + +/* Priority reserved for default flows. */ +#define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1) + +enum mlx5_expansion { + MLX5_EXPANSION_ROOT, + MLX5_EXPANSION_ROOT_OUTER, + MLX5_EXPANSION_ROOT_ETH_VLAN, + MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN, + MLX5_EXPANSION_OUTER_ETH, + MLX5_EXPANSION_OUTER_ETH_VLAN, + MLX5_EXPANSION_OUTER_VLAN, + MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV4_UDP, + MLX5_EXPANSION_OUTER_IPV4_TCP, + MLX5_EXPANSION_OUTER_IPV6, + MLX5_EXPANSION_OUTER_IPV6_UDP, + MLX5_EXPANSION_OUTER_IPV6_TCP, + MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE, + MLX5_EXPANSION_GRE, + MLX5_EXPANSION_MPLS, + MLX5_EXPANSION_ETH, + MLX5_EXPANSION_ETH_VLAN, + MLX5_EXPANSION_VLAN, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV4_UDP, + MLX5_EXPANSION_IPV4_TCP, + MLX5_EXPANSION_IPV6, + MLX5_EXPANSION_IPV6_UDP, + MLX5_EXPANSION_IPV6_TCP, }; -/* Initialization data for hash RX queues. */ -const struct hash_rxq_init hash_rxq_init[] = { - [HASH_RXQ_TCPV4] = { - .hash_fields = (IBV_RX_HASH_SRC_IPV4 | - IBV_RX_HASH_DST_IPV4 | - IBV_RX_HASH_SRC_PORT_TCP | - IBV_RX_HASH_DST_PORT_TCP), - .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP, - .flow_priority = 0, - .ip_version = MLX5_IPV4, +/** Supported expansion of items. */ +static const struct rte_flow_expand_node mlx5_support_expansion[] = { + [MLX5_EXPANSION_ROOT] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_END, }, - [HASH_RXQ_UDPV4] = { - .hash_fields = (IBV_RX_HASH_SRC_IPV4 | - IBV_RX_HASH_DST_IPV4 | - IBV_RX_HASH_SRC_PORT_UDP | - IBV_RX_HASH_DST_PORT_UDP), - .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP, - .flow_priority = 0, - .ip_version = MLX5_IPV4, + [MLX5_EXPANSION_ROOT_OUTER] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH, + MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6), + .type = RTE_FLOW_ITEM_TYPE_END, }, - [HASH_RXQ_IPV4] = { - .hash_fields = (IBV_RX_HASH_SRC_IPV4 | - IBV_RX_HASH_DST_IPV4), - .dpdk_rss_hf = (ETH_RSS_IPV4 | - ETH_RSS_FRAG_IPV4), - .flow_priority = 1, - .ip_version = MLX5_IPV4, + [MLX5_EXPANSION_ROOT_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH_VLAN), + .type = RTE_FLOW_ITEM_TYPE_END, }, - [HASH_RXQ_TCPV6] = { - .hash_fields = (IBV_RX_HASH_SRC_IPV6 | - IBV_RX_HASH_DST_IPV6 | - IBV_RX_HASH_SRC_PORT_TCP | - IBV_RX_HASH_DST_PORT_TCP), - .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP, - .flow_priority = 0, - .ip_version = MLX5_IPV6, + [MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_ETH_VLAN), + .type = RTE_FLOW_ITEM_TYPE_END, }, - [HASH_RXQ_UDPV6] = { - .hash_fields = (IBV_RX_HASH_SRC_IPV6 | - IBV_RX_HASH_DST_IPV6 | - IBV_RX_HASH_SRC_PORT_UDP | - IBV_RX_HASH_DST_PORT_UDP), - .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP, - .flow_priority = 0, - .ip_version = MLX5_IPV6, + [MLX5_EXPANSION_OUTER_ETH] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6, + MLX5_EXPANSION_MPLS), + .type = RTE_FLOW_ITEM_TYPE_ETH, + .rss_types = 0, }, - [HASH_RXQ_IPV6] = { - .hash_fields = (IBV_RX_HASH_SRC_IPV6 | - IBV_RX_HASH_DST_IPV6), - .dpdk_rss_hf = (ETH_RSS_IPV6 | - ETH_RSS_FRAG_IPV6), - .flow_priority = 1, - .ip_version = MLX5_IPV6, + [MLX5_EXPANSION_OUTER_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_VLAN), + .type = RTE_FLOW_ITEM_TYPE_ETH, + .rss_types = 0, }, - [HASH_RXQ_ETH] = { - .hash_fields = 0, - .dpdk_rss_hf = 0, - .flow_priority = 2, + [MLX5_EXPANSION_OUTER_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4, + MLX5_EXPANSION_OUTER_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VLAN, }, -}; - -/* Number of entries in hash_rxq_init[]. */ -const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init); - -/** Structure for holding counter stats. */ -struct mlx5_flow_counter_stats { - uint64_t hits; /**< Number of packets matched by the rule. */ - uint64_t bytes; /**< Number of bytes matched by the rule. */ -}; - -/** Structure for Drop queue. */ -struct mlx5_hrxq_drop { - struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */ - struct ibv_qp *qp; /**< Verbs queue pair. */ - struct ibv_wq *wq; /**< Verbs work queue. */ - struct ibv_cq *cq; /**< Verbs completion queue. */ -}; - -/* Flows structures. */ -struct mlx5_flow { - uint64_t hash_fields; /**< Fields that participate in the hash. */ - struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */ - struct ibv_flow *ibv_flow; /**< Verbs flow. */ - struct mlx5_hrxq *hrxq; /**< Hash Rx queues. */ -}; - -/* Drop flows structures. */ -struct mlx5_flow_drop { - struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */ - struct ibv_flow *ibv_flow; /**< Verbs flow. */ -}; - -struct rte_flow { - TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */ - uint32_t mark:1; /**< Set if the flow is marked. */ - uint32_t drop:1; /**< Drop queue. */ - uint16_t queues_n; /**< Number of entries in queue[]. */ - uint16_t (*queues)[]; /**< Queues indexes to use. */ - struct rte_eth_rss_conf rss_conf; /**< RSS configuration */ - uint8_t rss_key[40]; /**< copy of the RSS key. */ - struct ibv_counter_set *cs; /**< Holds the counters for the rule. */ - struct mlx5_flow_counter_stats counter_stats;/**<The counter stats. */ - struct mlx5_flow frxq[RTE_DIM(hash_rxq_init)]; - /**< Flow with Rx queue. */ -}; - -/** Static initializer for items. */ -#define ITEMS(...) \ - (const enum rte_flow_item_type []){ \ - __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \ - } - -/** Structure to generate a simple graph of layers supported by the NIC. */ -struct mlx5_flow_items { - /** List of possible actions for these items. */ - const enum rte_flow_action_type *const actions; - /** Bit-masks corresponding to the possibilities for the item. */ - const void *mask; - /** - * Default bit-masks to use when item->mask is not provided. When - * \default_mask is also NULL, the full supported bit-mask (\mask) is - * used instead. - */ - const void *default_mask; - /** Bit-masks size in bytes. */ - const unsigned int mask_sz; - /** - * Conversion function from rte_flow to NIC specific flow. - * - * @param item - * rte_flow item to convert. - * @param default_mask - * Default bit-masks to use when item->mask is not provided. - * @param data - * Internal structure to store the conversion. - * - * @return - * 0 on success, negative value otherwise. - */ - int (*convert)(const struct rte_flow_item *item, - const void *default_mask, - void *data); - /** Size in bytes of the destination structure. */ - const unsigned int dst_sz; - /** List of possible following items. */ - const enum rte_flow_item_type *const items; -}; - -/** Valid action for this PMD. */ -static const enum rte_flow_action_type valid_actions[] = { - RTE_FLOW_ACTION_TYPE_DROP, - RTE_FLOW_ACTION_TYPE_QUEUE, - RTE_FLOW_ACTION_TYPE_MARK, - RTE_FLOW_ACTION_TYPE_FLAG, -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - RTE_FLOW_ACTION_TYPE_COUNT, -#endif - RTE_FLOW_ACTION_TYPE_END, -}; - -/** Graph of supported items and associated actions. */ -static const struct mlx5_flow_items mlx5_flow_items[] = { - [RTE_FLOW_ITEM_TYPE_END] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH, - RTE_FLOW_ITEM_TYPE_VXLAN), + [MLX5_EXPANSION_OUTER_IPV4] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT + (MLX5_EXPANSION_OUTER_IPV4_UDP, + MLX5_EXPANSION_OUTER_IPV4_TCP, + MLX5_EXPANSION_GRE), + .type = RTE_FLOW_ITEM_TYPE_IPV4, + .rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | + ETH_RSS_NONFRAG_IPV4_OTHER, }, - [RTE_FLOW_ITEM_TYPE_ETH] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_VLAN, - RTE_FLOW_ITEM_TYPE_IPV4, - RTE_FLOW_ITEM_TYPE_IPV6), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_eth){ - .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", - .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", - .type = -1, - }, - .default_mask = &rte_flow_item_eth_mask, - .mask_sz = sizeof(struct rte_flow_item_eth), - .convert = mlx5_flow_create_eth, - .dst_sz = sizeof(struct ibv_flow_spec_eth), + [MLX5_EXPANSION_OUTER_IPV4_UDP] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE), + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV4_UDP, }, - [RTE_FLOW_ITEM_TYPE_VLAN] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4, - RTE_FLOW_ITEM_TYPE_IPV6), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_vlan){ - .tci = -1, - }, - .default_mask = &rte_flow_item_vlan_mask, - .mask_sz = sizeof(struct rte_flow_item_vlan), - .convert = mlx5_flow_create_vlan, - .dst_sz = 0, + [MLX5_EXPANSION_OUTER_IPV4_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV4_TCP, }, - [RTE_FLOW_ITEM_TYPE_IPV4] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, - RTE_FLOW_ITEM_TYPE_TCP), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_ipv4){ - .hdr = { - .src_addr = -1, - .dst_addr = -1, - .type_of_service = -1, - .next_proto_id = -1, - }, - }, - .default_mask = &rte_flow_item_ipv4_mask, - .mask_sz = sizeof(struct rte_flow_item_ipv4), - .convert = mlx5_flow_create_ipv4, - .dst_sz = sizeof(struct ibv_flow_spec_ipv4_ext), + [MLX5_EXPANSION_OUTER_IPV6] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT + (MLX5_EXPANSION_OUTER_IPV6_UDP, + MLX5_EXPANSION_OUTER_IPV6_TCP), + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | + ETH_RSS_NONFRAG_IPV6_OTHER, }, - [RTE_FLOW_ITEM_TYPE_IPV6] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, - RTE_FLOW_ITEM_TYPE_TCP), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_ipv6){ - .hdr = { - .src_addr = { - 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, - }, - .dst_addr = { - 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, - }, - .vtc_flow = -1, - .proto = -1, - .hop_limits = -1, - }, - }, - .default_mask = &rte_flow_item_ipv6_mask, - .mask_sz = sizeof(struct rte_flow_item_ipv6), - .convert = mlx5_flow_create_ipv6, - .dst_sz = sizeof(struct ibv_flow_spec_ipv6), + [MLX5_EXPANSION_OUTER_IPV6_UDP] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN, + MLX5_EXPANSION_VXLAN_GPE), + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV6_UDP, }, - [RTE_FLOW_ITEM_TYPE_UDP] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_VXLAN), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_udp){ - .hdr = { - .src_port = -1, - .dst_port = -1, - }, - }, - .default_mask = &rte_flow_item_udp_mask, - .mask_sz = sizeof(struct rte_flow_item_udp), - .convert = mlx5_flow_create_udp, - .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), + [MLX5_EXPANSION_OUTER_IPV6_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV6_TCP, }, - [RTE_FLOW_ITEM_TYPE_TCP] = { - .actions = valid_actions, - .mask = &(const struct rte_flow_item_tcp){ - .hdr = { - .src_port = -1, - .dst_port = -1, - }, - }, - .default_mask = &rte_flow_item_tcp_mask, - .mask_sz = sizeof(struct rte_flow_item_tcp), - .convert = mlx5_flow_create_tcp, - .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), + [MLX5_EXPANSION_VXLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH), + .type = RTE_FLOW_ITEM_TYPE_VXLAN, }, - [RTE_FLOW_ITEM_TYPE_VXLAN] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_vxlan){ - .vni = "\xff\xff\xff", - }, - .default_mask = &rte_flow_item_vxlan_mask, - .mask_sz = sizeof(struct rte_flow_item_vxlan), - .convert = mlx5_flow_create_vxlan, - .dst_sz = sizeof(struct ibv_flow_spec_tunnel), + [MLX5_EXPANSION_VXLAN_GPE] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, + MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VXLAN_GPE, + }, + [MLX5_EXPANSION_GRE] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4), + .type = RTE_FLOW_ITEM_TYPE_GRE, + }, + [MLX5_EXPANSION_MPLS] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_MPLS, + }, + [MLX5_EXPANSION_ETH] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_ETH, + }, + [MLX5_EXPANSION_ETH_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VLAN), + .type = RTE_FLOW_ITEM_TYPE_ETH, + }, + [MLX5_EXPANSION_VLAN] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4, + MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VLAN, + }, + [MLX5_EXPANSION_IPV4] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4_UDP, + MLX5_EXPANSION_IPV4_TCP), + .type = RTE_FLOW_ITEM_TYPE_IPV4, + .rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | + ETH_RSS_NONFRAG_IPV4_OTHER, + }, + [MLX5_EXPANSION_IPV4_UDP] = { + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV4_UDP, + }, + [MLX5_EXPANSION_IPV4_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV4_TCP, + }, + [MLX5_EXPANSION_IPV6] = { + .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV6_UDP, + MLX5_EXPANSION_IPV6_TCP), + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | + ETH_RSS_NONFRAG_IPV6_OTHER, + }, + [MLX5_EXPANSION_IPV6_UDP] = { + .type = RTE_FLOW_ITEM_TYPE_UDP, + .rss_types = ETH_RSS_NONFRAG_IPV6_UDP, + }, + [MLX5_EXPANSION_IPV6_TCP] = { + .type = RTE_FLOW_ITEM_TYPE_TCP, + .rss_types = ETH_RSS_NONFRAG_IPV6_TCP, }, }; -/** Structure to pass to the conversion function. */ -struct mlx5_flow_parse { - uint32_t inner; /**< Set once VXLAN is encountered. */ - uint32_t create:1; - /**< Whether resources should remain after a validate. */ - uint32_t drop:1; /**< Target is a drop queue. */ - uint32_t mark:1; /**< Mark is present in the flow. */ - uint32_t count:1; /**< Count is present in the flow. */ - uint32_t mark_id; /**< Mark identifier. */ - uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */ - uint16_t queues_n; /**< Number of entries in queue[]. */ - struct rte_eth_rss_conf rss_conf; /**< RSS configuration */ - uint8_t rss_key[40]; /**< copy of the RSS key. */ - enum hash_rxq_type layer; /**< Last pattern layer detected. */ - struct ibv_counter_set *cs; /**< Holds the counter set for the rule */ +/** Handles information leading to a drop fate. */ +struct mlx5_flow_verbs { + LIST_ENTRY(mlx5_flow_verbs) next; + unsigned int size; /**< Size of the attribute. */ struct { - struct ibv_flow_attr *ibv_attr; - /**< Pointer to Verbs attributes. */ - unsigned int offset; - /**< Current position or total size of the attribute. */ - } queue[RTE_DIM(hash_rxq_init)]; + struct ibv_flow_attr *attr; + /**< Pointer to the Specification buffer. */ + uint8_t *specs; /**< Pointer to the specifications. */ + }; + struct ibv_flow *flow; /**< Verbs flow pointer. */ + struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */ + uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */ +}; + +/* Counters information. */ +struct mlx5_flow_counter { + LIST_ENTRY(mlx5_flow_counter) next; /**< Pointer to the next counter. */ + uint32_t shared:1; /**< Share counter ID with other flow rules. */ + uint32_t ref_cnt:31; /**< Reference counter. */ + uint32_t id; /**< Counter ID. */ + struct ibv_counter_set *cs; /**< Holds the counters for the rule. */ + uint64_t hits; /**< Number of packets matched by the rule. */ + uint64_t bytes; /**< Number of bytes matched by the rule. */ +}; + +/* Flow structure. */ +struct rte_flow { + TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */ + struct rte_flow_attr attributes; /**< User flow attribute. */ + uint32_t l3_protocol_en:1; /**< Protocol filtering requested. */ + uint32_t layers; + /**< Bit-fields of present layers see MLX5_FLOW_LAYER_*. */ + uint32_t modifier; + /**< Bit-fields of present modifier see MLX5_FLOW_MOD_*. */ + uint32_t fate; + /**< Bit-fields of present fate see MLX5_FLOW_FATE_*. */ + uint8_t l3_protocol; /**< valid when l3_protocol_en is set. */ + LIST_HEAD(verbs, mlx5_flow_verbs) verbs; /**< Verbs flows list. */ + struct mlx5_flow_verbs *cur_verbs; + /**< Current Verbs flow structure being filled. */ + struct mlx5_flow_counter *counter; /**< Holds Verbs flow counter. */ + struct rte_flow_action_rss rss;/**< RSS context. */ + uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */ + uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */ + void *nl_flow; /**< Netlink flow buffer if relevant. */ }; static const struct rte_flow_ops mlx5_flow_ops = { @@ -416,12 +322,8 @@ static const struct rte_flow_ops mlx5_flow_ops = { .create = mlx5_flow_create, .destroy = mlx5_flow_destroy, .flush = mlx5_flow_flush, -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - .query = mlx5_flow_query, -#else - .query = NULL, -#endif .isolate = mlx5_flow_isolate, + .query = mlx5_flow_query, }; /* Convert FDIR request to Generic flow. */ @@ -436,9 +338,17 @@ struct mlx5_fdir { struct rte_flow_item_ipv6 ipv6; } l3; union { + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + } l3_mask; + union { struct rte_flow_item_udp udp; struct rte_flow_item_tcp tcp; } l4; + union { + struct rte_flow_item_udp udp; + struct rte_flow_item_tcp tcp; + } l4_mask; struct rte_flow_action_queue queue; }; @@ -448,778 +358,459 @@ struct ibv_spec_header { uint16_t size; }; -/** - * Check support for a given item. - * - * @param item[in] - * Item specification. - * @param mask[in] - * Bit-masks covering supported fields to compare with spec, last and mask in - * \item. - * @param size - * Bit-Mask size in bytes. - * - * @return - * 0 on success. +/* + * Number of sub priorities. + * For each kind of pattern matching i.e. L2, L3, L4 to have a correct + * matching on the NIC (firmware dependent) L4 most have the higher priority + * followed by L3 and ending with L2. */ -static int -mlx5_flow_item_validate(const struct rte_flow_item *item, - const uint8_t *mask, unsigned int size) -{ - int ret = 0; - - if (!item->spec && (item->mask || item->last)) - return -1; - if (item->spec && !item->mask) { - unsigned int i; - const uint8_t *spec = item->spec; - - for (i = 0; i < size; ++i) - if ((spec[i] | mask[i]) != mask[i]) - return -1; - } - if (item->last && !item->mask) { - unsigned int i; - const uint8_t *spec = item->last; - - for (i = 0; i < size; ++i) - if ((spec[i] | mask[i]) != mask[i]) - return -1; - } - if (item->mask) { - unsigned int i; - const uint8_t *spec = item->spec; +#define MLX5_PRIORITY_MAP_L2 2 +#define MLX5_PRIORITY_MAP_L3 1 +#define MLX5_PRIORITY_MAP_L4 0 +#define MLX5_PRIORITY_MAP_MAX 3 + +/* Map of Verbs to Flow priority with 8 Verbs priorities. */ +static const uint32_t priority_map_3[][MLX5_PRIORITY_MAP_MAX] = { + { 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 }, +}; - for (i = 0; i < size; ++i) - if ((spec[i] | mask[i]) != mask[i]) - return -1; - } - if (item->spec && item->last) { - uint8_t spec[size]; - uint8_t last[size]; - const uint8_t *apply = mask; - unsigned int i; +/* Map of Verbs to Flow priority with 16 Verbs priorities. */ +static const uint32_t priority_map_5[][MLX5_PRIORITY_MAP_MAX] = { + { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 }, + { 9, 10, 11 }, { 12, 13, 14 }, +}; - if (item->mask) - apply = item->mask; - for (i = 0; i < size; ++i) { - spec[i] = ((const uint8_t *)item->spec)[i] & apply[i]; - last[i] = ((const uint8_t *)item->last)[i] & apply[i]; - } - ret = memcmp(spec, last, size); - } - return ret; -} +/* Tunnel information. */ +struct mlx5_flow_tunnel_info { + uint32_t tunnel; /**< Tunnel bit (see MLX5_FLOW_*). */ + uint32_t ptype; /**< Tunnel Ptype (see RTE_PTYPE_*). */ +}; -/** - * Copy the RSS configuration from the user ones, of the rss_conf is null, - * uses the driver one. - * - * @param priv - * Pointer to private structure. - * @param parser - * Internal parser structure. - * @param rss_conf - * User RSS configuration to save. - * - * @return - * 0 on success, errno value on failure. - */ -static int -priv_flow_convert_rss_conf(struct priv *priv, - struct mlx5_flow_parse *parser, - const struct rte_eth_rss_conf *rss_conf) -{ - /* - * This function is also called at the beginning of - * priv_flow_convert_actions() to initialize the parser with the - * device default RSS configuration. - */ - (void)priv; - if (rss_conf) { - if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) - return EINVAL; - if (rss_conf->rss_key_len != 40) - return EINVAL; - if (rss_conf->rss_key_len && rss_conf->rss_key) { - parser->rss_conf.rss_key_len = rss_conf->rss_key_len; - memcpy(parser->rss_key, rss_conf->rss_key, - rss_conf->rss_key_len); - parser->rss_conf.rss_key = parser->rss_key; - } - parser->rss_conf.rss_hf = rss_conf->rss_hf; - } - return 0; -} +static struct mlx5_flow_tunnel_info tunnels_info[] = { + { + .tunnel = MLX5_FLOW_LAYER_VXLAN, + .ptype = RTE_PTYPE_TUNNEL_VXLAN | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_VXLAN_GPE, + .ptype = RTE_PTYPE_TUNNEL_VXLAN_GPE | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_GRE, + .ptype = RTE_PTYPE_TUNNEL_GRE, + }, + { + .tunnel = MLX5_FLOW_LAYER_MPLS | MLX5_FLOW_LAYER_OUTER_L4_UDP, + .ptype = RTE_PTYPE_TUNNEL_MPLS_IN_GRE | RTE_PTYPE_L4_UDP, + }, + { + .tunnel = MLX5_FLOW_LAYER_MPLS, + .ptype = RTE_PTYPE_TUNNEL_MPLS_IN_GRE, + }, +}; /** - * Extract attribute to the parser. + * Discover the maximum number of priority available. * - * @param priv - * Pointer to private structure. - * @param[in] attr - * Flow rule attributes. - * @param[out] error - * Perform verbose error reporting if not NULL. - * @param[in, out] parser - * Internal parser structure. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. + * number of supported flow priority on success, a negative errno + * value otherwise and rte_errno is set. */ -static int -priv_flow_convert_attributes(struct priv *priv, - const struct rte_flow_attr *attr, - struct rte_flow_error *error, - struct mlx5_flow_parse *parser) +int +mlx5_flow_discover_priorities(struct rte_eth_dev *dev) { - (void)priv; - (void)parser; - if (attr->group) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_GROUP, - NULL, - "groups are not supported"); - return -rte_errno; - } - if (attr->priority && attr->priority != MLX5_CTRL_FLOW_PRIORITY) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, - NULL, - "priorities are not supported"); + struct { + struct ibv_flow_attr attr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_action_drop drop; + } flow_attr = { + .attr = { + .num_of_specs = 2, + }, + .eth = { + .type = IBV_FLOW_SPEC_ETH, + .size = sizeof(struct ibv_flow_spec_eth), + }, + .drop = { + .size = sizeof(struct ibv_flow_spec_action_drop), + .type = IBV_FLOW_SPEC_ACTION_DROP, + }, + }; + struct ibv_flow *flow; + struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev); + uint16_t vprio[] = { 8, 16 }; + int i; + int priority = 0; + + if (!drop) { + rte_errno = ENOTSUP; return -rte_errno; } - if (attr->egress) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, - NULL, - "egress is not supported"); - return -rte_errno; + for (i = 0; i != RTE_DIM(vprio); i++) { + flow_attr.attr.priority = vprio[i] - 1; + flow = mlx5_glue->create_flow(drop->qp, &flow_attr.attr); + if (!flow) + break; + claim_zero(mlx5_glue->destroy_flow(flow)); + priority = vprio[i]; } - if (!attr->ingress) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, - NULL, - "only ingress is supported"); + switch (priority) { + case 8: + priority = RTE_DIM(priority_map_3); + break; + case 16: + priority = RTE_DIM(priority_map_5); + break; + default: + rte_errno = ENOTSUP; + DRV_LOG(ERR, + "port %u verbs maximum priority: %d expected 8/16", + dev->data->port_id, vprio[i]); return -rte_errno; } - return 0; + mlx5_hrxq_drop_release(dev); + DRV_LOG(INFO, "port %u flow maximum priority: %d", + dev->data->port_id, priority); + return priority; } /** - * Extract actions request to the parser. - * - * @param priv - * Pointer to private structure. - * @param[in] actions - * Associated actions (list terminated by the END action). - * @param[out] error - * Perform verbose error reporting if not NULL. - * @param[in, out] parser - * Internal parser structure. + * Adjust flow priority. * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. + * @param dev + * Pointer to Ethernet device. + * @param flow + * Pointer to an rte flow. */ -static int -priv_flow_convert_actions(struct priv *priv, - const struct rte_flow_action actions[], - struct rte_flow_error *error, - struct mlx5_flow_parse *parser) +static void +mlx5_flow_adjust_priority(struct rte_eth_dev *dev, struct rte_flow *flow) { - /* - * Add default RSS configuration necessary for Verbs to create QP even - * if no RSS is necessary. - */ - priv_flow_convert_rss_conf(priv, parser, - (const struct rte_eth_rss_conf *) - &priv->rss_conf); - for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { - if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { - continue; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { - parser->drop = 1; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { - const struct rte_flow_action_queue *queue = - (const struct rte_flow_action_queue *) - actions->conf; - uint16_t n; - uint16_t found = 0; - - if (!queue || (queue->index > (priv->rxqs_n - 1))) - goto exit_action_not_supported; - for (n = 0; n < parser->queues_n; ++n) { - if (parser->queues[n] == queue->index) { - found = 1; - break; - } - } - if (parser->queues_n > 1 && !found) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "queue action not in RSS queues"); - return -rte_errno; - } - if (!found) { - parser->queues_n = 1; - parser->queues[0] = queue->index; - } - } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) { - const struct rte_flow_action_rss *rss = - (const struct rte_flow_action_rss *) - actions->conf; - uint16_t n; - - if (!rss || !rss->num) { - rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "no valid queues"); - return -rte_errno; - } - if (parser->queues_n == 1) { - uint16_t found = 0; - - assert(parser->queues_n); - for (n = 0; n < rss->num; ++n) { - if (parser->queues[0] == - rss->queue[n]) { - found = 1; - break; - } - } - if (!found) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "queue action not in RSS" - " queues"); - return -rte_errno; - } - } - for (n = 0; n < rss->num; ++n) { - if (rss->queue[n] >= priv->rxqs_n) { - rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "queue id > number of" - " queues"); - return -rte_errno; - } - } - for (n = 0; n < rss->num; ++n) - parser->queues[n] = rss->queue[n]; - parser->queues_n = rss->num; - if (priv_flow_convert_rss_conf(priv, parser, - rss->rss_conf)) { - rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "wrong RSS configuration"); - return -rte_errno; - } - } else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) { - const struct rte_flow_action_mark *mark = - (const struct rte_flow_action_mark *) - actions->conf; - - if (!mark) { - rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "mark must be defined"); - return -rte_errno; - } else if (mark->id >= MLX5_FLOW_MARK_MAX) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "mark must be between 0" - " and 16777199"); - return -rte_errno; - } - parser->mark = 1; - parser->mark_id = mark->id; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) { - parser->mark = 1; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT && - priv->config.flow_counter_en) { - parser->count = 1; - } else { - goto exit_action_not_supported; - } - } - if (parser->drop && parser->mark) - parser->mark = 0; - if (!parser->queues_n && !parser->drop) { - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "no valid action"); - return -rte_errno; + struct priv *priv = dev->data->dev_private; + uint32_t priority = flow->attributes.priority; + uint32_t subpriority = flow->cur_verbs->attr->priority; + + switch (priv->config.flow_prio) { + case RTE_DIM(priority_map_3): + priority = priority_map_3[priority][subpriority]; + break; + case RTE_DIM(priority_map_5): + priority = priority_map_5[priority][subpriority]; + break; } - return 0; -exit_action_not_supported: - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, - actions, "action not supported"); - return -rte_errno; + flow->cur_verbs->attr->priority = priority; } /** - * Validate items. + * Get a flow counter. * - * @param priv - * Pointer to private structure. - * @param[in] items - * Pattern specification (list terminated by the END pattern item). - * @param[out] error - * Perform verbose error reporting if not NULL. - * @param[in, out] parser - * Internal parser structure. + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] shared + * Indicate if this counter is shared with other flows. + * @param[in] id + * Counter identifier. * * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. + * A pointer to the counter, NULL otherwise and rte_errno is set. */ -static int -priv_flow_convert_items_validate(struct priv *priv, - const struct rte_flow_item items[], - struct rte_flow_error *error, - struct mlx5_flow_parse *parser) +static struct mlx5_flow_counter * +mlx5_flow_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id) { - const struct mlx5_flow_items *cur_item = mlx5_flow_items; - unsigned int i; - - (void)priv; - /* Initialise the offsets to start after verbs attribute. */ - for (i = 0; i != hash_rxq_init_n; ++i) - parser->queue[i].offset = sizeof(struct ibv_flow_attr); - for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) { - const struct mlx5_flow_items *token = NULL; - unsigned int n; - int err; + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_counter *cnt; - if (items->type == RTE_FLOW_ITEM_TYPE_VOID) + LIST_FOREACH(cnt, &priv->flow_counters, next) { + if (!cnt->shared || cnt->shared != shared) continue; - for (i = 0; - cur_item->items && - cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END; - ++i) { - if (cur_item->items[i] == items->type) { - token = &mlx5_flow_items[items->type]; - break; - } - } - if (!token) - goto exit_item_not_supported; - cur_item = token; - err = mlx5_flow_item_validate(items, - (const uint8_t *)cur_item->mask, - cur_item->mask_sz); - if (err) - goto exit_item_not_supported; - if (items->type == RTE_FLOW_ITEM_TYPE_VXLAN) { - if (parser->inner) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - items, - "cannot recognize multiple" - " VXLAN encapsulations"); - return -rte_errno; - } - parser->inner = IBV_FLOW_SPEC_INNER; - } - if (parser->drop) { - parser->queue[HASH_RXQ_ETH].offset += cur_item->dst_sz; - } else { - for (n = 0; n != hash_rxq_init_n; ++n) - parser->queue[n].offset += cur_item->dst_sz; - } - } - if (parser->drop) { - parser->queue[HASH_RXQ_ETH].offset += - sizeof(struct ibv_flow_spec_action_drop); - } - if (parser->mark) { - for (i = 0; i != hash_rxq_init_n; ++i) - parser->queue[i].offset += - sizeof(struct ibv_flow_spec_action_tag); + if (cnt->id != id) + continue; + cnt->ref_cnt++; + return cnt; } - if (parser->count) { - unsigned int size = sizeof(struct ibv_flow_spec_counter_action); +#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT + + struct mlx5_flow_counter tmpl = { + .shared = shared, + .id = id, + .cs = mlx5_glue->create_counter_set + (priv->ctx, + &(struct ibv_counter_set_init_attr){ + .counter_set_id = id, + }), + .hits = 0, + .bytes = 0, + }; - for (i = 0; i != hash_rxq_init_n; ++i) - parser->queue[i].offset += size; + if (!tmpl.cs) { + rte_errno = errno; + return NULL; } - return 0; -exit_item_not_supported: - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, - items, "item not supported"); - return -rte_errno; + cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0); + if (!cnt) { + rte_errno = ENOMEM; + return NULL; + } + *cnt = tmpl; + LIST_INSERT_HEAD(&priv->flow_counters, cnt, next); + return cnt; +#endif + rte_errno = ENOTSUP; + return NULL; } /** - * Allocate memory space to store verbs flow attributes. - * - * @param priv - * Pointer to private structure. - * @param[in] priority - * Flow priority. - * @param[in] size - * Amount of byte to allocate. - * @param[out] error - * Perform verbose error reporting if not NULL. + * Release a flow counter. * - * @return - * A verbs flow attribute on success, NULL otherwise. + * @param[in] counter + * Pointer to the counter handler. */ -static struct ibv_flow_attr* -priv_flow_convert_allocate(struct priv *priv, - unsigned int priority, - unsigned int size, - struct rte_flow_error *error) +static void +mlx5_flow_counter_release(struct mlx5_flow_counter *counter) { - struct ibv_flow_attr *ibv_attr; - - (void)priv; - ibv_attr = rte_calloc(__func__, 1, size, 0); - if (!ibv_attr) { - rte_flow_error_set(error, ENOMEM, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "cannot allocate verbs spec attributes."); - return NULL; + if (--counter->ref_cnt == 0) { + claim_zero(mlx5_glue->destroy_counter_set(counter->cs)); + LIST_REMOVE(counter, next); + rte_free(counter); } - ibv_attr->priority = priority; - return ibv_attr; } /** - * Finalise verbs flow attributes. + * Verify the @p attributes will be correctly understood by the NIC and store + * them in the @p flow if everything is correct. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] attributes + * Pointer to flow attributes + * @param[in, out] flow + * Pointer to the rte_flow structure. + * @param[out] error + * Pointer to error structure. * - * @param priv - * Pointer to private structure. - * @param[in, out] parser - * Internal parser structure. + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static void -priv_flow_convert_finalise(struct priv *priv, struct mlx5_flow_parse *parser) +static int +mlx5_flow_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + struct rte_flow *flow, + struct rte_flow_error *error) { - const unsigned int ipv4 = - hash_rxq_init[parser->layer].ip_version == MLX5_IPV4; - const enum hash_rxq_type hmin = ipv4 ? HASH_RXQ_TCPV4 : HASH_RXQ_TCPV6; - const enum hash_rxq_type hmax = ipv4 ? HASH_RXQ_IPV4 : HASH_RXQ_IPV6; - const enum hash_rxq_type ohmin = ipv4 ? HASH_RXQ_TCPV6 : HASH_RXQ_TCPV4; - const enum hash_rxq_type ohmax = ipv4 ? HASH_RXQ_IPV6 : HASH_RXQ_IPV4; - const enum hash_rxq_type ip = ipv4 ? HASH_RXQ_IPV4 : HASH_RXQ_IPV6; - unsigned int i; - - (void)priv; - if (parser->layer == HASH_RXQ_ETH) { - goto fill; - } else { - /* - * This layer becomes useless as the pattern define under - * layers. - */ - rte_free(parser->queue[HASH_RXQ_ETH].ibv_attr); - parser->queue[HASH_RXQ_ETH].ibv_attr = NULL; - } - /* Remove opposite kind of layer e.g. IPv6 if the pattern is IPv4. */ - for (i = ohmin; i != (ohmax + 1); ++i) { - if (!parser->queue[i].ibv_attr) - continue; - rte_free(parser->queue[i].ibv_attr); - parser->queue[i].ibv_attr = NULL; - } - /* Remove impossible flow according to the RSS configuration. */ - if (hash_rxq_init[parser->layer].dpdk_rss_hf & - parser->rss_conf.rss_hf) { - /* Remove any other flow. */ - for (i = hmin; i != (hmax + 1); ++i) { - if ((i == parser->layer) || - (!parser->queue[i].ibv_attr)) - continue; - rte_free(parser->queue[i].ibv_attr); - parser->queue[i].ibv_attr = NULL; - } - } else if (!parser->queue[ip].ibv_attr) { - /* no RSS possible with the current configuration. */ - parser->queues_n = 1; - return; - } -fill: - /* - * Fill missing layers in verbs specifications, or compute the correct - * offset to allocate the memory space for the attributes and - * specifications. - */ - for (i = 0; i != hash_rxq_init_n - 1; ++i) { - union { - struct ibv_flow_spec_ipv4_ext ipv4; - struct ibv_flow_spec_ipv6 ipv6; - struct ibv_flow_spec_tcp_udp udp_tcp; - } specs; - void *dst; - uint16_t size; - - if (i == parser->layer) - continue; - if (parser->layer == HASH_RXQ_ETH) { - if (hash_rxq_init[i].ip_version == MLX5_IPV4) { - size = sizeof(struct ibv_flow_spec_ipv4_ext); - specs.ipv4 = (struct ibv_flow_spec_ipv4_ext){ - .type = IBV_FLOW_SPEC_IPV4_EXT, - .size = size, - }; - } else { - size = sizeof(struct ibv_flow_spec_ipv6); - specs.ipv6 = (struct ibv_flow_spec_ipv6){ - .type = IBV_FLOW_SPEC_IPV6, - .size = size, - }; - } - if (parser->queue[i].ibv_attr) { - dst = (void *)((uintptr_t) - parser->queue[i].ibv_attr + - parser->queue[i].offset); - memcpy(dst, &specs, size); - ++parser->queue[i].ibv_attr->num_of_specs; - } - parser->queue[i].offset += size; - } - if ((i == HASH_RXQ_UDPV4) || (i == HASH_RXQ_TCPV4) || - (i == HASH_RXQ_UDPV6) || (i == HASH_RXQ_TCPV6)) { - size = sizeof(struct ibv_flow_spec_tcp_udp); - specs.udp_tcp = (struct ibv_flow_spec_tcp_udp) { - .type = ((i == HASH_RXQ_UDPV4 || - i == HASH_RXQ_UDPV6) ? - IBV_FLOW_SPEC_UDP : - IBV_FLOW_SPEC_TCP), - .size = size, - }; - if (parser->queue[i].ibv_attr) { - dst = (void *)((uintptr_t) - parser->queue[i].ibv_attr + - parser->queue[i].offset); - memcpy(dst, &specs, size); - ++parser->queue[i].ibv_attr->num_of_specs; - } - parser->queue[i].offset += size; - } - } + uint32_t priority_max = + ((struct priv *)dev->data->dev_private)->config.flow_prio - 1; + + if (attributes->group) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, + "groups is not supported"); + if (attributes->priority != MLX5_FLOW_PRIO_RSVD && + attributes->priority >= priority_max) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, + "priority out of range"); + if (attributes->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, + "egress is not supported"); + if (attributes->transfer) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, + NULL, + "transfer is not supported"); + if (!attributes->ingress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "ingress attribute is mandatory"); + flow->attributes = *attributes; + if (attributes->priority == MLX5_FLOW_PRIO_RSVD) + flow->attributes.priority = priority_max; + return 0; } /** - * Validate and convert a flow supported by the NIC. + * Verify the @p item specifications (spec, last, mask) are compatible with the + * NIC capabilities. * - * @param priv - * Pointer to private structure. - * @param[in] attr - * Flow rule attributes. - * @param[in] pattern - * Pattern specification (list terminated by the END pattern item). - * @param[in] actions - * Associated actions (list terminated by the END action). + * @param[in] item + * Item specification. + * @param[in] mask + * @p item->mask or flow default bit-masks. + * @param[in] nic_mask + * Bit-masks covering supported fields by the NIC to compare with user mask. + * @param[in] size + * Bit-masks size in bytes. * @param[out] error - * Perform verbose error reporting if not NULL. - * @param[in, out] parser - * Internal parser structure. + * Pointer to error structure. * * @return * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_convert(struct priv *priv, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error, - struct mlx5_flow_parse *parser) +mlx5_flow_item_acceptable(const struct rte_flow_item *item, + const uint8_t *mask, + const uint8_t *nic_mask, + unsigned int size, + struct rte_flow_error *error) { - const struct mlx5_flow_items *cur_item = mlx5_flow_items; unsigned int i; - int ret; - /* First step. Validate the attributes, items and actions. */ - *parser = (struct mlx5_flow_parse){ - .create = parser->create, - .layer = HASH_RXQ_ETH, - .mark_id = MLX5_FLOW_MARK_DEFAULT, - }; - ret = priv_flow_convert_attributes(priv, attr, error, parser); - if (ret) - return ret; - ret = priv_flow_convert_actions(priv, actions, error, parser); - if (ret) - return ret; - ret = priv_flow_convert_items_validate(priv, items, error, parser); - if (ret) - return ret; - priv_flow_convert_finalise(priv, parser); - /* - * Second step. - * Allocate the memory space to store verbs specifications. - */ - if (parser->drop) { - unsigned int priority = - attr->priority + - hash_rxq_init[HASH_RXQ_ETH].flow_priority; - unsigned int offset = parser->queue[HASH_RXQ_ETH].offset; - - parser->queue[HASH_RXQ_ETH].ibv_attr = - priv_flow_convert_allocate(priv, priority, - offset, error); - if (!parser->queue[HASH_RXQ_ETH].ibv_attr) - return ENOMEM; - parser->queue[HASH_RXQ_ETH].offset = - sizeof(struct ibv_flow_attr); - } else { - for (i = 0; i != hash_rxq_init_n; ++i) { - unsigned int priority = - attr->priority + - hash_rxq_init[i].flow_priority; - unsigned int offset; - - if (!(parser->rss_conf.rss_hf & - hash_rxq_init[i].dpdk_rss_hf) && - (i != HASH_RXQ_ETH)) - continue; - offset = parser->queue[i].offset; - parser->queue[i].ibv_attr = - priv_flow_convert_allocate(priv, priority, - offset, error); - if (!parser->queue[i].ibv_attr) - goto exit_enomem; - parser->queue[i].offset = sizeof(struct ibv_flow_attr); - } - } - /* Third step. Conversion parse, fill the specifications. */ - parser->inner = 0; - for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) { - if (items->type == RTE_FLOW_ITEM_TYPE_VOID) - continue; - cur_item = &mlx5_flow_items[items->type]; - ret = cur_item->convert(items, - (cur_item->default_mask ? - cur_item->default_mask : - cur_item->mask), - parser); - if (ret) { - rte_flow_error_set(error, ret, - RTE_FLOW_ERROR_TYPE_ITEM, - items, "item not supported"); - goto exit_free; - } - } - if (parser->mark) - mlx5_flow_create_flag_mark(parser, parser->mark_id); - if (parser->count && parser->create) { - mlx5_flow_create_count(priv, parser); - if (!parser->cs) - goto exit_count_error; - } - /* - * Last step. Complete missing specification to reach the RSS - * configuration. - */ - if (!parser->drop) { - priv_flow_convert_finalise(priv, parser); - } else { - parser->queue[HASH_RXQ_ETH].ibv_attr->priority = - attr->priority + - hash_rxq_init[parser->layer].flow_priority; - } -exit_free: - /* Only verification is expected, all resources should be released. */ - if (!parser->create) { - for (i = 0; i != hash_rxq_init_n; ++i) { - if (parser->queue[i].ibv_attr) { - rte_free(parser->queue[i].ibv_attr); - parser->queue[i].ibv_attr = NULL; - } - } - } - return ret; -exit_enomem: - for (i = 0; i != hash_rxq_init_n; ++i) { - if (parser->queue[i].ibv_attr) { - rte_free(parser->queue[i].ibv_attr); - parser->queue[i].ibv_attr = NULL; + assert(nic_mask); + for (i = 0; i < size; ++i) + if ((nic_mask[i] | mask[i]) != nic_mask[i]) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "mask enables non supported" + " bits"); + if (!item->spec && (item->mask || item->last)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "mask/last without a spec is not" + " supported"); + if (item->spec && item->last) { + uint8_t spec[size]; + uint8_t last[size]; + unsigned int i; + int ret; + + for (i = 0; i < size; ++i) { + spec[i] = ((const uint8_t *)item->spec)[i] & mask[i]; + last[i] = ((const uint8_t *)item->last)[i] & mask[i]; } + ret = memcmp(spec, last, size); + if (ret != 0) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "range is not supported"); } - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "cannot allocate verbs spec attributes."); - return ret; -exit_count_error: - rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "cannot create counter."); - return rte_errno; + return 0; } /** - * Copy the specification created into the flow. + * Add a verbs item specification into @p flow. * - * @param parser - * Internal parser structure. - * @param src + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] src * Create specification. - * @param size + * @param[in] size * Size in bytes of the specification to copy. */ static void -mlx5_flow_create_copy(struct mlx5_flow_parse *parser, void *src, - unsigned int size) +mlx5_flow_spec_verbs_add(struct rte_flow *flow, void *src, unsigned int size) { - unsigned int i; - void *dst; + struct mlx5_flow_verbs *verbs = flow->cur_verbs; - for (i = 0; i != hash_rxq_init_n; ++i) { - if (!parser->queue[i].ibv_attr) - continue; - /* Specification must be the same l3 type or none. */ - if (parser->layer == HASH_RXQ_ETH || - (hash_rxq_init[parser->layer].ip_version == - hash_rxq_init[i].ip_version) || - (hash_rxq_init[i].ip_version == 0)) { - dst = (void *)((uintptr_t)parser->queue[i].ibv_attr + - parser->queue[i].offset); - memcpy(dst, src, size); - ++parser->queue[i].ibv_attr->num_of_specs; - parser->queue[i].offset += size; - } + if (verbs->specs) { + void *dst; + + dst = (void *)(verbs->specs + verbs->size); + memcpy(dst, src, size); + ++verbs->attr->num_of_specs; } + verbs->size += size; } /** - * Convert Ethernet item to Verbs specification. + * Adjust verbs hash fields according to the @p flow information. + * + * @param[in, out] flow. + * Pointer to flow structure. + * @param[in] tunnel + * 1 when the hash field is for a tunnel item. + * @param[in] layer_types + * ETH_RSS_* types. + * @param[in] hash_fields + * Item hash fields. + */ +static void +mlx5_flow_verbs_hashfields_adjust(struct rte_flow *flow, + int tunnel __rte_unused, + uint32_t layer_types, uint64_t hash_fields) +{ +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + hash_fields |= (tunnel ? IBV_RX_HASH_INNER : 0); + if (flow->rss.level == 2 && !tunnel) + hash_fields = 0; + else if (flow->rss.level < 2 && tunnel) + hash_fields = 0; +#endif + if (!(flow->rss.types & layer_types)) + hash_fields = 0; + flow->cur_verbs->hash_fields |= hash_fields; +} + +/** + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. * - * @param item[in] + * @param[in] item * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_eth(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx5_flow_item_eth(const struct rte_flow_item *item, struct rte_flow *flow, + const size_t flow_size, struct rte_flow_error *error) { const struct rte_flow_item_eth *spec = item->spec; const struct rte_flow_item_eth *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; - const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth); + const struct rte_flow_item_eth nic_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .type = RTE_BE16(0xffff), + }; + const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); + const unsigned int size = sizeof(struct ibv_flow_spec_eth); struct ibv_flow_spec_eth eth = { - .type = parser->inner | IBV_FLOW_SPEC_ETH, - .size = eth_size, + .type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, }; + int ret; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) - parser->layer = HASH_RXQ_ETH; + if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L2 layers already configured"); + if (!mask) + mask = &rte_flow_item_eth_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_eth), + error); + if (ret) + return ret; + flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + if (size > flow_size) + return size; if (spec) { unsigned int i; - if (!mask) - mask = default_mask; memcpy(ð.val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN); memcpy(ð.val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN); eth.val.ether_type = spec->type; @@ -1233,80 +824,211 @@ mlx5_flow_create_eth(const struct rte_flow_item *item, } eth.val.ether_type &= eth.mask.ether_type; } - mlx5_flow_create_copy(parser, ð, eth_size); - return 0; + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; + mlx5_flow_spec_verbs_add(flow, ð, size); + return size; +} + +/** + * Update the VLAN tag in the Verbs Ethernet specification. + * + * @param[in, out] attr + * Pointer to Verbs attributes structure. + * @param[in] eth + * Verbs structure containing the VLAN information to copy. + */ +static void +mlx5_flow_item_vlan_update(struct ibv_flow_attr *attr, + struct ibv_flow_spec_eth *eth) +{ + unsigned int i; + const enum ibv_flow_spec_type search = eth->type; + struct ibv_spec_header *hdr = (struct ibv_spec_header *) + ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); + + for (i = 0; i != attr->num_of_specs; ++i) { + if (hdr->type == search) { + struct ibv_flow_spec_eth *e = + (struct ibv_flow_spec_eth *)hdr; + + e->val.vlan_tag = eth->val.vlan_tag; + e->mask.vlan_tag = eth->mask.vlan_tag; + e->val.ether_type = eth->val.ether_type; + e->mask.ether_type = eth->mask.ether_type; + break; + } + hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); + } } /** - * Convert VLAN item to Verbs specification. + * Convert the @p item into @p flow (or by updating the already present + * Ethernet Verbs) specification after ensuring the NIC will understand and + * process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. * - * @param item[in] + * @param[in] item * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_vlan(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow, + const size_t flow_size, struct rte_flow_error *error) { const struct rte_flow_item_vlan *spec = item->spec; const struct rte_flow_item_vlan *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; - struct ibv_flow_spec_eth *eth; - const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth); - + const struct rte_flow_item_vlan nic_mask = { + .tci = RTE_BE16(0x0fff), + .inner_type = RTE_BE16(0xffff), + }; + unsigned int size = sizeof(struct ibv_flow_spec_eth); + const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); + struct ibv_flow_spec_eth eth = { + .type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, + }; + int ret; + const uint32_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | + MLX5_FLOW_LAYER_INNER_L4) : + (MLX5_FLOW_LAYER_OUTER_L3 | MLX5_FLOW_LAYER_OUTER_L4); + const uint32_t vlanm = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + const uint32_t l2m = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + + if (flow->layers & vlanm) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VLAN layer already configured"); + else if ((flow->layers & l34m) != 0) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L2 layer cannot follow L3/L4 layer"); + if (!mask) + mask = &rte_flow_item_vlan_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_vlan), error); + if (ret) + return ret; if (spec) { - unsigned int i; - if (!mask) - mask = default_mask; - - for (i = 0; i != hash_rxq_init_n; ++i) { - if (!parser->queue[i].ibv_attr) - continue; - - eth = (void *)((uintptr_t)parser->queue[i].ibv_attr + - parser->queue[i].offset - eth_size); - eth->val.vlan_tag = spec->tci; - eth->mask.vlan_tag = mask->tci; - eth->val.vlan_tag &= eth->mask.vlan_tag; - } + eth.val.vlan_tag = spec->tci; + eth.mask.vlan_tag = mask->tci; + eth.val.vlan_tag &= eth.mask.vlan_tag; + eth.val.ether_type = spec->inner_type; + eth.mask.ether_type = mask->inner_type; + eth.val.ether_type &= eth.mask.ether_type; } - return 0; + /* + * From verbs perspective an empty VLAN is equivalent + * to a packet without VLAN layer. + */ + if (!eth.mask.vlan_tag) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "VLAN cannot be empty"); + if (!(flow->layers & l2m)) { + if (size <= flow_size) { + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; + mlx5_flow_spec_verbs_add(flow, ð, size); + } + } else { + if (flow->cur_verbs) + mlx5_flow_item_vlan_update(flow->cur_verbs->attr, + ð); + size = 0; /* Only an update is done in eth specification. */ + } + flow->layers |= tunnel ? + (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_VLAN); + return size; } /** - * Convert IPv4 item to Verbs specification. + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. * - * @param item[in] + * @param[in] item * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_ipv4(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx5_flow_item_ipv4(const struct rte_flow_item *item, struct rte_flow *flow, + const size_t flow_size, struct rte_flow_error *error) { const struct rte_flow_item_ipv4 *spec = item->spec; const struct rte_flow_item_ipv4 *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; - unsigned int ipv4_size = sizeof(struct ibv_flow_spec_ipv4_ext); + const struct rte_flow_item_ipv4 nic_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + .type_of_service = 0xff, + .next_proto_id = 0xff, + }, + }; + const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int size = sizeof(struct ibv_flow_spec_ipv4_ext); struct ibv_flow_spec_ipv4_ext ipv4 = { - .type = parser->inner | IBV_FLOW_SPEC_IPV4_EXT, - .size = ipv4_size, + .type = IBV_FLOW_SPEC_IPV4_EXT | + (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, }; + int ret; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) - parser->layer = HASH_RXQ_IPV4; + if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "multiple L3 layers not supported"); + else if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 cannot follow an L4 layer."); + if (!mask) + mask = &rte_flow_item_ipv4_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_ipv4), error); + if (ret < 0) + return ret; + flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; if (spec) { - if (!mask) - mask = default_mask; ipv4.val = (struct ibv_flow_ipv4_ext_filter){ .src_ip = spec->hdr.src_addr, .dst_ip = spec->hdr.dst_addr, @@ -1325,44 +1047,108 @@ mlx5_flow_create_ipv4(const struct rte_flow_item *item, ipv4.val.proto &= ipv4.mask.proto; ipv4.val.tos &= ipv4.mask.tos; } - mlx5_flow_create_copy(parser, &ipv4, ipv4_size); - return 0; + flow->l3_protocol_en = !!ipv4.mask.proto; + flow->l3_protocol = ipv4.val.proto; + if (size <= flow_size) { + mlx5_flow_verbs_hashfields_adjust + (flow, tunnel, + (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | + ETH_RSS_NONFRAG_IPV4_OTHER), + (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4)); + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L3; + mlx5_flow_spec_verbs_add(flow, &ipv4, size); + } + return size; } /** - * Convert IPv6 item to Verbs specification. + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. * - * @param item[in] + * @param[in] item * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_ipv6(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx5_flow_item_ipv6(const struct rte_flow_item *item, struct rte_flow *flow, + const size_t flow_size, struct rte_flow_error *error) { const struct rte_flow_item_ipv6 *spec = item->spec; const struct rte_flow_item_ipv6 *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; - unsigned int ipv6_size = sizeof(struct ibv_flow_spec_ipv6); + const struct rte_flow_item_ipv6 nic_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, + .hop_limits = 0xff, + }, + }; + const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int size = sizeof(struct ibv_flow_spec_ipv6); struct ibv_flow_spec_ipv6 ipv6 = { - .type = parser->inner | IBV_FLOW_SPEC_IPV6, - .size = ipv6_size, + .type = IBV_FLOW_SPEC_IPV6 | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, }; + int ret; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) - parser->layer = HASH_RXQ_IPV6; + if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "multiple L3 layers not supported"); + else if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 cannot follow an L4 layer."); + /* + * IPv6 is not recognised by the NIC inside a GRE tunnel. + * Such support has to be disabled as the rule will be + * accepted. Issue reproduced with Mellanox OFED 4.3-3.0.2.1 and + * Mellanox OFED 4.4-1.0.0.0. + */ + if (tunnel && flow->layers & MLX5_FLOW_LAYER_GRE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "IPv6 inside a GRE tunnel is" + " not recognised."); + if (!mask) + mask = &rte_flow_item_ipv6_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_ipv6), error); + if (ret < 0) + return ret; + flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; if (spec) { unsigned int i; uint32_t vtc_flow_val; uint32_t vtc_flow_mask; - if (!mask) - mask = default_mask; memcpy(&ipv6.val.src_ip, spec->hdr.src_addr, RTE_DIM(ipv6.val.src_ip)); memcpy(&ipv6.val.dst_ip, spec->hdr.dst_addr, @@ -1397,44 +1183,86 @@ mlx5_flow_create_ipv6(const struct rte_flow_item *item, ipv6.val.next_hdr &= ipv6.mask.next_hdr; ipv6.val.hop_limit &= ipv6.mask.hop_limit; } - mlx5_flow_create_copy(parser, &ipv6, ipv6_size); - return 0; + flow->l3_protocol_en = !!ipv6.mask.next_hdr; + flow->l3_protocol = ipv6.val.next_hdr; + if (size <= flow_size) { + mlx5_flow_verbs_hashfields_adjust + (flow, tunnel, + (ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_OTHER), + (IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6)); + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L3; + mlx5_flow_spec_verbs_add(flow, &ipv6, size); + } + return size; } /** - * Convert UDP item to Verbs specification. + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. * - * @param item[in] + * @param[in] item * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_udp(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx5_flow_item_udp(const struct rte_flow_item *item, struct rte_flow *flow, + const size_t flow_size, struct rte_flow_error *error) { const struct rte_flow_item_udp *spec = item->spec; const struct rte_flow_item_udp *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; - unsigned int udp_size = sizeof(struct ibv_flow_spec_tcp_udp); + const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); struct ibv_flow_spec_tcp_udp udp = { - .type = parser->inner | IBV_FLOW_SPEC_UDP, - .size = udp_size, + .type = IBV_FLOW_SPEC_UDP | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, }; + int ret; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) { - if (parser->layer == HASH_RXQ_IPV4) - parser->layer = HASH_RXQ_UDPV4; - else - parser->layer = HASH_RXQ_UDPV6; - } + if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_UDP) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "protocol filtering not compatible" + " with UDP layer"); + if (!(flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3))) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 is mandatory to filter" + " on L4"); + if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L4 layer is already" + " present"); + if (!mask) + mask = &rte_flow_item_udp_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_udp_mask, + sizeof(struct rte_flow_item_udp), error); + if (ret < 0) + return ret; + flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; if (spec) { - if (!mask) - mask = default_mask; udp.val.dst_port = spec->hdr.dst_port; udp.val.src_port = spec->hdr.src_port; udp.mask.dst_port = mask->hdr.dst_port; @@ -1443,44 +1271,81 @@ mlx5_flow_create_udp(const struct rte_flow_item *item, udp.val.src_port &= udp.mask.src_port; udp.val.dst_port &= udp.mask.dst_port; } - mlx5_flow_create_copy(parser, &udp, udp_size); - return 0; + if (size <= flow_size) { + mlx5_flow_verbs_hashfields_adjust(flow, tunnel, ETH_RSS_UDP, + (IBV_RX_HASH_SRC_PORT_UDP | + IBV_RX_HASH_DST_PORT_UDP)); + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L4; + mlx5_flow_spec_verbs_add(flow, &udp, size); + } + return size; } /** - * Convert TCP item to Verbs specification. + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. * - * @param item[in] + * @param[in] item * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_tcp(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx5_flow_item_tcp(const struct rte_flow_item *item, struct rte_flow *flow, + const size_t flow_size, struct rte_flow_error *error) { const struct rte_flow_item_tcp *spec = item->spec; const struct rte_flow_item_tcp *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; - unsigned int tcp_size = sizeof(struct ibv_flow_spec_tcp_udp); + const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); struct ibv_flow_spec_tcp_udp tcp = { - .type = parser->inner | IBV_FLOW_SPEC_TCP, - .size = tcp_size, + .type = IBV_FLOW_SPEC_TCP | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, }; + int ret; - /* Don't update layer for the inner pattern. */ - if (!parser->inner) { - if (parser->layer == HASH_RXQ_IPV4) - parser->layer = HASH_RXQ_TCPV4; - else - parser->layer = HASH_RXQ_TCPV6; - } + if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_TCP) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "protocol filtering not compatible" + " with TCP layer"); + if (!(flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3))) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 is mandatory to filter on L4"); + if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L4 layer is already present"); + if (!mask) + mask = &rte_flow_item_tcp_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_tcp_mask, + sizeof(struct rte_flow_item_tcp), error); + if (ret < 0) + return ret; + flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; if (spec) { - if (!mask) - mask = default_mask; tcp.val.dst_port = spec->hdr.dst_port; tcp.val.src_port = spec->hdr.src_port; tcp.mask.dst_port = mask->hdr.dst_port; @@ -1489,43 +1354,78 @@ mlx5_flow_create_tcp(const struct rte_flow_item *item, tcp.val.src_port &= tcp.mask.src_port; tcp.val.dst_port &= tcp.mask.dst_port; } - mlx5_flow_create_copy(parser, &tcp, tcp_size); - return 0; + if (size <= flow_size) { + mlx5_flow_verbs_hashfields_adjust(flow, tunnel, ETH_RSS_TCP, + (IBV_RX_HASH_SRC_PORT_TCP | + IBV_RX_HASH_DST_PORT_TCP)); + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L4; + mlx5_flow_spec_verbs_add(flow, &tcp, size); + } + return size; } /** - * Convert VXLAN item to Verbs specification. + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. * - * @param item[in] + * @param[in] item * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_vxlan(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx5_flow_item_vxlan(const struct rte_flow_item *item, struct rte_flow *flow, + const size_t flow_size, struct rte_flow_error *error) { const struct rte_flow_item_vxlan *spec = item->spec; const struct rte_flow_item_vxlan *mask = item->mask; - struct mlx5_flow_parse *parser = (struct mlx5_flow_parse *)data; unsigned int size = sizeof(struct ibv_flow_spec_tunnel); struct ibv_flow_spec_tunnel vxlan = { - .type = parser->inner | IBV_FLOW_SPEC_VXLAN_TUNNEL, + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, .size = size, }; + int ret; union vni { uint32_t vlan_id; uint8_t vni[4]; - } id; + } id = { .vlan_id = 0, }; - id.vni[0] = 0; - parser->inner = IBV_FLOW_SPEC_INNER; + if (flow->layers & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "a tunnel is already present"); + /* + * Verify only UDPv4 is present as defined in + * https://tools.ietf.org/html/rfc7348 + */ + if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_vxlan_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_vxlan_mask, + sizeof(struct rte_flow_item_vxlan), error); + if (ret < 0) + return ret; if (spec) { - if (!mask) - mask = default_mask; memcpy(&id.vni[1], spec->vni, 3); vxlan.val.tunnel_id = id.vlan_id; memcpy(&id.vni[1], mask->vni, 3); @@ -1534,297 +1434,1571 @@ mlx5_flow_create_vxlan(const struct rte_flow_item *item, vxlan.val.tunnel_id &= vxlan.mask.tunnel_id; } /* + * Tunnel id 0 is equivalent as not adding a VXLAN layer, if + * only this layer is defined in the Verbs specification it is + * interpreted as wildcard and all packets will match this + * rule, if it follows a full stack layer (ex: eth / ipv4 / + * udp), all packets matching the layers before will also + * match this rule. To avoid such situation, VNI 0 is + * currently refused. + */ + if (!vxlan.val.tunnel_id) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VXLAN vni cannot be 0"); + if (!(flow->layers & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VXLAN tunnel must be fully defined"); + if (size <= flow_size) { + mlx5_flow_spec_verbs_add(flow, &vxlan, size); + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; + } + flow->layers |= MLX5_FLOW_LAYER_VXLAN; + return size; +} + +/** + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] item + * Item specification. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. + */ +static int +mlx5_flow_item_vxlan_gpe(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + struct rte_flow *flow, const size_t flow_size, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vxlan_gpe *spec = item->spec; + const struct rte_flow_item_vxlan_gpe *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel vxlan_gpe = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; + int ret; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!((struct priv *)dev->data->dev_private)->config.l3_vxlan_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 VXLAN is not enabled by device" + " parameter and/or not configured in" + " firmware"); + if (flow->layers & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "a tunnel is already present"); + /* + * Verify only UDPv4 is present as defined in + * https://tools.ietf.org/html/rfc7348 + */ + if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "no outer UDP layer found"); + if (!mask) + mask = &rte_flow_item_vxlan_gpe_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_vxlan_gpe_mask, + sizeof(struct rte_flow_item_vxlan_gpe), error); + if (ret < 0) + return ret; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + vxlan_gpe.val.tunnel_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); + vxlan_gpe.mask.tunnel_id = id.vlan_id; + if (spec->protocol) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VxLAN-GPE protocol not supported"); + /* Remove unwanted bits from values. */ + vxlan_gpe.val.tunnel_id &= vxlan_gpe.mask.tunnel_id; + } + /* * Tunnel id 0 is equivalent as not adding a VXLAN layer, if only this * layer is defined in the Verbs specification it is interpreted as * wildcard and all packets will match this rule, if it follows a full * stack layer (ex: eth / ipv4 / udp), all packets matching the layers - * before will also match this rule. - * To avoid such situation, VNI 0 is currently refused. + * before will also match this rule. To avoid such situation, VNI 0 + * is currently refused. */ - if (!vxlan.val.tunnel_id) - return EINVAL; - mlx5_flow_create_copy(parser, &vxlan, size); + if (!vxlan_gpe.val.tunnel_id) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VXLAN-GPE vni cannot be 0"); + if (!(flow->layers & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VXLAN-GPE tunnel must be fully" + " defined"); + if (size <= flow_size) { + mlx5_flow_spec_verbs_add(flow, &vxlan_gpe, size); + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; + } + flow->layers |= MLX5_FLOW_LAYER_VXLAN_GPE; + return size; +} + +/** + * Update the protocol in Verbs IPv4/IPv6 spec. + * + * @param[in, out] attr + * Pointer to Verbs attributes structure. + * @param[in] search + * Specification type to search in order to update the IP protocol. + * @param[in] protocol + * Protocol value to set if none is present in the specification. + */ +static void +mlx5_flow_item_gre_ip_protocol_update(struct ibv_flow_attr *attr, + enum ibv_flow_spec_type search, + uint8_t protocol) +{ + unsigned int i; + struct ibv_spec_header *hdr = (struct ibv_spec_header *) + ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); + + if (!attr) + return; + for (i = 0; i != attr->num_of_specs; ++i) { + if (hdr->type == search) { + union { + struct ibv_flow_spec_ipv4_ext *ipv4; + struct ibv_flow_spec_ipv6 *ipv6; + } ip; + + switch (search) { + case IBV_FLOW_SPEC_IPV4_EXT: + ip.ipv4 = (struct ibv_flow_spec_ipv4_ext *)hdr; + if (!ip.ipv4->val.proto) { + ip.ipv4->val.proto = protocol; + ip.ipv4->mask.proto = 0xff; + } + break; + case IBV_FLOW_SPEC_IPV6: + ip.ipv6 = (struct ibv_flow_spec_ipv6 *)hdr; + if (!ip.ipv6->val.next_hdr) { + ip.ipv6->val.next_hdr = protocol; + ip.ipv6->mask.next_hdr = 0xff; + } + break; + default: + break; + } + break; + } + hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); + } +} + +/** + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * It will also update the previous L3 layer with the protocol value matching + * the GRE. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] item + * Item specification. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. + */ +static int +mlx5_flow_item_gre(const struct rte_flow_item *item, + struct rte_flow *flow, const size_t flow_size, + struct rte_flow_error *error) +{ + struct mlx5_flow_verbs *verbs = flow->cur_verbs; + const struct rte_flow_item_gre *spec = item->spec; + const struct rte_flow_item_gre *mask = item->mask; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + unsigned int size = sizeof(struct ibv_flow_spec_gre); + struct ibv_flow_spec_gre tunnel = { + .type = IBV_FLOW_SPEC_GRE, + .size = size, + }; +#else + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel tunnel = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; +#endif + int ret; + + if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_GRE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "protocol filtering not compatible" + " with this GRE layer"); + if (flow->layers & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "a tunnel is already present"); + if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L3)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "L3 Layer is missing"); + if (!mask) + mask = &rte_flow_item_gre_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_gre_mask, + sizeof(struct rte_flow_item_gre), error); + if (ret < 0) + return ret; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + if (spec) { + tunnel.val.c_ks_res0_ver = spec->c_rsvd0_ver; + tunnel.val.protocol = spec->protocol; + tunnel.mask.c_ks_res0_ver = mask->c_rsvd0_ver; + tunnel.mask.protocol = mask->protocol; + /* Remove unwanted bits from values. */ + tunnel.val.c_ks_res0_ver &= tunnel.mask.c_ks_res0_ver; + tunnel.val.protocol &= tunnel.mask.protocol; + tunnel.val.key &= tunnel.mask.key; + } +#else + if (spec && (spec->protocol & mask->protocol)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "without MPLS support the" + " specification cannot be used for" + " filtering"); +#endif /* !HAVE_IBV_DEVICE_MPLS_SUPPORT */ + if (size <= flow_size) { + if (flow->layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4) + mlx5_flow_item_gre_ip_protocol_update + (verbs->attr, IBV_FLOW_SPEC_IPV4_EXT, + MLX5_IP_PROTOCOL_GRE); + else + mlx5_flow_item_gre_ip_protocol_update + (verbs->attr, IBV_FLOW_SPEC_IPV6, + MLX5_IP_PROTOCOL_GRE); + mlx5_flow_spec_verbs_add(flow, &tunnel, size); + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; + } + flow->layers |= MLX5_FLOW_LAYER_GRE; + return size; +} + +/** + * Convert the @p item into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. + * + * @param[in] item + * Item specification. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p item has fully been converted, + * otherwise another call with this returned memory size should be done. + * On error, a negative errno value is returned and rte_errno is set. + */ +static int +mlx5_flow_item_mpls(const struct rte_flow_item *item __rte_unused, + struct rte_flow *flow __rte_unused, + const size_t flow_size __rte_unused, + struct rte_flow_error *error) +{ +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + const struct rte_flow_item_mpls *spec = item->spec; + const struct rte_flow_item_mpls *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_mpls); + struct ibv_flow_spec_mpls mpls = { + .type = IBV_FLOW_SPEC_MPLS, + .size = size, + }; + int ret; + + if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_MPLS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "protocol filtering not compatible" + " with MPLS layer"); + /* Multi-tunnel isn't allowed but MPLS over GRE is an exception. */ + if (flow->layers & MLX5_FLOW_LAYER_TUNNEL && + (flow->layers & MLX5_FLOW_LAYER_GRE) != MLX5_FLOW_LAYER_GRE) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "a tunnel is already" + " present"); + if (!mask) + mask = &rte_flow_item_mpls_mask; + ret = mlx5_flow_item_acceptable + (item, (const uint8_t *)mask, + (const uint8_t *)&rte_flow_item_mpls_mask, + sizeof(struct rte_flow_item_mpls), error); + if (ret < 0) + return ret; + if (spec) { + memcpy(&mpls.val.label, spec, sizeof(mpls.val.label)); + memcpy(&mpls.mask.label, mask, sizeof(mpls.mask.label)); + /* Remove unwanted bits from values. */ + mpls.val.label &= mpls.mask.label; + } + if (size <= flow_size) { + mlx5_flow_spec_verbs_add(flow, &mpls, size); + flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; + } + flow->layers |= MLX5_FLOW_LAYER_MPLS; + return size; +#endif /* !HAVE_IBV_DEVICE_MPLS_SUPPORT */ + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "MPLS is not supported by Verbs, please" + " update."); +} + +/** + * Convert the @p pattern into a Verbs specifications after ensuring the NIC + * will understand and process it correctly. + * The conversion is performed item per item, each of them is written into + * the @p flow if its size is lesser or equal to @p flow_size. + * Validation and memory consumption computation are still performed until the + * end of @p pattern, unless an error is encountered. + * + * @param[in] pattern + * Flow pattern. + * @param[in, out] flow + * Pointer to the rte_flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small some + * garbage may be present. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @pattern has fully been + * converted, otherwise another call with this returned memory size should + * be done. + * On error, a negative errno value is returned and rte_errno is set. + */ +static int +mlx5_flow_items(struct rte_eth_dev *dev, + const struct rte_flow_item pattern[], + struct rte_flow *flow, const size_t flow_size, + struct rte_flow_error *error) +{ + int remain = flow_size; + size_t size = 0; + + for (; pattern->type != RTE_FLOW_ITEM_TYPE_END; pattern++) { + int ret = 0; + + switch (pattern->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_item_eth(pattern, flow, remain, error); + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + ret = mlx5_flow_item_vlan(pattern, flow, remain, error); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ret = mlx5_flow_item_ipv4(pattern, flow, remain, error); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ret = mlx5_flow_item_ipv6(pattern, flow, remain, error); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_item_udp(pattern, flow, remain, error); + break; + case RTE_FLOW_ITEM_TYPE_TCP: + ret = mlx5_flow_item_tcp(pattern, flow, remain, error); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + ret = mlx5_flow_item_vxlan(pattern, flow, remain, + error); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + ret = mlx5_flow_item_vxlan_gpe(dev, pattern, flow, + remain, error); + break; + case RTE_FLOW_ITEM_TYPE_GRE: + ret = mlx5_flow_item_gre(pattern, flow, remain, error); + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + ret = mlx5_flow_item_mpls(pattern, flow, remain, error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + pattern, + "item not supported"); + } + if (ret < 0) + return ret; + if (remain > ret) + remain -= ret; + else + remain = 0; + size += ret; + } + if (!flow->layers) { + const struct rte_flow_item item = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + }; + + return mlx5_flow_item_eth(&item, flow, flow_size, error); + } + return size; +} + +/** + * Convert the @p action into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. + * + * @param[in] action + * Action configuration. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. + * + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p action has fully been + * converted, otherwise another call with this returned memory size should + * be done. + * On error, a negative errno value is returned and rte_errno is set. + */ +static int +mlx5_flow_action_drop(const struct rte_flow_action *action, + struct rte_flow *flow, const size_t flow_size, + struct rte_flow_error *error) +{ + unsigned int size = sizeof(struct ibv_flow_spec_action_drop); + struct ibv_flow_spec_action_drop drop = { + .type = IBV_FLOW_SPEC_ACTION_DROP, + .size = size, + }; + + if (flow->fate) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "multiple fate actions are not" + " supported"); + if (flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "drop is not compatible with" + " flag/mark action"); + if (size < flow_size) + mlx5_flow_spec_verbs_add(flow, &drop, size); + flow->fate |= MLX5_FLOW_FATE_DROP; + return size; +} + +/** + * Convert the @p action into @p flow after ensuring the NIC will understand + * and process it correctly. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in] action + * Action configuration. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_action_queue(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + const struct rte_flow_action_queue *queue = action->conf; + + if (flow->fate) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "multiple fate actions are not" + " supported"); + if (queue->index >= priv->rxqs_n) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &queue->index, + "queue index out of range"); + if (!(*priv->rxqs)[queue->index]) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &queue->index, + "queue is not configured"); + if (flow->queue) + (*flow->queue)[0] = queue->index; + flow->rss.queue_num = 1; + flow->fate |= MLX5_FLOW_FATE_QUEUE; + return 0; +} + +/** + * Ensure the @p action will be understood and used correctly by the NIC. + * + * @param dev + * Pointer to Ethernet device structure. + * @param action[in] + * Pointer to flow actions array. + * @param flow[in, out] + * Pointer to the rte_flow structure. + * @param error[in, out] + * Pointer to error structure. + * + * @return + * On success @p flow->queue array and @p flow->rss are filled and valid. + * On error, a negative errno value is returned and rte_errno is set. + */ +static int +mlx5_flow_action_rss(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + const struct rte_flow_action_rss *rss = action->conf; + unsigned int i; + + if (flow->fate) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "multiple fate actions are not" + " supported"); + if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT && + rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->func, + "RSS hash function not supported"); +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (rss->level > 2) +#else + if (rss->level > 1) +#endif + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->level, + "tunnel RSS is not supported"); + if (rss->key_len < MLX5_RSS_HASH_KEY_LEN) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key too small"); + if (rss->key_len > MLX5_RSS_HASH_KEY_LEN) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key too large"); + if (!rss->queue_num) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + rss, + "no queues were provided for RSS"); + if (rss->queue_num > priv->config.ind_table_max_size) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue_num, + "number of queues too large"); + if (rss->types & MLX5_RSS_HF_MASK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->types, + "some RSS protocols are not" + " supported"); + for (i = 0; i != rss->queue_num; ++i) { + if (rss->queue[i] >= priv->rxqs_n) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + rss, + "queue index out of range"); + if (!(*priv->rxqs)[rss->queue[i]]) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue[i], + "queue is not configured"); + } + if (flow->queue) + memcpy((*flow->queue), rss->queue, + rss->queue_num * sizeof(uint16_t)); + flow->rss.queue_num = rss->queue_num; + memcpy(flow->key, rss->key, MLX5_RSS_HASH_KEY_LEN); + flow->rss.types = rss->types; + flow->rss.level = rss->level; + flow->fate |= MLX5_FLOW_FATE_RSS; return 0; } /** - * Convert mark/flag action to Verbs specification. + * Convert the @p action into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. + * + * @param[in] action + * Action configuration. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. * - * @param parser - * Internal parser structure. - * @param mark_id - * Mark identifier. + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p action has fully been + * converted, otherwise another call with this returned memory size should + * be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_flag_mark(struct mlx5_flow_parse *parser, uint32_t mark_id) +mlx5_flow_action_flag(const struct rte_flow_action *action, + struct rte_flow *flow, const size_t flow_size, + struct rte_flow_error *error) { unsigned int size = sizeof(struct ibv_flow_spec_action_tag); struct ibv_flow_spec_action_tag tag = { .type = IBV_FLOW_SPEC_ACTION_TAG, .size = size, - .tag_id = mlx5_flow_mark_set(mark_id), + .tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT), }; + struct mlx5_flow_verbs *verbs = flow->cur_verbs; + + if (flow->modifier & MLX5_FLOW_MOD_FLAG) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "flag action already present"); + if (flow->fate & MLX5_FLOW_FATE_DROP) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "flag is not compatible with drop" + " action"); + if (flow->modifier & MLX5_FLOW_MOD_MARK) + size = 0; + else if (size <= flow_size && verbs) + mlx5_flow_spec_verbs_add(flow, &tag, size); + flow->modifier |= MLX5_FLOW_MOD_FLAG; + return size; +} - assert(parser->mark); - mlx5_flow_create_copy(parser, &tag, size); - return 0; +/** + * Update verbs specification to modify the flag to mark. + * + * @param[in, out] verbs + * Pointer to the mlx5_flow_verbs structure. + * @param[in] mark_id + * Mark identifier to replace the flag. + */ +static void +mlx5_flow_verbs_mark_update(struct mlx5_flow_verbs *verbs, uint32_t mark_id) +{ + struct ibv_spec_header *hdr; + int i; + + if (!verbs) + return; + /* Update Verbs specification. */ + hdr = (struct ibv_spec_header *)verbs->specs; + if (!hdr) + return; + for (i = 0; i != verbs->attr->num_of_specs; ++i) { + if (hdr->type == IBV_FLOW_SPEC_ACTION_TAG) { + struct ibv_flow_spec_action_tag *t = + (struct ibv_flow_spec_action_tag *)hdr; + + t->tag_id = mlx5_flow_mark_set(mark_id); + } + hdr = (struct ibv_spec_header *)((uintptr_t)hdr + hdr->size); + } } /** - * Convert count action to Verbs specification. + * Convert the @p action into @p flow (or by updating the already present + * Flag Verbs specification) after ensuring the NIC will understand and + * process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. + * + * @param[in] action + * Action configuration. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param[out] error + * Pointer to error structure. * - * @param priv - * Pointer to private structure. - * @param parser - * Pointer to MLX5 flow parser structure. + * @return + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p action has fully been + * converted, otherwise another call with this returned memory size should + * be done. + * On error, a negative errno value is returned and rte_errno is set. + */ +static int +mlx5_flow_action_mark(const struct rte_flow_action *action, + struct rte_flow *flow, const size_t flow_size, + struct rte_flow_error *error) +{ + const struct rte_flow_action_mark *mark = action->conf; + unsigned int size = sizeof(struct ibv_flow_spec_action_tag); + struct ibv_flow_spec_action_tag tag = { + .type = IBV_FLOW_SPEC_ACTION_TAG, + .size = size, + }; + struct mlx5_flow_verbs *verbs = flow->cur_verbs; + + if (!mark) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "configuration cannot be null"); + if (mark->id >= MLX5_FLOW_MARK_MAX) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &mark->id, + "mark id must in 0 <= id < " + RTE_STR(MLX5_FLOW_MARK_MAX)); + if (flow->modifier & MLX5_FLOW_MOD_MARK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "mark action already present"); + if (flow->fate & MLX5_FLOW_FATE_DROP) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "mark is not compatible with drop" + " action"); + if (flow->modifier & MLX5_FLOW_MOD_FLAG) { + mlx5_flow_verbs_mark_update(verbs, mark->id); + size = 0; + } else if (size <= flow_size) { + tag.tag_id = mlx5_flow_mark_set(mark->id); + mlx5_flow_spec_verbs_add(flow, &tag, size); + } + flow->modifier |= MLX5_FLOW_MOD_MARK; + return size; +} + +/** + * Convert the @p action into a Verbs specification after ensuring the NIC + * will understand and process it correctly. + * If the necessary size for the conversion is greater than the @p flow_size, + * nothing is written in @p flow, the validation is still performed. + * + * @param action[in] + * Action configuration. + * @param flow[in, out] + * Pointer to flow structure. + * @param flow_size[in] + * Size in bytes of the available space in @p flow, if too small, nothing is + * written. + * @param error[int, out] + * Pointer to error structure. * * @return - * 0 on success, errno value on failure. + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p action has fully been + * converted, otherwise another call with this returned memory size should + * be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -mlx5_flow_create_count(struct priv *priv __rte_unused, - struct mlx5_flow_parse *parser __rte_unused) +mlx5_flow_action_count(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct rte_flow *flow, + const size_t flow_size __rte_unused, + struct rte_flow_error *error) { + const struct rte_flow_action_count *count = action->conf; #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT unsigned int size = sizeof(struct ibv_flow_spec_counter_action); - struct ibv_counter_set_init_attr init_attr = {0}; struct ibv_flow_spec_counter_action counter = { .type = IBV_FLOW_SPEC_ACTION_COUNT, .size = size, - .counter_set_handle = 0, }; +#endif - init_attr.counter_set_id = 0; - parser->cs = mlx5_glue->create_counter_set(priv->ctx, &init_attr); - if (!parser->cs) - return EINVAL; - counter.counter_set_handle = parser->cs->handle; - mlx5_flow_create_copy(parser, &counter, size); + if (!flow->counter) { + flow->counter = mlx5_flow_counter_new(dev, count->shared, + count->id); + if (!flow->counter) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "cannot get counter" + " context."); + } + if (!((struct priv *)dev->data->dev_private)->config.flow_counter_en) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "flow counters are not supported."); + flow->modifier |= MLX5_FLOW_MOD_COUNT; +#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT + counter.counter_set_handle = flow->counter->cs->handle; + if (size <= flow_size) + mlx5_flow_spec_verbs_add(flow, &counter, size); + return size; #endif return 0; } /** - * Complete flow rule creation with a drop queue. - * - * @param priv - * Pointer to private structure. - * @param parser - * Internal parser structure. - * @param flow - * Pointer to the rte_flow. + * Convert the @p action into @p flow after ensuring the NIC will understand + * and process it correctly. + * The conversion is performed action per action, each of them is written into + * the @p flow if its size is lesser or equal to @p flow_size. + * Validation and memory consumption computation are still performed until the + * end of @p action, unless an error is encountered. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in] actions + * Pointer to flow actions array. + * @param[in, out] flow + * Pointer to the rte_flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small some + * garbage may be present. * @param[out] error - * Perform verbose error reporting if not NULL. + * Pointer to error structure. * * @return - * 0 on success, errno value on failure. + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the @p actions has fully been + * converted, otherwise another call with this returned memory size should + * be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -priv_flow_create_action_queue_drop(struct priv *priv, - struct mlx5_flow_parse *parser, - struct rte_flow *flow, - struct rte_flow_error *error) +mlx5_flow_actions(struct rte_eth_dev *dev, + const struct rte_flow_action actions[], + struct rte_flow *flow, const size_t flow_size, + struct rte_flow_error *error) { - struct ibv_flow_spec_action_drop *drop; - unsigned int size = sizeof(struct ibv_flow_spec_action_drop); - int err = 0; - - assert(priv->pd); - assert(priv->ctx); - flow->drop = 1; - drop = (void *)((uintptr_t)parser->queue[HASH_RXQ_ETH].ibv_attr + - parser->queue[HASH_RXQ_ETH].offset); - *drop = (struct ibv_flow_spec_action_drop){ - .type = IBV_FLOW_SPEC_ACTION_DROP, - .size = size, - }; - ++parser->queue[HASH_RXQ_ETH].ibv_attr->num_of_specs; - parser->queue[HASH_RXQ_ETH].offset += size; - flow->frxq[HASH_RXQ_ETH].ibv_attr = - parser->queue[HASH_RXQ_ETH].ibv_attr; - if (parser->count) - flow->cs = parser->cs; - if (!priv->dev->data->dev_started) - return 0; - parser->queue[HASH_RXQ_ETH].ibv_attr = NULL; - flow->frxq[HASH_RXQ_ETH].ibv_flow = - mlx5_glue->create_flow(priv->flow_drop_queue->qp, - flow->frxq[HASH_RXQ_ETH].ibv_attr); - if (!flow->frxq[HASH_RXQ_ETH].ibv_flow) { - rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "flow rule creation failure"); - err = ENOMEM; - goto error; - } - return 0; -error: - assert(flow); - if (flow->frxq[HASH_RXQ_ETH].ibv_flow) { - claim_zero(mlx5_glue->destroy_flow - (flow->frxq[HASH_RXQ_ETH].ibv_flow)); - flow->frxq[HASH_RXQ_ETH].ibv_flow = NULL; - } - if (flow->frxq[HASH_RXQ_ETH].ibv_attr) { - rte_free(flow->frxq[HASH_RXQ_ETH].ibv_attr); - flow->frxq[HASH_RXQ_ETH].ibv_attr = NULL; - } - if (flow->cs) { - claim_zero(mlx5_glue->destroy_counter_set(flow->cs)); - flow->cs = NULL; - parser->cs = NULL; - } - return err; + size_t size = 0; + int remain = flow_size; + int ret = 0; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + ret = mlx5_flow_action_flag(actions, flow, remain, + error); + break; + case RTE_FLOW_ACTION_TYPE_MARK: + ret = mlx5_flow_action_mark(actions, flow, remain, + error); + break; + case RTE_FLOW_ACTION_TYPE_DROP: + ret = mlx5_flow_action_drop(actions, flow, remain, + error); + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + ret = mlx5_flow_action_queue(dev, actions, flow, error); + break; + case RTE_FLOW_ACTION_TYPE_RSS: + ret = mlx5_flow_action_rss(dev, actions, flow, error); + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = mlx5_flow_action_count(dev, actions, flow, remain, + error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + if (ret < 0) + return ret; + if (remain > ret) + remain -= ret; + else + remain = 0; + size += ret; + } + if (!flow->fate) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "no fate action found"); + return size; } /** - * Create hash Rx queues when RSS is enabled. + * Validate flow rule and fill flow structure accordingly. * - * @param priv - * Pointer to private structure. - * @param parser - * Internal parser structure. - * @param flow - * Pointer to the rte_flow. + * @param dev + * Pointer to Ethernet device. + * @param[out] flow + * Pointer to flow structure. + * @param flow_size + * Size of allocated space for @p flow. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). * @param[out] error * Perform verbose error reporting if not NULL. * * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * A positive value representing the size of the flow object in bytes + * regardless of @p flow_size on success, a negative errno value otherwise + * and rte_errno is set. */ static int -priv_flow_create_action_queue_rss(struct priv *priv, - struct mlx5_flow_parse *parser, - struct rte_flow *flow, - struct rte_flow_error *error) +mlx5_flow_merge_switch(struct rte_eth_dev *dev, + struct rte_flow *flow, + size_t flow_size, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) { + unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0); + uint16_t port_id[!n + n]; + struct mlx5_nl_flow_ptoi ptoi[!n + n + 1]; + size_t off = RTE_ALIGN_CEIL(sizeof(*flow), alignof(max_align_t)); unsigned int i; + unsigned int own = 0; + int ret; - for (i = 0; i != hash_rxq_init_n; ++i) { - uint64_t hash_fields; + /* At least one port is needed when no switch domain is present. */ + if (!n) { + n = 1; + port_id[0] = dev->data->port_id; + } else { + n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n); + } + for (i = 0; i != n; ++i) { + struct rte_eth_dev_info dev_info; + + rte_eth_dev_info_get(port_id[i], &dev_info); + if (port_id[i] == dev->data->port_id) + own = i; + ptoi[i].port_id = port_id[i]; + ptoi[i].ifindex = dev_info.if_index; + } + /* Ensure first entry of ptoi[] is the current device. */ + if (own) { + ptoi[n] = ptoi[0]; + ptoi[0] = ptoi[own]; + ptoi[own] = ptoi[n]; + } + /* An entry with zero ifindex terminates ptoi[]. */ + ptoi[n].port_id = 0; + ptoi[n].ifindex = 0; + if (flow_size < off) + flow_size = 0; + ret = mlx5_nl_flow_transpose((uint8_t *)flow + off, + flow_size ? flow_size - off : 0, + ptoi, attr, pattern, actions, error); + if (ret < 0) + return ret; + if (flow_size) { + *flow = (struct rte_flow){ + .attributes = *attr, + .nl_flow = (uint8_t *)flow + off, + }; + /* + * Generate a reasonably unique handle based on the address + * of the target buffer. + * + * This is straightforward on 32-bit systems where the flow + * pointer can be used directly. Otherwise, its least + * significant part is taken after shifting it by the + * previous power of two of the pointed buffer size. + */ + if (sizeof(flow) <= 4) + mlx5_nl_flow_brand(flow->nl_flow, (uintptr_t)flow); + else + mlx5_nl_flow_brand + (flow->nl_flow, + (uintptr_t)flow >> + rte_log2_u32(rte_align32prevpow2(flow_size))); + } + return off + ret; +} - if (!parser->queue[i].ibv_attr) - continue; - flow->frxq[i].ibv_attr = parser->queue[i].ibv_attr; - parser->queue[i].ibv_attr = NULL; - hash_fields = hash_rxq_init[i].hash_fields; - if (!priv->dev->data->dev_started) - continue; - flow->frxq[i].hrxq = - mlx5_priv_hrxq_get(priv, - parser->rss_conf.rss_key, - parser->rss_conf.rss_key_len, - hash_fields, - parser->queues, - parser->queues_n); - if (flow->frxq[i].hrxq) - continue; - flow->frxq[i].hrxq = - mlx5_priv_hrxq_new(priv, - parser->rss_conf.rss_key, - parser->rss_conf.rss_key_len, - hash_fields, - parser->queues, - parser->queues_n); - if (!flow->frxq[i].hrxq) { - rte_flow_error_set(error, ENOMEM, - RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "cannot create hash rxq"); - return ENOMEM; +static unsigned int +mlx5_find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level) +{ + const struct rte_flow_item *item; + unsigned int has_vlan = 0; + + for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { + if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) { + has_vlan = 1; + break; } } - return 0; + if (has_vlan) + return rss_level < 2 ? MLX5_EXPANSION_ROOT_ETH_VLAN : + MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN; + return rss_level < 2 ? MLX5_EXPANSION_ROOT : + MLX5_EXPANSION_ROOT_OUTER; } /** - * Complete flow rule creation. - * - * @param priv - * Pointer to private structure. - * @param parser - * Internal parser structure. - * @param flow - * Pointer to the rte_flow. + * Convert the @p attributes, @p pattern, @p action, into an flow for the NIC + * after ensuring the NIC will understand and process it correctly. + * The conversion is only performed item/action per item/action, each of + * them is written into the @p flow if its size is lesser or equal to @p + * flow_size. + * Validation and memory consumption computation are still performed until the + * end, unless an error is encountered. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] flow_size + * Size in bytes of the available space in @p flow, if too small some + * garbage may be present. + * @param[in] attributes + * Flow rule attributes. + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). * @param[out] error * Perform verbose error reporting if not NULL. * * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * On success the number of bytes consumed/necessary, if the returned value + * is lesser or equal to @p flow_size, the flow has fully been converted and + * can be applied, otherwise another call with this returned memory size + * should be done. + * On error, a negative errno value is returned and rte_errno is set. */ static int -priv_flow_create_action_queue(struct priv *priv, - struct mlx5_flow_parse *parser, - struct rte_flow *flow, - struct rte_flow_error *error) +mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow, + const size_t flow_size, + const struct rte_flow_attr *attributes, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct rte_flow local_flow = { .layers = 0, }; + size_t size = sizeof(*flow); + union { + struct rte_flow_expand_rss buf; + uint8_t buffer[2048]; + } expand_buffer; + struct rte_flow_expand_rss *buf = &expand_buffer.buf; + struct mlx5_flow_verbs *original_verbs = NULL; + size_t original_verbs_size = 0; + uint32_t original_layers = 0; + int expanded_pattern_idx = 0; + int ret; + uint32_t i; + + if (attributes->transfer) + return mlx5_flow_merge_switch(dev, flow, flow_size, + attributes, pattern, + actions, error); + if (size > flow_size) + flow = &local_flow; + ret = mlx5_flow_attributes(dev, attributes, flow, error); + if (ret < 0) + return ret; + ret = mlx5_flow_actions(dev, actions, &local_flow, 0, error); + if (ret < 0) + return ret; + if (local_flow.rss.types) { + unsigned int graph_root; + + graph_root = mlx5_find_graph_root(pattern, + local_flow.rss.level); + ret = rte_flow_expand_rss(buf, sizeof(expand_buffer.buffer), + pattern, local_flow.rss.types, + mlx5_support_expansion, + graph_root); + assert(ret > 0 && + (unsigned int)ret < sizeof(expand_buffer.buffer)); + } else { + buf->entries = 1; + buf->entry[0].pattern = (void *)(uintptr_t)pattern; + } + size += RTE_ALIGN_CEIL(local_flow.rss.queue_num * sizeof(uint16_t), + sizeof(void *)); + if (size <= flow_size) + flow->queue = (void *)(flow + 1); + LIST_INIT(&flow->verbs); + flow->layers = 0; + flow->modifier = 0; + flow->fate = 0; + for (i = 0; i != buf->entries; ++i) { + size_t off = size; + size_t off2; + + flow->layers = original_layers; + size += sizeof(struct ibv_flow_attr) + + sizeof(struct mlx5_flow_verbs); + off2 = size; + if (size < flow_size) { + flow->cur_verbs = (void *)((uintptr_t)flow + off); + flow->cur_verbs->attr = (void *)(flow->cur_verbs + 1); + flow->cur_verbs->specs = + (void *)(flow->cur_verbs->attr + 1); + } + /* First iteration convert the pattern into Verbs. */ + if (i == 0) { + /* Actions don't need to be converted several time. */ + ret = mlx5_flow_actions(dev, actions, flow, + (size < flow_size) ? + flow_size - size : 0, + error); + if (ret < 0) + return ret; + size += ret; + } else { + /* + * Next iteration means the pattern has already been + * converted and an expansion is necessary to match + * the user RSS request. For that only the expanded + * items will be converted, the common part with the + * user pattern are just copied into the next buffer + * zone. + */ + size += original_verbs_size; + if (size < flow_size) { + rte_memcpy(flow->cur_verbs->attr, + original_verbs->attr, + original_verbs_size + + sizeof(struct ibv_flow_attr)); + flow->cur_verbs->size = original_verbs_size; + } + } + ret = mlx5_flow_items + (dev, + (const struct rte_flow_item *) + &buf->entry[i].pattern[expanded_pattern_idx], + flow, + (size < flow_size) ? flow_size - size : 0, error); + if (ret < 0) + return ret; + size += ret; + if (size <= flow_size) { + mlx5_flow_adjust_priority(dev, flow); + LIST_INSERT_HEAD(&flow->verbs, flow->cur_verbs, next); + } + /* + * Keep a pointer of the first verbs conversion and the layers + * it has encountered. + */ + if (i == 0) { + original_verbs = flow->cur_verbs; + original_verbs_size = size - off2; + original_layers = flow->layers; + /* + * move the index of the expanded pattern to the + * first item not addressed yet. + */ + if (pattern->type == RTE_FLOW_ITEM_TYPE_END) { + expanded_pattern_idx++; + } else { + const struct rte_flow_item *item = pattern; + + for (item = pattern; + item->type != RTE_FLOW_ITEM_TYPE_END; + ++item) + expanded_pattern_idx++; + } + } + } + /* Restore the origin layers in the flow. */ + flow->layers = original_layers; + return size; +} + +/** + * Lookup and set the ptype in the data Rx part. A single Ptype can be used, + * if several tunnel rules are used on this queue, the tunnel ptype will be + * cleared. + * + * @param rxq_ctrl + * Rx queue to update. + */ +static void +mlx5_flow_rxq_tunnel_ptype_update(struct mlx5_rxq_ctrl *rxq_ctrl) { - int err = 0; unsigned int i; + uint32_t tunnel_ptype = 0; - assert(priv->pd); - assert(priv->ctx); - assert(!parser->drop); - err = priv_flow_create_action_queue_rss(priv, parser, flow, error); - if (err) - goto error; - if (parser->count) - flow->cs = parser->cs; - if (!priv->dev->data->dev_started) - return 0; - for (i = 0; i != hash_rxq_init_n; ++i) { - if (!flow->frxq[i].hrxq) + /* Look up for the ptype to use. */ + for (i = 0; i != MLX5_FLOW_TUNNEL; ++i) { + if (!rxq_ctrl->flow_tunnels_n[i]) continue; - flow->frxq[i].ibv_flow = - mlx5_glue->create_flow(flow->frxq[i].hrxq->qp, - flow->frxq[i].ibv_attr); - if (!flow->frxq[i].ibv_flow) { - rte_flow_error_set(error, ENOMEM, - RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "flow rule creation failure"); - err = ENOMEM; - goto error; + if (!tunnel_ptype) { + tunnel_ptype = tunnels_info[i].ptype; + } else { + tunnel_ptype = 0; + break; + } + } + rxq_ctrl->rxq.tunnel = tunnel_ptype; +} + +/** + * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) according to the flow. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] flow + * Pointer to flow structure. + */ +static void +mlx5_flow_rxq_flags_set(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct priv *priv = dev->data->dev_private; + const int mark = !!(flow->modifier & + (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK)); + const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int i; + + for (i = 0; i != flow->rss.queue_num; ++i) { + int idx = (*flow->queue)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, rxq); + + if (mark) { + rxq_ctrl->rxq.mark = 1; + rxq_ctrl->flow_mark_n++; + } + if (tunnel) { + unsigned int j; + + /* Increase the counter matching the flow. */ + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { + if ((tunnels_info[j].tunnel & flow->layers) == + tunnels_info[j].tunnel) { + rxq_ctrl->flow_tunnels_n[j]++; + break; + } + } + mlx5_flow_rxq_tunnel_ptype_update(rxq_ctrl); + } + } +} + +/** + * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the + * @p flow if no other flow uses it with the same kind of request. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] flow + * Pointer to the flow. + */ +static void +mlx5_flow_rxq_flags_trim(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct priv *priv = dev->data->dev_private; + const int mark = !!(flow->modifier & + (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK)); + const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int i; + + assert(dev->data->dev_started); + for (i = 0; i != flow->rss.queue_num; ++i) { + int idx = (*flow->queue)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, rxq); + + if (mark) { + rxq_ctrl->flow_mark_n--; + rxq_ctrl->rxq.mark = !!rxq_ctrl->flow_mark_n; + } + if (tunnel) { + unsigned int j; + + /* Decrease the counter matching the flow. */ + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { + if ((tunnels_info[j].tunnel & flow->layers) == + tunnels_info[j].tunnel) { + rxq_ctrl->flow_tunnels_n[j]--; + break; + } + } + mlx5_flow_rxq_tunnel_ptype_update(rxq_ctrl); } - DEBUG("%p type %d QP %p ibv_flow %p", - (void *)flow, i, - (void *)flow->frxq[i].hrxq, - (void *)flow->frxq[i].ibv_flow); } - for (i = 0; i != parser->queues_n; ++i) { - struct mlx5_rxq_data *q = - (*priv->rxqs)[parser->queues[i]]; +} + +/** + * Clear the Mark/Flag and Tunnel ptype information in all Rx queues. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +mlx5_flow_rxq_flags_clear(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + unsigned int i; + + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_ctrl *rxq_ctrl; + unsigned int j; - q->mark |= parser->mark; + if (!(*priv->rxqs)[i]) + continue; + rxq_ctrl = container_of((*priv->rxqs)[i], + struct mlx5_rxq_ctrl, rxq); + rxq_ctrl->flow_mark_n = 0; + rxq_ctrl->rxq.mark = 0; + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) + rxq_ctrl->flow_tunnels_n[j] = 0; + rxq_ctrl->rxq.tunnel = 0; } +} + +/** + * Validate a flow supported by the NIC. + * + * @see rte_flow_validate() + * @see rte_flow_ops + */ +int +mlx5_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + int ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error); + + if (ret < 0) + return ret; return 0; -error: - assert(flow); - for (i = 0; i != hash_rxq_init_n; ++i) { - if (flow->frxq[i].ibv_flow) { - struct ibv_flow *ibv_flow = flow->frxq[i].ibv_flow; +} + +/** + * Remove the flow. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +mlx5_flow_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_verbs *verbs; + + if (flow->nl_flow && priv->mnl_socket) + mlx5_nl_flow_destroy(priv->mnl_socket, flow->nl_flow, NULL); + LIST_FOREACH(verbs, &flow->verbs, next) { + if (verbs->flow) { + claim_zero(mlx5_glue->destroy_flow(verbs->flow)); + verbs->flow = NULL; + } + if (verbs->hrxq) { + if (flow->fate & MLX5_FLOW_FATE_DROP) + mlx5_hrxq_drop_release(dev); + else + mlx5_hrxq_release(dev, verbs->hrxq); + verbs->hrxq = NULL; + } + } + if (flow->counter) { + mlx5_flow_counter_release(flow->counter); + flow->counter = NULL; + } +} + +/** + * Apply the flow. + * + * @param[in] dev + * Pointer to Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_verbs *verbs; + int err; - claim_zero(mlx5_glue->destroy_flow(ibv_flow)); + LIST_FOREACH(verbs, &flow->verbs, next) { + if (flow->fate & MLX5_FLOW_FATE_DROP) { + verbs->hrxq = mlx5_hrxq_drop_new(dev); + if (!verbs->hrxq) { + rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot get drop hash queue"); + goto error; + } + } else { + struct mlx5_hrxq *hrxq; + + hrxq = mlx5_hrxq_get(dev, flow->key, + MLX5_RSS_HASH_KEY_LEN, + verbs->hash_fields, + (*flow->queue), + flow->rss.queue_num); + if (!hrxq) + hrxq = mlx5_hrxq_new(dev, flow->key, + MLX5_RSS_HASH_KEY_LEN, + verbs->hash_fields, + (*flow->queue), + flow->rss.queue_num, + !!(flow->layers & + MLX5_FLOW_LAYER_TUNNEL)); + if (!hrxq) { + rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot get hash queue"); + goto error; + } + verbs->hrxq = hrxq; + } + verbs->flow = + mlx5_glue->create_flow(verbs->hrxq->qp, verbs->attr); + if (!verbs->flow) { + rte_flow_error_set(error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "hardware refuses to create flow"); + goto error; } - if (flow->frxq[i].hrxq) - mlx5_priv_hrxq_release(priv, flow->frxq[i].hrxq); - if (flow->frxq[i].ibv_attr) - rte_free(flow->frxq[i].ibv_attr); } - if (flow->cs) { - claim_zero(mlx5_glue->destroy_counter_set(flow->cs)); - flow->cs = NULL; - parser->cs = NULL; + if (flow->nl_flow && + priv->mnl_socket && + mlx5_nl_flow_create(priv->mnl_socket, flow->nl_flow, error)) + goto error; + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + LIST_FOREACH(verbs, &flow->verbs, next) { + if (verbs->hrxq) { + if (flow->fate & MLX5_FLOW_FATE_DROP) + mlx5_hrxq_drop_release(dev); + else + mlx5_hrxq_release(dev, verbs->hrxq); + verbs->hrxq = NULL; + } } - return err; + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; } /** - * Convert a flow. + * Create a flow and add it to @p list. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. * @param[in] attr * Flow rule attributes. - * @param[in] pattern + * @param[in] items * Pattern specification (list terminated by the END pattern item). * @param[in] actions * Associated actions (list terminated by the END action). @@ -1832,85 +3006,53 @@ error: * Perform verbose error reporting if not NULL. * * @return - * A flow on success, NULL otherwise. + * A flow on success, NULL otherwise and rte_errno is set. */ static struct rte_flow * -priv_flow_create(struct priv *priv, - struct mlx5_flows *list, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error) +mlx5_flow_list_create(struct rte_eth_dev *dev, + struct mlx5_flows *list, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) { - struct mlx5_flow_parse parser = { .create = 1, }; struct rte_flow *flow = NULL; - unsigned int i; - int err; + size_t size = 0; + int ret; - err = priv_flow_convert(priv, attr, items, actions, error, &parser); - if (err) - goto exit; - flow = rte_calloc(__func__, 1, - sizeof(*flow) + parser.queues_n * sizeof(uint16_t), - 0); + ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error); + if (ret < 0) + return NULL; + size = ret; + flow = rte_calloc(__func__, 1, size, 0); if (!flow) { rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "cannot allocate flow memory"); + "not enough memory to create flow"); return NULL; } - /* Copy queues configuration. */ - flow->queues = (uint16_t (*)[])(flow + 1); - memcpy(flow->queues, parser.queues, parser.queues_n * sizeof(uint16_t)); - flow->queues_n = parser.queues_n; - flow->mark = parser.mark; - /* Copy RSS configuration. */ - flow->rss_conf = parser.rss_conf; - flow->rss_conf.rss_key = flow->rss_key; - memcpy(flow->rss_key, parser.rss_key, parser.rss_conf.rss_key_len); - /* finalise the flow. */ - if (parser.drop) - err = priv_flow_create_action_queue_drop(priv, &parser, flow, - error); - else - err = priv_flow_create_action_queue(priv, &parser, flow, error); - if (err) - goto exit; + ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error); + if (ret < 0) { + rte_free(flow); + return NULL; + } + assert((size_t)ret == size); + if (dev->data->dev_started) { + ret = mlx5_flow_apply(dev, flow, error); + if (ret < 0) { + ret = rte_errno; /* Save rte_errno before cleanup. */ + if (flow) { + mlx5_flow_remove(dev, flow); + rte_free(flow); + } + rte_errno = ret; /* Restore rte_errno. */ + return NULL; + } + } TAILQ_INSERT_TAIL(list, flow, next); - DEBUG("Flow created %p", (void *)flow); + mlx5_flow_rxq_flags_set(dev, flow); return flow; -exit: - ERROR("flow creation error: %s", error->message); - for (i = 0; i != hash_rxq_init_n; ++i) { - if (parser.queue[i].ibv_attr) - rte_free(parser.queue[i].ibv_attr); - } - rte_free(flow); - return NULL; -} - -/** - * Validate a flow supported by the NIC. - * - * @see rte_flow_validate() - * @see rte_flow_ops - */ -int -mlx5_flow_validate(struct rte_eth_dev *dev, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error) -{ - struct priv *priv = dev->data->dev_private; - int ret; - struct mlx5_flow_parse parser = { .create = 0, }; - - priv_lock(priv); - ret = priv_flow_convert(priv, attr, items, actions, error, &parser); - priv_unlock(priv); - return ret; } /** @@ -1926,379 +3068,123 @@ mlx5_flow_create(struct rte_eth_dev *dev, const struct rte_flow_action actions[], struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; - struct rte_flow *flow; - - priv_lock(priv); - flow = priv_flow_create(priv, &priv->flows, attr, items, actions, - error); - priv_unlock(priv); - return flow; + return mlx5_flow_list_create + (dev, &((struct priv *)dev->data->dev_private)->flows, + attr, items, actions, error); } /** - * Destroy a flow. + * Destroy a flow in a list. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. * @param[in] flow * Flow to destroy. */ static void -priv_flow_destroy(struct priv *priv, - struct mlx5_flows *list, - struct rte_flow *flow) +mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list, + struct rte_flow *flow) { - unsigned int i; - - if (flow->drop || !flow->mark) - goto free; - for (i = 0; i != flow->queues_n; ++i) { - struct rte_flow *tmp; - int mark = 0; - - /* - * To remove the mark from the queue, the queue must not be - * present in any other marked flow (RSS or not). - */ - TAILQ_FOREACH(tmp, list, next) { - unsigned int j; - uint16_t *tqs = NULL; - uint16_t tq_n = 0; - - if (!tmp->mark) - continue; - for (j = 0; j != hash_rxq_init_n; ++j) { - if (!tmp->frxq[j].hrxq) - continue; - tqs = tmp->frxq[j].hrxq->ind_table->queues; - tq_n = tmp->frxq[j].hrxq->ind_table->queues_n; - } - if (!tq_n) - continue; - for (j = 0; (j != tq_n) && !mark; j++) - if (tqs[j] == (*flow->queues)[i]) - mark = 1; - } - (*priv->rxqs)[(*flow->queues)[i]]->mark = mark; - } -free: - if (flow->drop) { - if (flow->frxq[HASH_RXQ_ETH].ibv_flow) - claim_zero(mlx5_glue->destroy_flow - (flow->frxq[HASH_RXQ_ETH].ibv_flow)); - rte_free(flow->frxq[HASH_RXQ_ETH].ibv_attr); - } else { - for (i = 0; i != hash_rxq_init_n; ++i) { - struct mlx5_flow *frxq = &flow->frxq[i]; - - if (frxq->ibv_flow) - claim_zero(mlx5_glue->destroy_flow - (frxq->ibv_flow)); - if (frxq->hrxq) - mlx5_priv_hrxq_release(priv, frxq->hrxq); - if (frxq->ibv_attr) - rte_free(frxq->ibv_attr); - } - } - if (flow->cs) { - claim_zero(mlx5_glue->destroy_counter_set(flow->cs)); - flow->cs = NULL; - } + mlx5_flow_remove(dev, flow); TAILQ_REMOVE(list, flow, next); - DEBUG("Flow destroyed %p", (void *)flow); + /* + * Update RX queue flags only if port is started, otherwise it is + * already clean. + */ + if (dev->data->dev_started) + mlx5_flow_rxq_flags_trim(dev, flow); rte_free(flow); } /** * Destroy all flows. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. */ void -priv_flow_flush(struct priv *priv, struct mlx5_flows *list) +mlx5_flow_list_flush(struct rte_eth_dev *dev, struct mlx5_flows *list) { while (!TAILQ_EMPTY(list)) { struct rte_flow *flow; flow = TAILQ_FIRST(list); - priv_flow_destroy(priv, list, flow); - } -} - -/** - * Create drop queue. - * - * @param priv - * Pointer to private structure. - * - * @return - * 0 on success. - */ -int -priv_flow_create_drop_queue(struct priv *priv) -{ - struct mlx5_hrxq_drop *fdq = NULL; - - assert(priv->pd); - assert(priv->ctx); - fdq = rte_calloc(__func__, 1, sizeof(*fdq), 0); - if (!fdq) { - WARN("cannot allocate memory for drop queue"); - goto error; - } - fdq->cq = mlx5_glue->create_cq(priv->ctx, 1, NULL, NULL, 0); - if (!fdq->cq) { - WARN("cannot allocate CQ for drop queue"); - goto error; - } - fdq->wq = mlx5_glue->create_wq - (priv->ctx, - &(struct ibv_wq_init_attr){ - .wq_type = IBV_WQT_RQ, - .max_wr = 1, - .max_sge = 1, - .pd = priv->pd, - .cq = fdq->cq, - }); - if (!fdq->wq) { - WARN("cannot allocate WQ for drop queue"); - goto error; - } - fdq->ind_table = mlx5_glue->create_rwq_ind_table - (priv->ctx, - &(struct ibv_rwq_ind_table_init_attr){ - .log_ind_tbl_size = 0, - .ind_tbl = &fdq->wq, - .comp_mask = 0, - }); - if (!fdq->ind_table) { - WARN("cannot allocate indirection table for drop queue"); - goto error; + mlx5_flow_list_destroy(dev, list, flow); } - fdq->qp = mlx5_glue->create_qp_ex - (priv->ctx, - &(struct ibv_qp_init_attr_ex){ - .qp_type = IBV_QPT_RAW_PACKET, - .comp_mask = - IBV_QP_INIT_ATTR_PD | - IBV_QP_INIT_ATTR_IND_TABLE | - IBV_QP_INIT_ATTR_RX_HASH, - .rx_hash_conf = (struct ibv_rx_hash_conf){ - .rx_hash_function = - IBV_RX_HASH_FUNC_TOEPLITZ, - .rx_hash_key_len = rss_hash_default_key_len, - .rx_hash_key = rss_hash_default_key, - .rx_hash_fields_mask = 0, - }, - .rwq_ind_tbl = fdq->ind_table, - .pd = priv->pd - }); - if (!fdq->qp) { - WARN("cannot allocate QP for drop queue"); - goto error; - } - priv->flow_drop_queue = fdq; - return 0; -error: - if (fdq->qp) - claim_zero(mlx5_glue->destroy_qp(fdq->qp)); - if (fdq->ind_table) - claim_zero(mlx5_glue->destroy_rwq_ind_table(fdq->ind_table)); - if (fdq->wq) - claim_zero(mlx5_glue->destroy_wq(fdq->wq)); - if (fdq->cq) - claim_zero(mlx5_glue->destroy_cq(fdq->cq)); - if (fdq) - rte_free(fdq); - priv->flow_drop_queue = NULL; - return -1; -} - -/** - * Delete drop queue. - * - * @param priv - * Pointer to private structure. - */ -void -priv_flow_delete_drop_queue(struct priv *priv) -{ - struct mlx5_hrxq_drop *fdq = priv->flow_drop_queue; - - if (!fdq) - return; - if (fdq->qp) - claim_zero(mlx5_glue->destroy_qp(fdq->qp)); - if (fdq->ind_table) - claim_zero(mlx5_glue->destroy_rwq_ind_table(fdq->ind_table)); - if (fdq->wq) - claim_zero(mlx5_glue->destroy_wq(fdq->wq)); - if (fdq->cq) - claim_zero(mlx5_glue->destroy_cq(fdq->cq)); - rte_free(fdq); - priv->flow_drop_queue = NULL; } /** * Remove all flows. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. */ void -priv_flow_stop(struct priv *priv, struct mlx5_flows *list) +mlx5_flow_stop(struct rte_eth_dev *dev, struct mlx5_flows *list) { struct rte_flow *flow; - TAILQ_FOREACH_REVERSE(flow, list, mlx5_flows, next) { - unsigned int i; - struct mlx5_ind_table_ibv *ind_tbl = NULL; - - if (flow->drop) { - if (!flow->frxq[HASH_RXQ_ETH].ibv_flow) - continue; - claim_zero(mlx5_glue->destroy_flow - (flow->frxq[HASH_RXQ_ETH].ibv_flow)); - flow->frxq[HASH_RXQ_ETH].ibv_flow = NULL; - DEBUG("Flow %p removed", (void *)flow); - /* Next flow. */ - continue; - } - /* Verify the flow has not already been cleaned. */ - for (i = 0; i != hash_rxq_init_n; ++i) { - if (!flow->frxq[i].ibv_flow) - continue; - /* - * Indirection table may be necessary to remove the - * flags in the Rx queues. - * This helps to speed-up the process by avoiding - * another loop. - */ - ind_tbl = flow->frxq[i].hrxq->ind_table; - break; - } - if (i == hash_rxq_init_n) - return; - if (flow->mark) { - assert(ind_tbl); - for (i = 0; i != ind_tbl->queues_n; ++i) - (*priv->rxqs)[ind_tbl->queues[i]]->mark = 0; - } - for (i = 0; i != hash_rxq_init_n; ++i) { - if (!flow->frxq[i].ibv_flow) - continue; - claim_zero(mlx5_glue->destroy_flow - (flow->frxq[i].ibv_flow)); - flow->frxq[i].ibv_flow = NULL; - mlx5_priv_hrxq_release(priv, flow->frxq[i].hrxq); - flow->frxq[i].hrxq = NULL; - } - DEBUG("Flow %p removed", (void *)flow); - } + TAILQ_FOREACH_REVERSE(flow, list, mlx5_flows, next) + mlx5_flow_remove(dev, flow); + mlx5_flow_rxq_flags_clear(dev); } /** * Add all flows. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param list * Pointer to a TAILQ flow list. * * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_flow_start(struct priv *priv, struct mlx5_flows *list) +mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list) { struct rte_flow *flow; + struct rte_flow_error error; + int ret = 0; TAILQ_FOREACH(flow, list, next) { - unsigned int i; - - if (flow->drop) { - flow->frxq[HASH_RXQ_ETH].ibv_flow = - mlx5_glue->create_flow - (priv->flow_drop_queue->qp, - flow->frxq[HASH_RXQ_ETH].ibv_attr); - if (!flow->frxq[HASH_RXQ_ETH].ibv_flow) { - DEBUG("Flow %p cannot be applied", - (void *)flow); - rte_errno = EINVAL; - return rte_errno; - } - DEBUG("Flow %p applied", (void *)flow); - /* Next flow. */ - continue; - } - for (i = 0; i != hash_rxq_init_n; ++i) { - if (!flow->frxq[i].ibv_attr) - continue; - flow->frxq[i].hrxq = - mlx5_priv_hrxq_get(priv, flow->rss_conf.rss_key, - flow->rss_conf.rss_key_len, - hash_rxq_init[i].hash_fields, - (*flow->queues), - flow->queues_n); - if (flow->frxq[i].hrxq) - goto flow_create; - flow->frxq[i].hrxq = - mlx5_priv_hrxq_new(priv, flow->rss_conf.rss_key, - flow->rss_conf.rss_key_len, - hash_rxq_init[i].hash_fields, - (*flow->queues), - flow->queues_n); - if (!flow->frxq[i].hrxq) { - DEBUG("Flow %p cannot be applied", - (void *)flow); - rte_errno = EINVAL; - return rte_errno; - } -flow_create: - flow->frxq[i].ibv_flow = - mlx5_glue->create_flow(flow->frxq[i].hrxq->qp, - flow->frxq[i].ibv_attr); - if (!flow->frxq[i].ibv_flow) { - DEBUG("Flow %p cannot be applied", - (void *)flow); - rte_errno = EINVAL; - return rte_errno; - } - DEBUG("Flow %p applied", (void *)flow); - } - if (!flow->mark) - continue; - for (i = 0; i != flow->queues_n; ++i) - (*priv->rxqs)[(*flow->queues)[i]]->mark = 1; + ret = mlx5_flow_apply(dev, flow, &error); + if (ret < 0) + goto error; + mlx5_flow_rxq_flags_set(dev, flow); } return 0; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_flow_stop(dev, list); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** * Verify the flow list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return the number of flows not released. */ int -priv_flow_verify(struct priv *priv) +mlx5_flow_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct rte_flow *flow; int ret = 0; TAILQ_FOREACH(flow, &priv->flows, next) { - DEBUG("%p: flow %p still referenced", (void *)priv, - (void *)flow); + DRV_LOG(DEBUG, "port %u flow %p still referenced", + dev->data->port_id, (void *)flow); ++ret; } return ret; @@ -2319,7 +3205,7 @@ priv_flow_verify(struct priv *priv) * A VLAN flow mask to apply. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, @@ -2331,7 +3217,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, struct priv *priv = dev->data->dev_private; const struct rte_flow_attr attr = { .ingress = 1, - .priority = MLX5_CTRL_FLOW_PRIORITY, + .priority = MLX5_FLOW_PRIO_RSVD, }; struct rte_flow_item items[] = { { @@ -2351,9 +3237,20 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, .type = RTE_FLOW_ITEM_TYPE_END, }, }; + uint16_t queue[priv->reta_idx_n]; + struct rte_flow_action_rss action_rss = { + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .level = 0, + .types = priv->rss_conf.rss_hf, + .key_len = priv->rss_conf.rss_key_len, + .queue_num = priv->reta_idx_n, + .key = priv->rss_conf.rss_key, + .queue = queue, + }; struct rte_flow_action actions[] = { { .type = RTE_FLOW_ACTION_TYPE_RSS, + .conf = &action_rss, }, { .type = RTE_FLOW_ACTION_TYPE_END, @@ -2362,26 +3259,17 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, struct rte_flow *flow; struct rte_flow_error error; unsigned int i; - union { - struct rte_flow_action_rss rss; - struct { - const struct rte_eth_rss_conf *rss_conf; - uint16_t num; - uint16_t queue[RTE_MAX_QUEUES_PER_PORT]; - } local; - } action_rss; - - if (!priv->reta_idx_n) - return EINVAL; + + if (!priv->reta_idx_n) { + rte_errno = EINVAL; + return -rte_errno; + } for (i = 0; i != priv->reta_idx_n; ++i) - action_rss.local.queue[i] = (*priv->reta_idx)[i]; - action_rss.local.rss_conf = &priv->rss_conf; - action_rss.local.num = priv->reta_idx_n; - actions[0].conf = (const void *)&action_rss.rss; - flow = priv_flow_create(priv, &priv->ctrl_flows, &attr, items, actions, - &error); + queue[i] = (*priv->reta_idx)[i]; + flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items, + actions, &error); if (!flow) - return rte_errno; + return -rte_errno; return 0; } @@ -2396,7 +3284,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, * An Ethernet flow mask to apply. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_ctrl_flow(struct rte_eth_dev *dev, @@ -2415,14 +3303,11 @@ mlx5_ctrl_flow(struct rte_eth_dev *dev, int mlx5_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow, - struct rte_flow_error *error) + struct rte_flow_error *error __rte_unused) { struct priv *priv = dev->data->dev_private; - (void)error; - priv_lock(priv); - priv_flow_destroy(priv, &priv->flows, flow); - priv_unlock(priv); + mlx5_flow_list_destroy(dev, &priv->flows, flow); return 0; } @@ -2434,152 +3319,161 @@ mlx5_flow_destroy(struct rte_eth_dev *dev, */ int mlx5_flow_flush(struct rte_eth_dev *dev, - struct rte_flow_error *error) + struct rte_flow_error *error __rte_unused) { struct priv *priv = dev->data->dev_private; - (void)error; - priv_lock(priv); - priv_flow_flush(priv, &priv->flows); - priv_unlock(priv); + mlx5_flow_list_flush(dev, &priv->flows); return 0; } -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT /** - * Query flow counter. - * - * @param cs - * the counter set. - * @param counter_value - * returned data from the counter. + * Isolated mode. * - * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * @see rte_flow_isolate() + * @see rte_flow_ops */ -static int -priv_flow_query_count(struct ibv_counter_set *cs, - struct mlx5_flow_counter_stats *counter_stats, - struct rte_flow_query_count *query_count, - struct rte_flow_error *error) +int +mlx5_flow_isolate(struct rte_eth_dev *dev, + int enable, + struct rte_flow_error *error) { - uint64_t counters[2]; - struct ibv_query_counter_set_attr query_cs_attr = { - .cs = cs, - .query_flags = IBV_COUNTER_SET_FORCE_UPDATE, - }; - struct ibv_counter_set_data query_out = { - .out = counters, - .outlen = 2 * sizeof(uint64_t), - }; - int res = mlx5_glue->query_counter_set(&query_cs_attr, &query_out); + struct priv *priv = dev->data->dev_private; - if (res) { - rte_flow_error_set(error, -res, + if (dev->data->dev_started) { + rte_flow_error_set(error, EBUSY, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "cannot read counter"); - return -res; - } - query_count->hits_set = 1; - query_count->bytes_set = 1; - query_count->hits = counters[0] - counter_stats->hits; - query_count->bytes = counters[1] - counter_stats->bytes; - if (query_count->reset) { - counter_stats->hits = counters[0]; - counter_stats->bytes = counters[1]; + "port must be stopped first"); + return -rte_errno; } + priv->isolated = !!enable; + if (enable) + dev->dev_ops = &mlx5_dev_ops_isolate; + else + dev->dev_ops = &mlx5_dev_ops; return 0; } /** - * Query a flows. + * Query flow counter. * - * @see rte_flow_query() - * @see rte_flow_ops + * @param flow + * Pointer to the flow. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -int -mlx5_flow_query(struct rte_eth_dev *dev, - struct rte_flow *flow, - enum rte_flow_action_type action __rte_unused, - void *data, - struct rte_flow_error *error) +static int +mlx5_flow_query_count(struct rte_flow *flow __rte_unused, + void *data __rte_unused, + struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; - int res = EINVAL; +#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT + if (flow->modifier & MLX5_FLOW_MOD_COUNT) { + struct rte_flow_query_count *qc = data; + uint64_t counters[2] = {0, 0}; + struct ibv_query_counter_set_attr query_cs_attr = { + .cs = flow->counter->cs, + .query_flags = IBV_COUNTER_SET_FORCE_UPDATE, + }; + struct ibv_counter_set_data query_out = { + .out = counters, + .outlen = 2 * sizeof(uint64_t), + }; + int err = mlx5_glue->query_counter_set(&query_cs_attr, + &query_out); - priv_lock(priv); - if (flow->cs) { - res = priv_flow_query_count(flow->cs, - &flow->counter_stats, - (struct rte_flow_query_count *)data, - error); - } else { - rte_flow_error_set(error, res, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "no counter found for flow"); + if (err) + return rte_flow_error_set + (error, err, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot read counter"); + qc->hits_set = 1; + qc->bytes_set = 1; + qc->hits = counters[0] - flow->counter->hits; + qc->bytes = counters[1] - flow->counter->bytes; + if (qc->reset) { + flow->counter->hits = counters[0]; + flow->counter->bytes = counters[1]; + } + return 0; } - priv_unlock(priv); - return -res; -} + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "flow does not have counter"); #endif + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "counters are not available"); +} /** - * Isolated mode. + * Query a flows. * - * @see rte_flow_isolate() + * @see rte_flow_query() * @see rte_flow_ops */ int -mlx5_flow_isolate(struct rte_eth_dev *dev, - int enable, - struct rte_flow_error *error) +mlx5_flow_query(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; + int ret = 0; - priv_lock(priv); - if (dev->data->dev_started) { - rte_flow_error_set(error, EBUSY, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "port must be stopped first"); - priv_unlock(priv); - return -rte_errno; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = mlx5_flow_query_count(flow, data, error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + if (ret < 0) + return ret; } - priv->isolated = !!enable; - if (enable) - priv->dev->dev_ops = &mlx5_dev_ops_isolate; - else - priv->dev->dev_ops = &mlx5_dev_ops; - priv_unlock(priv); return 0; } /** * Convert a flow director filter to a generic flow. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param fdir_filter * Flow director filter to add. * @param attributes * Generic flow parameters structure. * * @return - * 0 on success, errno value on error. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_filter_convert(struct priv *priv, +mlx5_fdir_filter_convert(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter, struct mlx5_fdir *attributes) { + struct priv *priv = dev->data->dev_private; const struct rte_eth_fdir_input *input = &fdir_filter->input; + const struct rte_eth_fdir_masks *mask = + &dev->data->dev_conf.fdir_conf.mask; /* Validate queue number. */ if (fdir_filter->action.rx_queue >= priv->rxqs_n) { - ERROR("invalid queue number %d", fdir_filter->action.rx_queue); - return EINVAL; + DRV_LOG(ERR, "port %u invalid queue number %d", + dev->data->port_id, fdir_filter->action.rx_queue); + rte_errno = EINVAL; + return -rte_errno; } attributes->attr.ingress = 1; attributes->items[0] = (struct rte_flow_item) { @@ -2600,144 +3494,140 @@ priv_fdir_filter_convert(struct priv *priv, }; break; default: - ERROR("invalid behavior %d", fdir_filter->action.behavior); - return ENOTSUP; + DRV_LOG(ERR, "port %u invalid behavior %d", + dev->data->port_id, + fdir_filter->action.behavior); + rte_errno = ENOTSUP; + return -rte_errno; } attributes->queue.index = fdir_filter->action.rx_queue; + /* Handle L3. */ switch (fdir_filter->input.flow_type) { case RTE_ETH_FLOW_NONFRAG_IPV4_UDP: + case RTE_ETH_FLOW_NONFRAG_IPV4_TCP: + case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: attributes->l3.ipv4.hdr = (struct ipv4_hdr){ - .src_addr = input->flow.udp4_flow.ip.src_ip, - .dst_addr = input->flow.udp4_flow.ip.dst_ip, - .time_to_live = input->flow.udp4_flow.ip.ttl, - .type_of_service = input->flow.udp4_flow.ip.tos, - .next_proto_id = input->flow.udp4_flow.ip.proto, + .src_addr = input->flow.ip4_flow.src_ip, + .dst_addr = input->flow.ip4_flow.dst_ip, + .time_to_live = input->flow.ip4_flow.ttl, + .type_of_service = input->flow.ip4_flow.tos, + .next_proto_id = input->flow.ip4_flow.proto, }; - attributes->l4.udp.hdr = (struct udp_hdr){ - .src_port = input->flow.udp4_flow.src_port, - .dst_port = input->flow.udp4_flow.dst_port, + attributes->l3_mask.ipv4.hdr = (struct ipv4_hdr){ + .src_addr = mask->ipv4_mask.src_ip, + .dst_addr = mask->ipv4_mask.dst_ip, + .time_to_live = mask->ipv4_mask.ttl, + .type_of_service = mask->ipv4_mask.tos, + .next_proto_id = mask->ipv4_mask.proto, }; attributes->items[1] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &attributes->l3, - .mask = &attributes->l3, + .mask = &attributes->l3_mask, + }; + break; + case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: + case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: + case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: + attributes->l3.ipv6.hdr = (struct ipv6_hdr){ + .hop_limits = input->flow.ipv6_flow.hop_limits, + .proto = input->flow.ipv6_flow.proto, + }; + + memcpy(attributes->l3.ipv6.hdr.src_addr, + input->flow.ipv6_flow.src_ip, + RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); + memcpy(attributes->l3.ipv6.hdr.dst_addr, + input->flow.ipv6_flow.dst_ip, + RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); + memcpy(attributes->l3_mask.ipv6.hdr.src_addr, + mask->ipv6_mask.src_ip, + RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr)); + memcpy(attributes->l3_mask.ipv6.hdr.dst_addr, + mask->ipv6_mask.dst_ip, + RTE_DIM(attributes->l3_mask.ipv6.hdr.src_addr)); + attributes->items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_IPV6, + .spec = &attributes->l3, + .mask = &attributes->l3_mask, + }; + break; + default: + DRV_LOG(ERR, "port %u invalid flow type%d", + dev->data->port_id, fdir_filter->input.flow_type); + rte_errno = ENOTSUP; + return -rte_errno; + } + /* Handle L4. */ + switch (fdir_filter->input.flow_type) { + case RTE_ETH_FLOW_NONFRAG_IPV4_UDP: + attributes->l4.udp.hdr = (struct udp_hdr){ + .src_port = input->flow.udp4_flow.src_port, + .dst_port = input->flow.udp4_flow.dst_port, + }; + attributes->l4_mask.udp.hdr = (struct udp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, }; attributes->items[2] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &attributes->l4, - .mask = &attributes->l4, + .mask = &attributes->l4_mask, }; break; case RTE_ETH_FLOW_NONFRAG_IPV4_TCP: - attributes->l3.ipv4.hdr = (struct ipv4_hdr){ - .src_addr = input->flow.tcp4_flow.ip.src_ip, - .dst_addr = input->flow.tcp4_flow.ip.dst_ip, - .time_to_live = input->flow.tcp4_flow.ip.ttl, - .type_of_service = input->flow.tcp4_flow.ip.tos, - .next_proto_id = input->flow.tcp4_flow.ip.proto, - }; attributes->l4.tcp.hdr = (struct tcp_hdr){ .src_port = input->flow.tcp4_flow.src_port, .dst_port = input->flow.tcp4_flow.dst_port, }; - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV4, - .spec = &attributes->l3, - .mask = &attributes->l3, + attributes->l4_mask.tcp.hdr = (struct tcp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, }; attributes->items[2] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_TCP, .spec = &attributes->l4, - .mask = &attributes->l4, - }; - break; - case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: - attributes->l3.ipv4.hdr = (struct ipv4_hdr){ - .src_addr = input->flow.ip4_flow.src_ip, - .dst_addr = input->flow.ip4_flow.dst_ip, - .time_to_live = input->flow.ip4_flow.ttl, - .type_of_service = input->flow.ip4_flow.tos, - .next_proto_id = input->flow.ip4_flow.proto, - }; - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV4, - .spec = &attributes->l3, - .mask = &attributes->l3, + .mask = &attributes->l4_mask, }; break; case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: - attributes->l3.ipv6.hdr = (struct ipv6_hdr){ - .hop_limits = input->flow.udp6_flow.ip.hop_limits, - .proto = input->flow.udp6_flow.ip.proto, - }; - memcpy(attributes->l3.ipv6.hdr.src_addr, - input->flow.udp6_flow.ip.src_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); - memcpy(attributes->l3.ipv6.hdr.dst_addr, - input->flow.udp6_flow.ip.dst_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); attributes->l4.udp.hdr = (struct udp_hdr){ .src_port = input->flow.udp6_flow.src_port, .dst_port = input->flow.udp6_flow.dst_port, }; - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV6, - .spec = &attributes->l3, - .mask = &attributes->l3, + attributes->l4_mask.udp.hdr = (struct udp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, }; attributes->items[2] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &attributes->l4, - .mask = &attributes->l4, + .mask = &attributes->l4_mask, }; break; case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: - attributes->l3.ipv6.hdr = (struct ipv6_hdr){ - .hop_limits = input->flow.tcp6_flow.ip.hop_limits, - .proto = input->flow.tcp6_flow.ip.proto, - }; - memcpy(attributes->l3.ipv6.hdr.src_addr, - input->flow.tcp6_flow.ip.src_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); - memcpy(attributes->l3.ipv6.hdr.dst_addr, - input->flow.tcp6_flow.ip.dst_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); attributes->l4.tcp.hdr = (struct tcp_hdr){ .src_port = input->flow.tcp6_flow.src_port, .dst_port = input->flow.tcp6_flow.dst_port, }; - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV6, - .spec = &attributes->l3, - .mask = &attributes->l3, + attributes->l4_mask.tcp.hdr = (struct tcp_hdr){ + .src_port = mask->src_port_mask, + .dst_port = mask->dst_port_mask, }; attributes->items[2] = (struct rte_flow_item){ .type = RTE_FLOW_ITEM_TYPE_TCP, .spec = &attributes->l4, - .mask = &attributes->l4, + .mask = &attributes->l4_mask, }; break; + case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: - attributes->l3.ipv6.hdr = (struct ipv6_hdr){ - .hop_limits = input->flow.ipv6_flow.hop_limits, - .proto = input->flow.ipv6_flow.proto, - }; - memcpy(attributes->l3.ipv6.hdr.src_addr, - input->flow.ipv6_flow.src_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); - memcpy(attributes->l3.ipv6.hdr.dst_addr, - input->flow.ipv6_flow.dst_ip, - RTE_DIM(attributes->l3.ipv6.hdr.src_addr)); - attributes->items[1] = (struct rte_flow_item){ - .type = RTE_FLOW_ITEM_TYPE_IPV6, - .spec = &attributes->l3, - .mask = &attributes->l3, - }; break; default: - ERROR("invalid flow type%d", - fdir_filter->input.flow_type); - return ENOTSUP; + DRV_LOG(ERR, "port %u invalid flow type%d", + dev->data->port_id, fdir_filter->input.flow_type); + rte_errno = ENOTSUP; + return -rte_errno; } return 0; } @@ -2745,18 +3635,19 @@ priv_fdir_filter_convert(struct priv *priv, /** * Add new flow director filter and store it in list. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param fdir_filter * Flow director filter to add. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_filter_add(struct priv *priv, +mlx5_fdir_filter_add(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter) { + struct priv *priv = dev->data->dev_private; struct mlx5_fdir attributes = { .attr.group = 0, .l2_mask = { @@ -2765,181 +3656,96 @@ priv_fdir_filter_add(struct priv *priv, .type = 0, }, }; - struct mlx5_flow_parse parser = { - .layer = HASH_RXQ_ETH, - }; struct rte_flow_error error; struct rte_flow *flow; int ret; - ret = priv_fdir_filter_convert(priv, fdir_filter, &attributes); - if (ret) - return -ret; - ret = priv_flow_convert(priv, &attributes.attr, attributes.items, - attributes.actions, &error, &parser); + ret = mlx5_fdir_filter_convert(dev, fdir_filter, &attributes); if (ret) - return -ret; - flow = priv_flow_create(priv, - &priv->flows, - &attributes.attr, - attributes.items, - attributes.actions, - &error); + return ret; + flow = mlx5_flow_list_create(dev, &priv->flows, &attributes.attr, + attributes.items, attributes.actions, + &error); if (flow) { - DEBUG("FDIR created %p", (void *)flow); + DRV_LOG(DEBUG, "port %u FDIR created %p", dev->data->port_id, + (void *)flow); return 0; } - return ENOTSUP; + return -rte_errno; } /** * Delete specific filter. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param fdir_filter * Filter to be deleted. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_filter_delete(struct priv *priv, - const struct rte_eth_fdir_filter *fdir_filter) +mlx5_fdir_filter_delete(struct rte_eth_dev *dev __rte_unused, + const struct rte_eth_fdir_filter *fdir_filter + __rte_unused) { - struct mlx5_fdir attributes = { - .attr.group = 0, - }; - struct mlx5_flow_parse parser = { - .create = 1, - .layer = HASH_RXQ_ETH, - }; - struct rte_flow_error error; - struct rte_flow *flow; - unsigned int i; - int ret; - - ret = priv_fdir_filter_convert(priv, fdir_filter, &attributes); - if (ret) - return -ret; - ret = priv_flow_convert(priv, &attributes.attr, attributes.items, - attributes.actions, &error, &parser); - if (ret) - goto exit; - /* - * Special case for drop action which is only set in the - * specifications when the flow is created. In this situation the - * drop specification is missing. - */ - if (parser.drop) { - struct ibv_flow_spec_action_drop *drop; - - drop = (void *)((uintptr_t)parser.queue[HASH_RXQ_ETH].ibv_attr + - parser.queue[HASH_RXQ_ETH].offset); - *drop = (struct ibv_flow_spec_action_drop){ - .type = IBV_FLOW_SPEC_ACTION_DROP, - .size = sizeof(struct ibv_flow_spec_action_drop), - }; - parser.queue[HASH_RXQ_ETH].ibv_attr->num_of_specs++; - } - TAILQ_FOREACH(flow, &priv->flows, next) { - struct ibv_flow_attr *attr; - struct ibv_spec_header *attr_h; - void *spec; - struct ibv_flow_attr *flow_attr; - struct ibv_spec_header *flow_h; - void *flow_spec; - unsigned int specs_n; - - attr = parser.queue[HASH_RXQ_ETH].ibv_attr; - flow_attr = flow->frxq[HASH_RXQ_ETH].ibv_attr; - /* Compare first the attributes. */ - if (memcmp(attr, flow_attr, sizeof(struct ibv_flow_attr))) - continue; - if (attr->num_of_specs == 0) - continue; - spec = (void *)((uintptr_t)attr + - sizeof(struct ibv_flow_attr)); - flow_spec = (void *)((uintptr_t)flow_attr + - sizeof(struct ibv_flow_attr)); - specs_n = RTE_MIN(attr->num_of_specs, flow_attr->num_of_specs); - for (i = 0; i != specs_n; ++i) { - attr_h = spec; - flow_h = flow_spec; - if (memcmp(spec, flow_spec, - RTE_MIN(attr_h->size, flow_h->size))) - goto wrong_flow; - spec = (void *)((uintptr_t)spec + attr_h->size); - flow_spec = (void *)((uintptr_t)flow_spec + - flow_h->size); - } - /* At this point, the flow match. */ - break; -wrong_flow: - /* The flow does not match. */ - continue; - } - if (flow) - priv_flow_destroy(priv, &priv->flows, flow); -exit: - for (i = 0; i != hash_rxq_init_n; ++i) { - if (parser.queue[i].ibv_attr) - rte_free(parser.queue[i].ibv_attr); - } - return -ret; + rte_errno = ENOTSUP; + return -rte_errno; } /** * Update queue for specific filter. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param fdir_filter * Filter to be updated. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_filter_update(struct priv *priv, +mlx5_fdir_filter_update(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter) { int ret; - ret = priv_fdir_filter_delete(priv, fdir_filter); + ret = mlx5_fdir_filter_delete(dev, fdir_filter); if (ret) return ret; - ret = priv_fdir_filter_add(priv, fdir_filter); - return ret; + return mlx5_fdir_filter_add(dev, fdir_filter); } /** * Flush all filters. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. */ static void -priv_fdir_filter_flush(struct priv *priv) +mlx5_fdir_filter_flush(struct rte_eth_dev *dev) { - priv_flow_flush(priv, &priv->flows); + struct priv *priv = dev->data->dev_private; + + mlx5_flow_list_flush(dev, &priv->flows); } /** * Get flow director information. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device. * @param[out] fdir_info * Resulting flow director information. */ static void -priv_fdir_info_get(struct priv *priv, struct rte_eth_fdir_info *fdir_info) +mlx5_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info) { struct rte_eth_fdir_masks *mask = - &priv->dev->data->dev_conf.fdir_conf.mask; + &dev->data->dev_conf.fdir_conf.mask; - fdir_info->mode = priv->dev->data->dev_conf.fdir_conf.mode; + fdir_info->mode = dev->data->dev_conf.fdir_conf.mode; fdir_info->guarant_spc = 0; rte_memcpy(&fdir_info->mask, mask, sizeof(fdir_info->mask)); fdir_info->max_flexpayload = 0; @@ -2953,54 +3759,52 @@ priv_fdir_info_get(struct priv *priv, struct rte_eth_fdir_info *fdir_info) /** * Deal with flow director operations. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param filter_op * Operation to perform. * @param arg * Pointer to operation-specific structure. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_fdir_ctrl_func(struct priv *priv, enum rte_filter_op filter_op, void *arg) +mlx5_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op, + void *arg) { enum rte_fdir_mode fdir_mode = - priv->dev->data->dev_conf.fdir_conf.mode; - int ret = 0; + dev->data->dev_conf.fdir_conf.mode; if (filter_op == RTE_ETH_FILTER_NOP) return 0; if (fdir_mode != RTE_FDIR_MODE_PERFECT && fdir_mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) { - ERROR("%p: flow director mode %d not supported", - (void *)priv, fdir_mode); - return EINVAL; + DRV_LOG(ERR, "port %u flow director mode %d not supported", + dev->data->port_id, fdir_mode); + rte_errno = EINVAL; + return -rte_errno; } switch (filter_op) { case RTE_ETH_FILTER_ADD: - ret = priv_fdir_filter_add(priv, arg); - break; + return mlx5_fdir_filter_add(dev, arg); case RTE_ETH_FILTER_UPDATE: - ret = priv_fdir_filter_update(priv, arg); - break; + return mlx5_fdir_filter_update(dev, arg); case RTE_ETH_FILTER_DELETE: - ret = priv_fdir_filter_delete(priv, arg); - break; + return mlx5_fdir_filter_delete(dev, arg); case RTE_ETH_FILTER_FLUSH: - priv_fdir_filter_flush(priv); + mlx5_fdir_filter_flush(dev); break; case RTE_ETH_FILTER_INFO: - priv_fdir_info_get(priv, arg); + mlx5_fdir_info_get(dev, arg); break; default: - DEBUG("%p: unknown operation %u", (void *)priv, - filter_op); - ret = EINVAL; - break; + DRV_LOG(DEBUG, "port %u unknown operation %u", + dev->data->port_id, filter_op); + rte_errno = EINVAL; + return -rte_errno; } - return ret; + return 0; } /** @@ -3016,7 +3820,7 @@ priv_fdir_ctrl_func(struct priv *priv, enum rte_filter_op filter_op, void *arg) * Pointer to operation-specific structure. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, @@ -3024,24 +3828,21 @@ mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, enum rte_filter_op filter_op, void *arg) { - int ret = EINVAL; - struct priv *priv = dev->data->dev_private; - switch (filter_type) { case RTE_ETH_FILTER_GENERIC: - if (filter_op != RTE_ETH_FILTER_GET) - return -EINVAL; + if (filter_op != RTE_ETH_FILTER_GET) { + rte_errno = EINVAL; + return -rte_errno; + } *(const void **)arg = &mlx5_flow_ops; return 0; case RTE_ETH_FILTER_FDIR: - priv_lock(priv); - ret = priv_fdir_ctrl_func(priv, filter_op, arg); - priv_unlock(priv); - break; + return mlx5_fdir_ctrl_func(dev, filter_op, arg); default: - ERROR("%p: filter type (%d) not supported", - (void *)dev, filter_type); - break; + DRV_LOG(ERR, "port %u filter type (%d) not supported", + dev->data->port_id, filter_type); + rte_errno = ENOTSUP; + return -rte_errno; } - return -ret; + return 0; } diff --git a/drivers/net/mlx5/mlx5_glue.c b/drivers/net/mlx5/mlx5_glue.c index 1c4396ad..84f9492a 100644 --- a/drivers/net/mlx5/mlx5_glue.c +++ b/drivers/net/mlx5/mlx5_glue.c @@ -1,12 +1,19 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2018 6WIND S.A. - * Copyright 2018 Mellanox Technologies, Ltd. + * Copyright 2018 Mellanox Technologies, Ltd */ #include <errno.h> +#include <stdalign.h> #include <stddef.h> #include <stdint.h> +/* + * Not needed by this file; included to work around the lack of off_t + * definition for mlx5dv.h with unpatched rdma-core versions. + */ +#include <sys/types.h> + /* Verbs headers do not support -pedantic. */ #ifdef PEDANTIC #pragma GCC diagnostic ignored "-Wpedantic" @@ -17,6 +24,8 @@ #pragma GCC diagnostic error "-Wpedantic" #endif +#include <rte_config.h> + #include "mlx5_autoconf.h" #include "mlx5_glue.h" @@ -287,6 +296,21 @@ mlx5_glue_dv_create_cq(struct ibv_context *context, return mlx5dv_create_cq(context, cq_attr, mlx5_cq_attr); } +static struct ibv_wq * +mlx5_glue_dv_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_attr, + struct mlx5dv_wq_init_attr *mlx5_wq_attr) +{ +#ifndef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + (void)context; + (void)wq_attr; + (void)mlx5_wq_attr; + return NULL; +#else + return mlx5dv_create_wq(context, wq_attr, mlx5_wq_attr); +#endif +} + static int mlx5_glue_dv_query_device(struct ibv_context *ctx, struct mlx5dv_context *attrs_out) @@ -307,6 +331,22 @@ mlx5_glue_dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type) return mlx5dv_init_obj(obj, obj_type); } +static struct ibv_qp * +mlx5_glue_dv_create_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex, + struct mlx5dv_qp_init_attr *dv_qp_init_attr) +{ +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + return mlx5dv_create_qp(context, qp_init_attr_ex, dv_qp_init_attr); +#else + (void)context; + (void)qp_init_attr_ex; + (void)dv_qp_init_attr; + return NULL; +#endif +} + +alignas(RTE_CACHE_LINE_SIZE) const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .version = MLX5_GLUE_VERSION, .fork_init = mlx5_glue_fork_init, @@ -347,7 +387,9 @@ const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .port_state_str = mlx5_glue_port_state_str, .cq_ex_to_cq = mlx5_glue_cq_ex_to_cq, .dv_create_cq = mlx5_glue_dv_create_cq, + .dv_create_wq = mlx5_glue_dv_create_wq, .dv_query_device = mlx5_glue_dv_query_device, .dv_set_context_attr = mlx5_glue_dv_set_context_attr, .dv_init_obj = mlx5_glue_dv_init_obj, + .dv_create_qp = mlx5_glue_dv_create_qp, }; diff --git a/drivers/net/mlx5/mlx5_glue.h b/drivers/net/mlx5/mlx5_glue.h index b5efee3b..e584d367 100644 --- a/drivers/net/mlx5/mlx5_glue.h +++ b/drivers/net/mlx5/mlx5_glue.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2018 6WIND S.A. - * Copyright 2018 Mellanox Technologies, Ltd. + * Copyright 2018 Mellanox Technologies, Ltd */ #ifndef MLX5_GLUE_H_ @@ -31,6 +31,14 @@ struct ibv_counter_set_init_attr; struct ibv_query_counter_set_attr; #endif +#ifndef HAVE_IBV_DEVICE_TUNNEL_SUPPORT +struct mlx5dv_qp_init_attr; +#endif + +#ifndef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT +struct mlx5dv_wq_init_attr; +#endif + /* LIB_GLUE_VERSION must be updated every time this structure is modified. */ struct mlx5_glue { const char *version; @@ -100,12 +108,20 @@ struct mlx5_glue { (struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, struct mlx5dv_cq_init_attr *mlx5_cq_attr); + struct ibv_wq *(*dv_create_wq) + (struct ibv_context *context, + struct ibv_wq_init_attr *wq_attr, + struct mlx5dv_wq_init_attr *mlx5_wq_attr); int (*dv_query_device)(struct ibv_context *ctx_in, struct mlx5dv_context *attrs_out); int (*dv_set_context_attr)(struct ibv_context *ibv_ctx, enum mlx5dv_set_ctx_attr_type type, void *attr); int (*dv_init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type); + struct ibv_qp *(*dv_create_qp) + (struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex, + struct mlx5dv_qp_init_attr *dv_qp_init_attr); }; const struct mlx5_glue *mlx5_glue; diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index e8a8d459..12ee37f5 100644 --- a/drivers/net/mlx5/mlx5_mac.c +++ b/drivers/net/mlx5/mlx5_mac.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -35,44 +35,52 @@ /** * Get MAC address by querying netdevice. * - * @param[in] priv - * struct priv for the requested device. + * @param[in] dev + * Pointer to Ethernet device. * @param[out] mac * MAC address output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]) +mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN]) { struct ifreq request; + int ret; - if (priv_ifreq(priv, SIOCGIFHWADDR, &request)) - return -1; + ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0); + if (ret) + return ret; memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); return 0; } /** - * DPDK callback to remove a MAC address. + * Remove a MAC address from the internal array. * * @param dev * Pointer to Ethernet device structure. * @param index * MAC address index. */ -void -mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +static void +mlx5_internal_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) { + struct priv *priv = dev->data->dev_private; + const int vf = priv->config.vf; + assert(index < MLX5_MAX_MAC_ADDRESSES); + if (is_zero_ether_addr(&dev->data->mac_addrs[index])) + return; + if (vf) + mlx5_nl_mac_addr_remove(dev, &dev->data->mac_addrs[index], + index); memset(&dev->data->mac_addrs[index], 0, sizeof(struct ether_addr)); - if (!dev->data->promiscuous) - mlx5_traffic_restart(dev); } /** - * DPDK callback to add a MAC address. + * Adds a MAC address to the internal array. * * @param dev * Pointer to Ethernet device structure. @@ -80,21 +88,23 @@ mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) * MAC address to register. * @param index * MAC address index. - * @param vmdq - * VMDq pool index to associate address with (ignored). * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -int -mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, - uint32_t index, uint32_t vmdq) +static int +mlx5_internal_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index) { + struct priv *priv = dev->data->dev_private; + const int vf = priv->config.vf; unsigned int i; - int ret = 0; - (void)vmdq; assert(index < MLX5_MAX_MAC_ADDRESSES); + if (is_zero_ether_addr(mac)) { + rte_errno = EINVAL; + return -rte_errno; + } /* First, make sure this address isn't already configured. */ for (i = 0; (i != MLX5_MAX_MAC_ADDRESSES); ++i) { /* Skip this index, it's going to be reconfigured. */ @@ -103,12 +113,74 @@ mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, if (memcmp(&dev->data->mac_addrs[i], mac, sizeof(*mac))) continue; /* Address already configured elsewhere, return with error. */ - return EADDRINUSE; + rte_errno = EADDRINUSE; + return -rte_errno; + } + if (vf) { + int ret = mlx5_nl_mac_addr_add(dev, mac, index); + + if (ret) + return ret; } dev->data->mac_addrs[index] = *mac; + return 0; +} + +/** + * DPDK callback to remove a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param index + * MAC address index. + */ +void +mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + int ret; + + if (index >= MLX5_MAX_UC_MAC_ADDRESSES) + return; + mlx5_internal_mac_addr_remove(dev, index); + if (!dev->data->promiscuous) { + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot restart traffic: %s", + dev->data->port_id, strerror(rte_errno)); + } +} + +/** + * DPDK callback to add a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * @param index + * MAC address index. + * @param vmdq + * VMDq pool index to associate address with (ignored). + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index, uint32_t vmdq __rte_unused) +{ + int ret; + + if (index >= MLX5_MAX_UC_MAC_ADDRESSES) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_internal_mac_addr_add(dev, mac, index); + if (ret < 0) + return ret; if (!dev->data->promiscuous) - mlx5_traffic_restart(dev); - return ret; + return mlx5_traffic_restart(dev); + return 0; } /** @@ -118,10 +190,43 @@ mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, * Pointer to Ethernet device structure. * @param mac_addr * MAC address to register. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -void +int mlx5_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr) { - DEBUG("%p: setting primary MAC address", (void *)dev); - mlx5_mac_addr_add(dev, mac_addr, 0, 0); + DRV_LOG(DEBUG, "port %u setting primary MAC address", + dev->data->port_id); + return mlx5_mac_addr_add(dev, mac_addr, 0, 0); +} + +/** + * DPDK callback to set multicast addresses list. + * + * @see rte_eth_dev_set_mc_addr_list() + */ +int +mlx5_set_mc_addr_list(struct rte_eth_dev *dev, + struct ether_addr *mc_addr_set, uint32_t nb_mc_addr) +{ + uint32_t i; + int ret; + + if (nb_mc_addr >= MLX5_MAX_MC_MAC_ADDRESSES) { + rte_errno = ENOSPC; + return -rte_errno; + } + for (i = MLX5_MAX_UC_MAC_ADDRESSES; i != MLX5_MAX_MAC_ADDRESSES; ++i) + mlx5_internal_mac_addr_remove(dev, i); + i = MLX5_MAX_UC_MAC_ADDRESSES; + while (nb_mc_addr--) { + ret = mlx5_internal_mac_addr_add(dev, mc_addr_set++, i++); + if (ret) + return ret; + } + if (!dev->data->promiscuous) + return mlx5_traffic_restart(dev); + return 0; } diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index 857dfcd8..1d1bcb5f 100644 --- a/drivers/net/mlx5/mlx5_mr.c +++ b/drivers/net/mlx5/mlx5_mr.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2016 6WIND S.A. - * Copyright 2016 Mellanox. + * Copyright 2016 Mellanox Technologies, Ltd */ #ifdef PEDANTIC @@ -13,362 +13,1174 @@ #include <rte_mempool.h> #include <rte_malloc.h> +#include <rte_rwlock.h> #include "mlx5.h" +#include "mlx5_mr.h" #include "mlx5_rxtx.h" #include "mlx5_glue.h" -struct mlx5_check_mempool_data { +struct mr_find_contig_memsegs_data { + uintptr_t addr; + uintptr_t start; + uintptr_t end; + const struct rte_memseg_list *msl; +}; + +struct mr_update_mp_data { + struct rte_eth_dev *dev; + struct mlx5_mr_ctrl *mr_ctrl; int ret; - char *start; - char *end; }; -/* Called by mlx5_check_mempool() when iterating the memory chunks. */ -static void -mlx5_check_mempool_cb(struct rte_mempool *mp, - void *opaque, struct rte_mempool_memhdr *memhdr, - unsigned int mem_idx) +/** + * Expand B-tree table to a given size. Can't be called with holding + * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc(). + * + * @param bt + * Pointer to B-tree structure. + * @param n + * Number of entries for expansion. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_btree_expand(struct mlx5_mr_btree *bt, int n) +{ + void *mem; + int ret = 0; + + if (n <= bt->size) + return ret; + /* + * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is + * used inside if there's no room to expand. Because this is a quite + * rare case and a part of very slow path, it is very acceptable. + * Initially cache_bh[] will be given practically enough space and once + * it is expanded, expansion wouldn't be needed again ever. + */ + mem = rte_realloc(bt->table, n * sizeof(struct mlx5_mr_cache), 0); + if (mem == NULL) { + /* Not an error, B-tree search will be skipped. */ + DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table", + (void *)bt); + ret = -1; + } else { + DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n); + bt->table = mem; + bt->size = n; + } + return ret; +} + +/** + * Look up LKey from given B-tree lookup table, store the last index and return + * searched LKey. + * + * @param bt + * Pointer to B-tree structure. + * @param[out] idx + * Pointer to index. Even on search failure, returns index where it stops + * searching so that index can be used when inserting a new entry. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static uint32_t +mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr) { - struct mlx5_check_mempool_data *data = opaque; + struct mlx5_mr_cache *lkp_tbl; + uint16_t n; + uint16_t base = 0; - (void)mp; - (void)mem_idx; + assert(bt != NULL); + lkp_tbl = *bt->table; + n = bt->len; + /* First entry must be NULL for comparison. */ + assert(bt->len > 0 || (lkp_tbl[0].start == 0 && + lkp_tbl[0].lkey == UINT32_MAX)); + /* Binary search. */ + do { + register uint16_t delta = n >> 1; - /* It already failed, skip the next chunks. */ - if (data->ret != 0) - return; - /* It is the first chunk. */ - if (data->start == NULL && data->end == NULL) { - data->start = memhdr->addr; - data->end = data->start + memhdr->len; - return; + if (addr < lkp_tbl[base + delta].start) { + n = delta; + } else { + base += delta; + n -= delta; + } + } while (n > 1); + assert(addr >= lkp_tbl[base].start); + *idx = base; + if (addr < lkp_tbl[base].end) + return lkp_tbl[base].lkey; + /* Not found. */ + return UINT32_MAX; +} + +/** + * Insert an entry to B-tree lookup table. + * + * @param bt + * Pointer to B-tree structure. + * @param entry + * Pointer to new entry to insert. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mr_btree_insert(struct mlx5_mr_btree *bt, struct mlx5_mr_cache *entry) +{ + struct mlx5_mr_cache *lkp_tbl; + uint16_t idx = 0; + size_t shift; + + assert(bt != NULL); + assert(bt->len <= bt->size); + assert(bt->len > 0); + lkp_tbl = *bt->table; + /* Find out the slot for insertion. */ + if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) { + DRV_LOG(DEBUG, + "abort insertion to B-tree(%p): already exist at" + " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + /* Already exist, return. */ + return 0; } - if (data->end == memhdr->addr) { - data->end += memhdr->len; - return; + /* If table is full, return error. */ + if (unlikely(bt->len == bt->size)) { + bt->overflow = 1; + return -1; + } + /* Insert entry. */ + ++idx; + shift = (bt->len - idx) * sizeof(struct mlx5_mr_cache); + if (shift) + memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift); + lkp_tbl[idx] = *entry; + bt->len++; + DRV_LOG(DEBUG, + "inserted B-tree(%p)[%u]," + " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); + return 0; +} + +/** + * Initialize B-tree and allocate memory for lookup table. + * + * @param bt + * Pointer to B-tree structure. + * @param n + * Number of entries to allocate. + * @param socket + * NUMA socket on which memory must be allocated. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket) +{ + if (bt == NULL) { + rte_errno = EINVAL; + return -rte_errno; } - if (data->start == (char *)memhdr->addr + memhdr->len) { - data->start -= memhdr->len; + assert(!bt->table && !bt->size); + memset(bt, 0, sizeof(*bt)); + bt->table = rte_calloc_socket("B-tree table", + n, sizeof(struct mlx5_mr_cache), + 0, socket); + if (bt->table == NULL) { + rte_errno = ENOMEM; + DEBUG("failed to allocate memory for btree cache on socket %d", + socket); + return -rte_errno; + } + bt->size = n; + /* First entry must be NULL for binary search. */ + (*bt->table)[bt->len++] = (struct mlx5_mr_cache) { + .lkey = UINT32_MAX, + }; + DEBUG("initialized B-tree %p with table %p", + (void *)bt, (void *)bt->table); + return 0; +} + +/** + * Free B-tree resources. + * + * @param bt + * Pointer to B-tree structure. + */ +void +mlx5_mr_btree_free(struct mlx5_mr_btree *bt) +{ + if (bt == NULL) return; + DEBUG("freeing B-tree %p with table %p", + (void *)bt, (void *)bt->table); + rte_free(bt->table); + memset(bt, 0, sizeof(*bt)); +} + +/** + * Dump all the entries in a B-tree + * + * @param bt + * Pointer to B-tree structure. + */ +void +mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused) +{ +#ifndef NDEBUG + int idx; + struct mlx5_mr_cache *lkp_tbl; + + if (bt == NULL) + return; + lkp_tbl = *bt->table; + for (idx = 0; idx < bt->len; ++idx) { + struct mlx5_mr_cache *entry = &lkp_tbl[idx]; + + DEBUG("B-tree(%p)[%u]," + " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", + (void *)bt, idx, entry->start, entry->end, entry->lkey); } - /* Error, mempool is not virtually contiguous. */ - data->ret = -1; +#endif } /** - * Check if a mempool can be used: it must be virtually contiguous. + * Find virtually contiguous memory chunk in a given MR. * - * @param[in] mp - * Pointer to memory pool. - * @param[out] start - * Pointer to the start address of the mempool virtual memory area - * @param[out] end - * Pointer to the end address of the mempool virtual memory area + * @param dev + * Pointer to MR structure. + * @param[out] entry + * Pointer to returning MR cache entry. If not found, this will not be + * updated. + * @param start_idx + * Start index of the memseg bitmap. * * @return - * 0 on success (mempool is virtually contiguous), -1 on error. + * Next index to go on lookup. */ -static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start, - uintptr_t *end) +static int +mr_find_next_chunk(struct mlx5_mr *mr, struct mlx5_mr_cache *entry, + int base_idx) { - struct mlx5_check_mempool_data data; + uintptr_t start = 0; + uintptr_t end = 0; + uint32_t idx = 0; - memset(&data, 0, sizeof(data)); - rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data); - *start = (uintptr_t)data.start; - *end = (uintptr_t)data.end; + for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) { + if (rte_bitmap_get(mr->ms_bmp, idx)) { + const struct rte_memseg_list *msl; + const struct rte_memseg *ms; - return data.ret; + msl = mr->msl; + ms = rte_fbarray_get(&msl->memseg_arr, + mr->ms_base_idx + idx); + assert(msl->page_sz == ms->hugepage_sz); + if (!start) + start = ms->addr_64; + end = ms->addr_64 + ms->hugepage_sz; + } else if (start) { + /* Passed the end of a fragment. */ + break; + } + } + if (start) { + /* Found one chunk. */ + entry->start = start; + entry->end = end; + entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey); + } + return idx; } /** - * Register a Memory Region (MR) <-> Memory Pool (MP) association in - * txq->mp2mr[]. If mp2mr[] is full, remove an entry first. + * Insert a MR to the global B-tree cache. It may fail due to low-on-memory. + * Then, this entry will have to be searched by mr_lookup_dev_list() in + * mlx5_mr_create() on miss. * - * This function should only be called by txq_mp2mr(). + * @param dev + * Pointer to Ethernet device. + * @param mr + * Pointer to MR to insert. * - * @param priv - * Pointer to private structure. - * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. - * @param idx - * Index of the next available entry. + * @return + * 0 on success, -1 on failure. + */ +static int +mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx5_mr *mr) +{ + struct priv *priv = dev->data->dev_private; + unsigned int n; + + DRV_LOG(DEBUG, "port %u inserting MR(%p) to global cache", + dev->data->port_id, (void *)mr); + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx5_mr_cache entry = { 0, }; + + /* Find a contiguous chunk and advance the index. */ + n = mr_find_next_chunk(mr, &entry, n); + if (!entry.end) + break; + if (mr_btree_insert(&priv->mr.cache, &entry) < 0) { + /* + * Overflowed, but the global table cannot be expanded + * because of deadlock. + */ + return -1; + } + } + return 0; +} + +/** + * Look up address in the original global MR list. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry. If no match, this will not be updated. + * @param addr + * Search key. * * @return - * mr on success, NULL on failure. + * Found MR on match, NULL otherwise. */ -struct mlx5_mr* -priv_txq_mp2mr_reg(struct priv *priv, struct mlx5_txq_data *txq, - struct rte_mempool *mp, unsigned int idx) +static struct mlx5_mr * +mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry, + uintptr_t addr) { - struct mlx5_txq_ctrl *txq_ctrl = - container_of(txq, struct mlx5_txq_ctrl, txq); + struct priv *priv = dev->data->dev_private; struct mlx5_mr *mr; - /* Add a new entry, register MR first. */ - DEBUG("%p: discovered new memory pool \"%s\" (%p)", - (void *)txq_ctrl, mp->name, (void *)mp); - mr = priv_mr_get(priv, mp); - if (mr == NULL) { - if (rte_eal_process_type() != RTE_PROC_PRIMARY) { - DEBUG("Using unregistered mempool 0x%p(%s) in secondary process," - " please create mempool before rte_eth_dev_start()", - (void *)mp, mp->name); - return NULL; + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + unsigned int n; + + if (mr->ms_n == 0) + continue; + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx5_mr_cache ret = { 0, }; + + n = mr_find_next_chunk(mr, &ret, n); + if (addr >= ret.start && addr < ret.end) { + /* Found. */ + *entry = ret; + return mr; + } } - mr = priv_mr_new(priv, mp); - } - if (unlikely(mr == NULL)) { - DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.", - (void *)txq_ctrl); - return NULL; - } - if (unlikely(idx == RTE_DIM(txq->mp2mr))) { - /* Table is full, remove oldest entry. */ - DEBUG("%p: MR <-> MP table full, dropping oldest entry.", - (void *)txq_ctrl); - --idx; - priv_mr_release(priv, txq->mp2mr[0]); - memmove(&txq->mp2mr[0], &txq->mp2mr[1], - (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0]))); } - /* Store the new entry. */ - txq_ctrl->txq.mp2mr[idx] = mr; - DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, - (void *)txq_ctrl, mp->name, (void *)mp, - txq_ctrl->txq.mp2mr[idx]->lkey); - return mr; + return NULL; } /** - * Register a Memory Region (MR) <-> Memory Pool (MP) association in - * txq->mp2mr[]. If mp2mr[] is full, remove an entry first. - * - * This function should only be called by txq_mp2mr(). + * Look up address on device. * - * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. - * @param idx - * Index of the next available entry. + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry. If no match, this will not be updated. + * @param addr + * Search key. * * @return - * mr on success, NULL on failure. + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. */ -struct mlx5_mr* -mlx5_txq_mp2mr_reg(struct mlx5_txq_data *txq, struct rte_mempool *mp, - unsigned int idx) +static uint32_t +mr_lookup_dev(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry, + uintptr_t addr) { - struct mlx5_txq_ctrl *txq_ctrl = - container_of(txq, struct mlx5_txq_ctrl, txq); + struct priv *priv = dev->data->dev_private; + uint16_t idx; + uint32_t lkey = UINT32_MAX; struct mlx5_mr *mr; - priv_lock(txq_ctrl->priv); - mr = priv_txq_mp2mr_reg(txq_ctrl->priv, txq, mp, idx); - priv_unlock(txq_ctrl->priv); - return mr; + /* + * If the global cache has overflowed since it failed to expand the + * B-tree table, it can't have all the existing MRs. Then, the address + * has to be searched by traversing the original MR list instead, which + * is very slow path. Otherwise, the global cache is all inclusive. + */ + if (!unlikely(priv->mr.cache.overflow)) { + lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (lkey != UINT32_MAX) + *entry = (*priv->mr.cache.table)[idx]; + } else { + /* Falling back to the slowest path. */ + mr = mr_lookup_dev_list(dev, entry, addr); + if (mr != NULL) + lkey = entry->lkey; + } + assert(lkey == UINT32_MAX || (addr >= entry->start && + addr < entry->end)); + return lkey; } -struct mlx5_mp2mr_mbuf_check_data { - int ret; -}; +/** + * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() + * can raise memory free event and the callback function will spin on the lock. + * + * @param mr + * Pointer to MR to free. + */ +static void +mr_free(struct mlx5_mr *mr) +{ + if (mr == NULL) + return; + DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr); + if (mr->ibv_mr != NULL) + claim_zero(mlx5_glue->dereg_mr(mr->ibv_mr)); + if (mr->ms_bmp != NULL) + rte_bitmap_free(mr->ms_bmp); + rte_free(mr); +} /** - * Callback function for rte_mempool_obj_iter() to check whether a given - * mempool object looks like a mbuf. - * - * @param[in] mp - * The mempool pointer - * @param[in] arg - * Context data (struct txq_mp2mr_mbuf_check_data). Contains the - * return value. - * @param[in] obj - * Object address. - * @param index - * Object index, unused. + * Releass resources of detached MR having no online entry. + * + * @param dev + * Pointer to Ethernet device. */ static void -txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj, - uint32_t index __rte_unused) +mlx5_mr_garbage_collect(struct rte_eth_dev *dev) { - struct mlx5_mp2mr_mbuf_check_data *data = arg; - struct rte_mbuf *buf = obj; + struct priv *priv = dev->data->dev_private; + struct mlx5_mr *mr_next; + struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); + /* Must be called from the primary process. */ + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); /* - * Check whether mbuf structure fits element size and whether mempool - * pointer is valid. + * MR can't be freed with holding the lock because rte_free() could call + * memory free callback function. This will be a deadlock situation. */ - if (sizeof(*buf) > mp->elt_size || buf->pool != mp) - data->ret = -1; + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Detach the whole free list and release it after unlocking. */ + free_list = priv->mr.mr_free_list; + LIST_INIT(&priv->mr.mr_free_list); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Release resources. */ + mr_next = LIST_FIRST(&free_list); + while (mr_next != NULL) { + struct mlx5_mr *mr = mr_next; + + mr_next = LIST_NEXT(mr, mr); + mr_free(mr); + } +} + +/* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */ +static int +mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + struct mr_find_contig_memsegs_data *data = arg; + + if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) + return 0; + /* Found, save it and stop walking. */ + data->start = ms->addr_64; + data->end = ms->addr_64 + len; + data->msl = msl; + return 1; } /** - * Iterator function for rte_mempool_walk() to register existing mempools and - * fill the MP to MR cache of a TX queue. + * Create a new global Memroy Region (MR) for a missing virtual address. + * Register entire virtually contiguous memory chunk around the address. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this will not be updated. + * @param addr + * Target virtual address to register. * - * @param[in] mp - * Memory Pool to register. - * @param *arg - * Pointer to TX queue structure. + * @return + * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. */ -void -mlx5_mp2mr_iter(struct rte_mempool *mp, void *arg) +static uint32_t +mlx5_mr_create(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry, + uintptr_t addr) { - struct priv *priv = (struct priv *)arg; - struct mlx5_mp2mr_mbuf_check_data data = { - .ret = 0, + struct priv *priv = dev->data->dev_private; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + const struct rte_memseg_list *msl; + const struct rte_memseg *ms; + struct mlx5_mr *mr = NULL; + size_t len; + uint32_t ms_n; + uint32_t bmp_size; + void *bmp_mem; + int ms_idx_shift = -1; + unsigned int n; + struct mr_find_contig_memsegs_data data = { + .addr = addr, }; - struct mlx5_mr *mr; + struct mr_find_contig_memsegs_data data_re; - /* Register mempool only if the first element looks like a mbuf. */ - if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 || - data.ret == -1) - return; - mr = priv_mr_get(priv, mp); - if (mr) { - priv_mr_release(priv, mr); - return; + DRV_LOG(DEBUG, "port %u creating a MR using address (%p)", + dev->data->port_id, (void *)addr); + if (rte_eal_process_type() != RTE_PROC_PRIMARY) { + DRV_LOG(WARNING, + "port %u using address (%p) of unregistered mempool" + " in secondary process, please create mempool" + " before rte_eth_dev_start()", + dev->data->port_id, (void *)addr); + rte_errno = EPERM; + goto err_nolock; + } + /* + * Release detached MRs if any. This can't be called with holding either + * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have + * been detached by the memory free event but it couldn't be released + * inside the callback due to deadlock. As a result, releasing resources + * is quite opportunistic. + */ + mlx5_mr_garbage_collect(dev); + /* + * Find out a contiguous virtual address chunk in use, to which the + * given address belongs, in order to register maximum range. In the + * best case where mempools are not dynamically recreated and + * '--socket-mem' is speicified as an EAL option, it is very likely to + * have only one MR(LKey) per a socket and per a hugepage-size even + * though the system memory is highly fragmented. + */ + if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) { + DRV_LOG(WARNING, + "port %u unable to find virtually contiguous" + " chunk for address (%p)." + " rte_memseg_contig_walk() failed.", + dev->data->port_id, (void *)addr); + rte_errno = ENXIO; + goto err_nolock; + } +alloc_resources: + /* Addresses must be page-aligned. */ + assert(rte_is_aligned((void *)data.start, data.msl->page_sz)); + assert(rte_is_aligned((void *)data.end, data.msl->page_sz)); + msl = data.msl; + ms = rte_mem_virt2memseg((void *)data.start, msl); + len = data.end - data.start; + assert(msl->page_sz == ms->hugepage_sz); + /* Number of memsegs in the range. */ + ms_n = len / msl->page_sz; + DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " page_sz=0x%" PRIx64 ", ms_n=%u", + dev->data->port_id, (void *)addr, + data.start, data.end, msl->page_sz, ms_n); + /* Size of memory for bitmap. */ + bmp_size = rte_bitmap_get_memory_footprint(ms_n); + mr = rte_zmalloc_socket(NULL, + RTE_ALIGN_CEIL(sizeof(*mr), + RTE_CACHE_LINE_SIZE) + + bmp_size, + RTE_CACHE_LINE_SIZE, msl->socket_id); + if (mr == NULL) { + DEBUG("port %u unable to allocate memory for a new MR of" + " address (%p).", + dev->data->port_id, (void *)addr); + rte_errno = ENOMEM; + goto err_nolock; + } + mr->msl = msl; + /* + * Save the index of the first memseg and initialize memseg bitmap. To + * see if a memseg of ms_idx in the memseg-list is still valid, check: + * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx) + */ + mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE); + mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size); + if (mr->ms_bmp == NULL) { + DEBUG("port %u unable to initialize bitamp for a new MR of" + " address (%p).", + dev->data->port_id, (void *)addr); + rte_errno = EINVAL; + goto err_nolock; + } + /* + * Should recheck whether the extended contiguous chunk is still valid. + * Because memory_hotplug_lock can't be held if there's any memory + * related calls in a critical path, resource allocation above can't be + * locked. If the memory has been changed at this point, try again with + * just single page. If not, go on with the big chunk atomically from + * here. + */ + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); + data_re = data; + if (len > msl->page_sz && + !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) { + DEBUG("port %u unable to find virtually contiguous" + " chunk for address (%p)." + " rte_memseg_contig_walk() failed.", + dev->data->port_id, (void *)addr); + rte_errno = ENXIO; + goto err_memlock; + } + if (data.start != data_re.start || data.end != data_re.end) { + /* + * The extended contiguous chunk has been changed. Try again + * with single memseg instead. + */ + data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); + data.end = data.start + msl->page_sz; + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + mr_free(mr); + goto alloc_resources; + } + assert(data.msl == data_re.msl); + rte_rwlock_write_lock(&priv->mr.rwlock); + /* + * Check the address is really missing. If other thread already created + * one or it is not found due to overflow, abort and return. + */ + if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) { + /* + * Insert to the global cache table. It may fail due to + * low-on-memory. Then, this entry will have to be searched + * here again. + */ + mr_btree_insert(&priv->mr.cache, entry); + DEBUG("port %u found MR for %p on final lookup, abort", + dev->data->port_id, (void *)addr); + rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + /* + * Must be unlocked before calling rte_free() because + * mlx5_mr_mem_event_free_cb() can be called inside. + */ + mr_free(mr); + return entry->lkey; } - priv_mr_new(priv, mp); + /* + * Trim start and end addresses for verbs MR. Set bits for registering + * memsegs but exclude already registered ones. Bitmap can be + * fragmented. + */ + for (n = 0; n < ms_n; ++n) { + uintptr_t start; + struct mlx5_mr_cache ret = { 0, }; + + start = data_re.start + n * msl->page_sz; + /* Exclude memsegs already registered by other MRs. */ + if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) { + /* + * Start from the first unregistered memseg in the + * extended range. + */ + if (ms_idx_shift == -1) { + mr->ms_base_idx += n; + data.start = start; + ms_idx_shift = n; + } + data.end = start + msl->page_sz; + rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); + ++mr->ms_n; + } + } + len = data.end - data.start; + mr->ms_bmp_n = len / msl->page_sz; + assert(ms_idx_shift + mr->ms_bmp_n <= ms_n); + /* + * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be + * called with holding the memory lock because it doesn't use + * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket() + * through mlx5_alloc_verbs_buf(). + */ + mr->ibv_mr = mlx5_glue->reg_mr(priv->pd, (void *)data.start, len, + IBV_ACCESS_LOCAL_WRITE); + if (mr->ibv_mr == NULL) { + DEBUG("port %u fail to create a verbs MR for address (%p)", + dev->data->port_id, (void *)addr); + rte_errno = EINVAL; + goto err_mrlock; + } + assert((uintptr_t)mr->ibv_mr->addr == data.start); + assert(mr->ibv_mr->length == len); + LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + DEBUG("port %u MR CREATED (%p) for %p:\n" + " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", + dev->data->port_id, (void *)mr, (void *)addr, + data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); + /* Insert to the global cache table. */ + mr_insert_dev_cache(dev, mr); + /* Fill in output data. */ + mr_lookup_dev(dev, entry, addr); + /* Lookup can't fail. */ + assert(entry->lkey != UINT32_MAX); + rte_rwlock_write_unlock(&priv->mr.rwlock); + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); + return entry->lkey; +err_mrlock: + rte_rwlock_write_unlock(&priv->mr.rwlock); +err_memlock: + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +err_nolock: + /* + * In case of error, as this can be called in a datapath, a warning + * message per an error is preferable instead. Must be unlocked before + * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called + * inside. + */ + mr_free(mr); + return UINT32_MAX; } /** - * Register a new memory region from the mempool and store it in the memory - * region list. + * Rebuild the global B-tree cache of device from the original MR list. * - * @param priv - * Pointer to private structure. - * @param mp - * Pointer to the memory pool to register. - * @return - * The memory region on success. + * @param dev + * Pointer to Ethernet device. */ -struct mlx5_mr* -priv_mr_new(struct priv *priv, struct rte_mempool *mp) +static void +mr_rebuild_dev_cache(struct rte_eth_dev *dev) { - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - uintptr_t start; - uintptr_t end; - unsigned int i; + struct priv *priv = dev->data->dev_private; struct mlx5_mr *mr; - mr = rte_zmalloc_socket(__func__, sizeof(*mr), 0, mp->socket_id); - if (!mr) { - DEBUG("unable to configure MR, ibv_reg_mr() failed."); - return NULL; + DRV_LOG(DEBUG, "port %u rebuild dev cache[]", dev->data->port_id); + /* Flush cache to rebuild. */ + priv->mr.cache.len = 1; + priv->mr.cache.overflow = 0; + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) + if (mr_insert_dev_cache(dev, mr) < 0) + return; +} + +/** + * Callback for memory free event. Iterate freed memsegs and check whether it + * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a + * result, the MR would be fragmented. If it becomes empty, the MR will be freed + * later by mlx5_mr_garbage_collect(). Even if this callback is called from a + * secondary process, the garbage collector will be called in primary process + * as the secondary process can't call mlx5_mr_create(). + * + * The global cache must be rebuilt if there's any change and this event has to + * be propagated to dataplane threads to flush the local caches. + * + * @param dev + * Pointer to Ethernet device. + * @param addr + * Address of freed memory. + * @param len + * Size of freed memory. + */ +static void +mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len) +{ + struct priv *priv = dev->data->dev_private; + const struct rte_memseg_list *msl; + struct mlx5_mr *mr; + int ms_n; + int i; + int rebuild = 0; + + DEBUG("port %u free callback: addr=%p, len=%zu", + dev->data->port_id, addr, len); + msl = rte_mem_virt2memseg_list(addr); + /* addr and len must be page-aligned. */ + assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz)); + assert(len == RTE_ALIGN(len, msl->page_sz)); + ms_n = len / msl->page_sz; + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Clear bits of freed memsegs from MR. */ + for (i = 0; i < ms_n; ++i) { + const struct rte_memseg *ms; + struct mlx5_mr_cache entry; + uintptr_t start; + int ms_idx; + uint32_t pos; + + /* Find MR having this memseg. */ + start = (uintptr_t)addr + i * msl->page_sz; + mr = mr_lookup_dev_list(dev, &entry, start); + if (mr == NULL) + continue; + ms = rte_mem_virt2memseg((void *)start, msl); + assert(ms != NULL); + assert(msl->page_sz == ms->hugepage_sz); + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + pos = ms_idx - mr->ms_base_idx; + assert(rte_bitmap_get(mr->ms_bmp, pos)); + assert(pos < mr->ms_bmp_n); + DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p", + dev->data->port_id, (void *)mr, pos, (void *)start); + rte_bitmap_clear(mr->ms_bmp, pos); + if (--mr->ms_n == 0) { + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + DEBUG("port %u remove MR(%p) from list", + dev->data->port_id, (void *)mr); + } + /* + * MR is fragmented or will be freed. the global cache must be + * rebuilt. + */ + rebuild = 1; } - if (mlx5_check_mempool(mp, &start, &end) != 0) { - ERROR("mempool %p: not virtually contiguous", - (void *)mp); - return NULL; + if (rebuild) { + mr_rebuild_dev_cache(dev); + /* + * Flush local caches by propagating invalidation across cores. + * rte_smp_wmb() is enough to synchronize this event. If one of + * freed memsegs is seen by other core, that means the memseg + * has been allocated by allocator, which will come after this + * free call. Therefore, this store instruction (incrementing + * generation below) will be guaranteed to be seen by other core + * before the core sees the newly allocated memory. + */ + ++priv->mr.dev_gen; + DEBUG("broadcasting local cache flush, gen=%d", + priv->mr.dev_gen); + rte_smp_wmb(); } - DEBUG("mempool %p area start=%p end=%p size=%zu", - (void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - /* Save original addresses for exact MR lookup. */ - mr->start = start; - mr->end = end; - /* Round start and end to page boundary if found in memory segments. */ - for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) { - uintptr_t addr = (uintptr_t)ms[i].addr; - size_t len = ms[i].len; - unsigned int align = ms[i].hugepage_sz; - - if ((start > addr) && (start < addr + len)) - start = RTE_ALIGN_FLOOR(start, align); - if ((end > addr) && (end < addr + len)) - end = RTE_ALIGN_CEIL(end, align); + rte_rwlock_write_unlock(&priv->mr.rwlock); +} + +/** + * Callback for memory event. This can be called from both primary and secondary + * process. + * + * @param event_type + * Memory event type. + * @param addr + * Address of memory. + * @param len + * Size of memory. + */ +void +mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg __rte_unused) +{ + struct priv *priv; + struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list; + + switch (event_type) { + case RTE_MEM_EVENT_FREE: + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + /* Iterate all the existing mlx5 devices. */ + LIST_FOREACH(priv, dev_list, mem_event_cb) + mlx5_mr_mem_event_free_cb(ETH_DEV(priv), addr, len); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + break; + case RTE_MEM_EVENT_ALLOC: + default: + break; } - DEBUG("mempool %p using start=%p end=%p size=%zu for MR", - (void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - mr->mr = mlx5_glue->reg_mr(priv->pd, (void *)start, end - start, - IBV_ACCESS_LOCAL_WRITE); - mr->mp = mp; - mr->lkey = rte_cpu_to_be_32(mr->mr->lkey); - rte_atomic32_inc(&mr->refcnt); - DEBUG("%p: new Memory Region %p refcnt: %d", (void *)priv, - (void *)mr, rte_atomic32_read(&mr->refcnt)); - LIST_INSERT_HEAD(&priv->mr, mr, next); - return mr; } /** - * Search the memory region object in the memory region list. + * Look up address in the global MR cache table. If not found, create a new MR. + * Insert the found/created entry to local bottom-half cache table. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param[out] entry + * Pointer to returning MR cache entry, found in the global cache or newly + * created. If failed to create one, this is not written. + * @param addr + * Search key. * - * @param priv - * Pointer to private structure. - * @param mp - * Pointer to the memory pool to register. * @return - * The memory region on success. + * Searched LKey on success, UINT32_MAX on no match. */ -struct mlx5_mr* -priv_mr_get(struct priv *priv, struct rte_mempool *mp) +static uint32_t +mlx5_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct mlx5_mr_cache *entry, uintptr_t addr) { - struct mlx5_mr *mr; + struct priv *priv = dev->data->dev_private; + struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; + uint16_t idx; + uint32_t lkey; - assert(mp); - if (LIST_EMPTY(&priv->mr)) - return NULL; - LIST_FOREACH(mr, &priv->mr, next) { - if (mr->mp == mp) { - rte_atomic32_inc(&mr->refcnt); - DEBUG("Memory Region %p refcnt: %d", - (void *)mr, rte_atomic32_read(&mr->refcnt)); - return mr; - } + /* If local cache table is full, try to double it. */ + if (unlikely(bt->len == bt->size)) + mr_btree_expand(bt, bt->size << 1); + /* Look up in the global cache. */ + rte_rwlock_read_lock(&priv->mr.rwlock); + lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); + if (lkey != UINT32_MAX) { + /* Found. */ + *entry = (*priv->mr.cache.table)[idx]; + rte_rwlock_read_unlock(&priv->mr.rwlock); + /* + * Update local cache. Even if it fails, return the found entry + * to update top-half cache. Next time, this entry will be found + * in the global cache. + */ + mr_btree_insert(bt, entry); + return lkey; } - return NULL; + rte_rwlock_read_unlock(&priv->mr.rwlock); + /* First time to see the address? Create a new MR. */ + lkey = mlx5_mr_create(dev, entry, addr); + /* + * Update the local cache if successfully created a new global MR. Even + * if failed to create one, there's no action to take in this datapath + * code. As returning LKey is invalid, this will eventually make HW + * fail. + */ + if (lkey != UINT32_MAX) + mr_btree_insert(bt, entry); + return lkey; } /** - * Release the memory region object. + * Bottom-half of LKey search on datapath. Firstly search in cache_bh[] and if + * misses, search in the global MR cache table and update the new entry to + * per-queue local caches. * - * @param mr - * Pointer to memory region to release. + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param addr + * Search key. * * @return - * 0 on success, errno on failure. + * Searched LKey on success, UINT32_MAX on no match. */ -int -priv_mr_release(struct priv *priv, struct mlx5_mr *mr) +static uint32_t +mlx5_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + uintptr_t addr) { - (void)priv; - assert(mr); - DEBUG("Memory Region %p refcnt: %d", - (void *)mr, rte_atomic32_read(&mr->refcnt)); - if (rte_atomic32_dec_and_test(&mr->refcnt)) { - claim_zero(mlx5_glue->dereg_mr(mr->mr)); - LIST_REMOVE(mr, next); - rte_free(mr); - return 0; + uint32_t lkey; + uint16_t bh_idx = 0; + /* Victim in top-half cache to replace with new entry. */ + struct mlx5_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head]; + + /* Binary-search MR translation table. */ + lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); + /* Update top-half cache. */ + if (likely(lkey != UINT32_MAX)) { + *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; + } else { + /* + * If missed in local lookup table, search in the global cache + * and local cache_bh[] will be updated inside if possible. + * Top-half cache entry will also be updated. + */ + lkey = mlx5_mr_lookup_dev(dev, mr_ctrl, repl, addr); + if (unlikely(lkey == UINT32_MAX)) + return UINT32_MAX; } - return EBUSY; + /* Update the most recently used entry. */ + mr_ctrl->mru = mr_ctrl->head; + /* Point to the next victim, the oldest. */ + mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; + return lkey; } /** - * Verify the flow list is empty + * Bottom-half of LKey search on Rx. * - * @param priv - * Pointer to private structure. + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Search key. * - * @return the number of object not released. + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr) +{ + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + struct priv *priv = rxq_ctrl->priv; + + DRV_LOG(DEBUG, + "Rx queue %u: miss on top-half, mru=%u, head=%u, addr=%p", + rxq_ctrl->idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr); + return mlx5_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr); +} + +/** + * Bottom-half of LKey search on Tx. + * + * @param txq + * Pointer to Tx queue structure. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +uint32_t +mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr) +{ + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct priv *priv = txq_ctrl->priv; + + DRV_LOG(DEBUG, + "Tx queue %u: miss on top-half, mru=%u, head=%u, addr=%p", + txq_ctrl->idx, mr_ctrl->mru, mr_ctrl->head, (void *)addr); + return mlx5_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr); +} + +/** + * Flush all of the local cache entries. + * + * @param mr_ctrl + * Pointer to per-queue MR control structure. + */ +void +mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl) +{ + /* Reset the most-recently-used index. */ + mr_ctrl->mru = 0; + /* Reset the linear search array. */ + mr_ctrl->head = 0; + memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); + /* Reset the B-tree table. */ + mr_ctrl->cache_bh.len = 1; + mr_ctrl->cache_bh.overflow = 0; + /* Update the generation number. */ + mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr; + DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d", + (void *)mr_ctrl, mr_ctrl->cur_gen); +} + +/* Called during rte_mempool_mem_iter() by mlx5_mr_update_mp(). */ +static void +mlx5_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + uint32_t lkey; + + /* Stop iteration if failed in the previous walk. */ + if (data->ret < 0) + return; + /* Register address of the chunk and update local caches. */ + lkey = mlx5_mr_addr2mr_bh(data->dev, data->mr_ctrl, + (uintptr_t)memhdr->addr); + if (lkey == UINT32_MAX) + data->ret = -1; +} + +/** + * Register entire memory chunks in a Mempool. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. */ int -priv_mr_verify(struct priv *priv) +mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) { - int ret = 0; + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx5_mr_update_mp_cb, &data); + return data.ret; +} + +/** + * Dump all the created MRs and the global cache entries. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_mr_dump_dev(struct rte_eth_dev *dev __rte_unused) +{ +#ifndef NDEBUG + struct priv *priv = dev->data->dev_private; struct mlx5_mr *mr; + int mr_n = 0; + int chunk_n = 0; - LIST_FOREACH(mr, &priv->mr, next) { - DEBUG("%p: mr %p still referenced", (void *)priv, - (void *)mr); - ++ret; + rte_rwlock_read_lock(&priv->mr.rwlock); + /* Iterate all the existing MRs. */ + LIST_FOREACH(mr, &priv->mr.mr_list, mr) { + unsigned int n; + + DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", + dev->data->port_id, mr_n++, + rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_n, mr->ms_bmp_n); + if (mr->ms_n == 0) + continue; + for (n = 0; n < mr->ms_bmp_n; ) { + struct mlx5_mr_cache ret = { 0, }; + + n = mr_find_next_chunk(mr, &ret, n); + if (!ret.end) + break; + DEBUG(" chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", + chunk_n++, ret.start, ret.end); + } } - return ret; + DEBUG("port %u dumping global cache", dev->data->port_id); + mlx5_mr_btree_dump(&priv->mr.cache); + rte_rwlock_read_unlock(&priv->mr.rwlock); +#endif +} + +/** + * Release all the created MRs and resources. Remove device from memory callback + * list. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_mr_release(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_mr *mr_next = LIST_FIRST(&priv->mr.mr_list); + + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_REMOVE(priv, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); + if (rte_log_get_level(mlx5_logtype) == RTE_LOG_DEBUG) + mlx5_mr_dump_dev(dev); + rte_rwlock_write_lock(&priv->mr.rwlock); + /* Detach from MR list and move to free list. */ + while (mr_next != NULL) { + struct mlx5_mr *mr = mr_next; + + mr_next = LIST_NEXT(mr, mr); + LIST_REMOVE(mr, mr); + LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); + } + LIST_INIT(&priv->mr.mr_list); + /* Free global cache. */ + mlx5_mr_btree_free(&priv->mr.cache); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Free all remaining MRs. */ + mlx5_mr_garbage_collect(dev); } diff --git a/drivers/net/mlx5/mlx5_mr.h b/drivers/net/mlx5/mlx5_mr.h new file mode 100644 index 00000000..a57003fe --- /dev/null +++ b/drivers/net/mlx5/mlx5_mr.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_MR_H_ +#define RTE_PMD_MLX5_MR_H_ + +#include <stddef.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#include <infiniband/mlx5dv.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_eal_memconfig.h> +#include <rte_ethdev.h> +#include <rte_rwlock.h> +#include <rte_bitmap.h> + +/* Memory Region object. */ +struct mlx5_mr { + LIST_ENTRY(mlx5_mr) mr; /**< Pointer to the prev/next entry. */ + struct ibv_mr *ibv_mr; /* Verbs Memory Region. */ + const struct rte_memseg_list *msl; + int ms_base_idx; /* Start index of msl->memseg_arr[]. */ + int ms_n; /* Number of memsegs in use. */ + uint32_t ms_bmp_n; /* Number of bits in memsegs bit-mask. */ + struct rte_bitmap *ms_bmp; /* Bit-mask of memsegs belonged to MR. */ +}; + +/* Cache entry for Memory Region. */ +struct mlx5_mr_cache { + uintptr_t start; /* Start address of MR. */ + uintptr_t end; /* End address of MR. */ + uint32_t lkey; /* rte_cpu_to_be_32(ibv_mr->lkey). */ +} __rte_packed; + +/* MR Cache table for Binary search. */ +struct mlx5_mr_btree { + uint16_t len; /* Number of entries. */ + uint16_t size; /* Total number of entries. */ + int overflow; /* Mark failure of table expansion. */ + struct mlx5_mr_cache (*table)[]; +} __rte_packed; + +/* Per-queue MR control descriptor. */ +struct mlx5_mr_ctrl { + uint32_t *dev_gen_ptr; /* Generation number of device to poll. */ + uint32_t cur_gen; /* Generation number saved to flush caches. */ + uint16_t mru; /* Index of last hit entry in top-half cache. */ + uint16_t head; /* Index of the oldest entry in top-half cache. */ + struct mlx5_mr_cache cache[MLX5_MR_CACHE_N]; /* Cache for top-half. */ + struct mlx5_mr_btree cache_bh; /* Cache for bottom-half. */ +} __rte_packed; + +extern struct mlx5_dev_list mlx5_mem_event_cb_list; +extern rte_rwlock_t mlx5_mem_event_rwlock; + +/* First entry must be NULL for comparison. */ +#define mlx5_mr_btree_len(bt) ((bt)->len - 1) + +int mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket); +void mlx5_mr_btree_free(struct mlx5_mr_btree *bt); +void mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, + size_t len, void *arg); +int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp); +void mlx5_mr_release(struct rte_eth_dev *dev); + +/* Debug purpose functions. */ +void mlx5_mr_btree_dump(struct mlx5_mr_btree *bt); +void mlx5_mr_dump_dev(struct rte_eth_dev *dev); + +/** + * Look up LKey from given lookup table by linear search. Firstly look up the + * last-hit entry. If miss, the entire array is searched. If found, update the + * last-hit index and return LKey. + * + * @param lkp_tbl + * Pointer to lookup table. + * @param[in,out] cached_idx + * Pointer to last-hit index. + * @param n + * Size of lookup table. + * @param addr + * Search key. + * + * @return + * Searched LKey on success, UINT32_MAX on no match. + */ +static __rte_always_inline uint32_t +mlx5_mr_lookup_cache(struct mlx5_mr_cache *lkp_tbl, uint16_t *cached_idx, + uint16_t n, uintptr_t addr) +{ + uint16_t idx; + + if (likely(addr >= lkp_tbl[*cached_idx].start && + addr < lkp_tbl[*cached_idx].end)) + return lkp_tbl[*cached_idx].lkey; + for (idx = 0; idx < n && lkp_tbl[idx].start != 0; ++idx) { + if (addr >= lkp_tbl[idx].start && + addr < lkp_tbl[idx].end) { + /* Found. */ + *cached_idx = idx; + return lkp_tbl[idx].lkey; + } + } + return UINT32_MAX; +} + +#endif /* RTE_PMD_MLX5_MR_H_ */ diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c new file mode 100644 index 00000000..d61826ae --- /dev/null +++ b/drivers/net/mlx5/mlx5_nl.c @@ -0,0 +1,916 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <errno.h> +#include <linux/if_link.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <net/if.h> +#include <rdma/rdma_netlink.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> + +#include <rte_errno.h> + +#include "mlx5.h" +#include "mlx5_utils.h" + +/* Size of the buffer to receive kernel messages */ +#define MLX5_NL_BUF_SIZE (32 * 1024) +/* Send buffer size for the Netlink socket */ +#define MLX5_SEND_BUF_SIZE 32768 +/* Receive buffer size for the Netlink socket */ +#define MLX5_RECV_BUF_SIZE 32768 + +/* + * Define NDA_RTA as defined in iproute2 sources. + * + * see in iproute2 sources file include/libnetlink.h + */ +#ifndef MLX5_NDA_RTA +#define MLX5_NDA_RTA(r) \ + ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg)))) +#endif + +/* + * The following definitions are normally found in rdma/rdma_netlink.h, + * however they are so recent that most systems do not expose them yet. + */ +#ifndef HAVE_RDMA_NL_NLDEV +#define RDMA_NL_NLDEV 5 +#endif +#ifndef HAVE_RDMA_NLDEV_CMD_GET +#define RDMA_NLDEV_CMD_GET 1 +#endif +#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET +#define RDMA_NLDEV_CMD_PORT_GET 5 +#endif +#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX +#define RDMA_NLDEV_ATTR_DEV_INDEX 1 +#endif +#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME +#define RDMA_NLDEV_ATTR_DEV_NAME 2 +#endif +#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX +#define RDMA_NLDEV_ATTR_PORT_INDEX 3 +#endif +#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX +#define RDMA_NLDEV_ATTR_NDEV_INDEX 50 +#endif + +/* These are normally found in linux/if_link.h. */ +#ifndef HAVE_IFLA_PHYS_SWITCH_ID +#define IFLA_PHYS_SWITCH_ID 36 +#endif +#ifndef HAVE_IFLA_PHYS_PORT_NAME +#define IFLA_PHYS_PORT_NAME 38 +#endif + +/* Add/remove MAC address through Netlink */ +struct mlx5_nl_mac_addr { + struct ether_addr (*mac)[]; + /**< MAC address handled by the device. */ + int mac_n; /**< Number of addresses in the array. */ +}; + +/** Data structure used by mlx5_nl_ifindex_cb(). */ +struct mlx5_nl_ifindex_data { + const char *name; /**< IB device name (in). */ + uint32_t ibindex; /**< IB device index (out). */ + uint32_t ifindex; /**< Network interface index (out). */ +}; + +/** + * Opens a Netlink socket. + * + * @param protocol + * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA). + * + * @return + * A file descriptor on success, a negative errno value otherwise and + * rte_errno is set. + */ +int +mlx5_nl_init(int protocol) +{ + int fd; + int sndbuf_size = MLX5_SEND_BUF_SIZE; + int rcvbuf_size = MLX5_RECV_BUF_SIZE; + struct sockaddr_nl local = { + .nl_family = AF_NETLINK, + }; + int ret; + + fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol); + if (fd == -1) { + rte_errno = errno; + return -rte_errno; + } + ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int)); + if (ret == -1) { + rte_errno = errno; + goto error; + } + ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int)); + if (ret == -1) { + rte_errno = errno; + goto error; + } + ret = bind(fd, (struct sockaddr *)&local, sizeof(local)); + if (ret == -1) { + rte_errno = errno; + goto error; + } + return fd; +error: + close(fd); + return -rte_errno; +} + +/** + * Send a request message to the kernel on the Netlink socket. + * + * @param[in] nlsk_fd + * Netlink socket file descriptor. + * @param[in] nh + * The Netlink message send to the kernel. + * @param[in] ssn + * Sequence number. + * @param[in] req + * Pointer to the request structure. + * @param[in] len + * Length of the request in bytes. + * + * @return + * The number of sent bytes on success, a negative errno value otherwise and + * rte_errno is set. + */ +static int +mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req, + int len) +{ + struct sockaddr_nl sa = { + .nl_family = AF_NETLINK, + }; + struct iovec iov[2] = { + { .iov_base = nh, .iov_len = sizeof(*nh), }, + { .iov_base = req, .iov_len = len, }, + }; + struct msghdr msg = { + .msg_name = &sa, + .msg_namelen = sizeof(sa), + .msg_iov = iov, + .msg_iovlen = 2, + }; + int send_bytes; + + nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ + nh->nlmsg_seq = sn; + send_bytes = sendmsg(nlsk_fd, &msg, 0); + if (send_bytes < 0) { + rte_errno = errno; + return -rte_errno; + } + return send_bytes; +} + +/** + * Send a message to the kernel on the Netlink socket. + * + * @param[in] nlsk_fd + * The Netlink socket file descriptor used for communication. + * @param[in] nh + * The Netlink message send to the kernel. + * @param[in] sn + * Sequence number. + * + * @return + * The number of sent bytes on success, a negative errno value otherwise and + * rte_errno is set. + */ +static int +mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn) +{ + struct sockaddr_nl sa = { + .nl_family = AF_NETLINK, + }; + struct iovec iov = { + .iov_base = nh, + .iov_len = nh->nlmsg_len, + }; + struct msghdr msg = { + .msg_name = &sa, + .msg_namelen = sizeof(sa), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + int send_bytes; + + nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ + nh->nlmsg_seq = sn; + send_bytes = sendmsg(nlsk_fd, &msg, 0); + if (send_bytes < 0) { + rte_errno = errno; + return -rte_errno; + } + return send_bytes; +} + +/** + * Receive a message from the kernel on the Netlink socket, following + * mlx5_nl_send(). + * + * @param[in] nlsk_fd + * The Netlink socket file descriptor used for communication. + * @param[in] sn + * Sequence number. + * @param[in] cb + * The callback function to call for each Netlink message received. + * @param[in, out] arg + * Custom arguments for the callback. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg), + void *arg) +{ + struct sockaddr_nl sa; + char buf[MLX5_RECV_BUF_SIZE]; + struct iovec iov = { + .iov_base = buf, + .iov_len = sizeof(buf), + }; + struct msghdr msg = { + .msg_name = &sa, + .msg_namelen = sizeof(sa), + .msg_iov = &iov, + /* One message at a time */ + .msg_iovlen = 1, + }; + int multipart = 0; + int ret = 0; + + do { + struct nlmsghdr *nh; + int recv_bytes = 0; + + do { + recv_bytes = recvmsg(nlsk_fd, &msg, 0); + if (recv_bytes == -1) { + rte_errno = errno; + return -rte_errno; + } + nh = (struct nlmsghdr *)buf; + } while (nh->nlmsg_seq != sn); + for (; + NLMSG_OK(nh, (unsigned int)recv_bytes); + nh = NLMSG_NEXT(nh, recv_bytes)) { + if (nh->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err_data = NLMSG_DATA(nh); + + if (err_data->error < 0) { + rte_errno = -err_data->error; + return -rte_errno; + } + /* Ack message. */ + return 0; + } + /* Multi-part msgs and their trailing DONE message. */ + if (nh->nlmsg_flags & NLM_F_MULTI) { + if (nh->nlmsg_type == NLMSG_DONE) + return 0; + multipart = 1; + } + if (cb) { + ret = cb(nh, arg); + if (ret < 0) + return ret; + } + } + } while (multipart); + return ret; +} + +/** + * Parse Netlink message to retrieve the bridge MAC address. + * + * @param nh + * Pointer to Netlink Message Header. + * @param arg + * PMD data register with this callback. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg) +{ + struct mlx5_nl_mac_addr *data = arg; + struct ndmsg *r = NLMSG_DATA(nh); + struct rtattr *attribute; + int len; + + len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)); + for (attribute = MLX5_NDA_RTA(r); + RTA_OK(attribute, len); + attribute = RTA_NEXT(attribute, len)) { + if (attribute->rta_type == NDA_LLADDR) { + if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) { + DRV_LOG(WARNING, + "not enough room to finalize the" + " request"); + rte_errno = ENOMEM; + return -rte_errno; + } +#ifndef NDEBUG + char m[18]; + + ether_format_addr(m, 18, RTA_DATA(attribute)); + DRV_LOG(DEBUG, "bridge MAC address %s", m); +#endif + memcpy(&(*data->mac)[data->mac_n++], + RTA_DATA(attribute), ETHER_ADDR_LEN); + } + } + return 0; +} + +/** + * Get bridge MAC addresses. + * + * @param dev + * Pointer to Ethernet device. + * @param mac[out] + * Pointer to the array table of MAC addresses to fill. + * Its size should be of MLX5_MAX_MAC_ADDRESSES. + * @param mac_n[out] + * Number of entries filled in MAC array. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[], + int *mac_n) +{ + struct priv *priv = dev->data->dev_private; + unsigned int iface_idx = mlx5_ifindex(dev); + struct { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + } req = { + .hdr = { + .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlmsg_type = RTM_GETNEIGH, + .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + }, + .ifm = { + .ifi_family = PF_BRIDGE, + .ifi_index = iface_idx, + }, + }; + struct mlx5_nl_mac_addr data = { + .mac = mac, + .mac_n = 0, + }; + int fd; + int ret; + uint32_t sn = priv->nl_sn++; + + if (priv->nl_socket_route == -1) + return 0; + fd = priv->nl_socket_route; + ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm, + sizeof(struct ifinfomsg)); + if (ret < 0) + goto error; + ret = mlx5_nl_recv(fd, sn, mlx5_nl_mac_addr_cb, &data); + if (ret < 0) + goto error; + *mac_n = data.mac_n; + return 0; +error: + DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s", + dev->data->port_id, strerror(rte_errno)); + return -rte_errno; +} + +/** + * Modify the MAC address neighbour table with Netlink. + * + * @param dev + * Pointer to Ethernet device. + * @param mac + * MAC address to consider. + * @param add + * 1 to add the MAC address, 0 to remove the MAC address. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac, + int add) +{ + struct priv *priv = dev->data->dev_private; + unsigned int iface_idx = mlx5_ifindex(dev); + struct { + struct nlmsghdr hdr; + struct ndmsg ndm; + struct rtattr rta; + uint8_t buffer[ETHER_ADDR_LEN]; + } req = { + .hdr = { + .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), + .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | + NLM_F_EXCL | NLM_F_ACK, + .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH, + }, + .ndm = { + .ndm_family = PF_BRIDGE, + .ndm_state = NUD_NOARP | NUD_PERMANENT, + .ndm_ifindex = iface_idx, + .ndm_flags = NTF_SELF, + }, + .rta = { + .rta_type = NDA_LLADDR, + .rta_len = RTA_LENGTH(ETHER_ADDR_LEN), + }, + }; + int fd; + int ret; + uint32_t sn = priv->nl_sn++; + + if (priv->nl_socket_route == -1) + return 0; + fd = priv->nl_socket_route; + memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN); + req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + + RTA_ALIGN(req.rta.rta_len); + ret = mlx5_nl_send(fd, &req.hdr, sn); + if (ret < 0) + goto error; + ret = mlx5_nl_recv(fd, sn, NULL, NULL); + if (ret < 0) + goto error; + return 0; +error: + DRV_LOG(DEBUG, + "port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X" + " %s", + dev->data->port_id, + add ? "add" : "remove", + mac->addr_bytes[0], mac->addr_bytes[1], + mac->addr_bytes[2], mac->addr_bytes[3], + mac->addr_bytes[4], mac->addr_bytes[5], + strerror(rte_errno)); + return -rte_errno; +} + +/** + * Add a MAC address. + * + * @param dev + * Pointer to Ethernet device. + * @param mac + * MAC address to register. + * @param index + * MAC address index. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index) +{ + struct priv *priv = dev->data->dev_private; + int ret; + + ret = mlx5_nl_mac_addr_modify(dev, mac, 1); + if (!ret) + BITFIELD_SET(priv->mac_own, index); + if (ret == -EEXIST) + return 0; + return ret; +} + +/** + * Remove a MAC address. + * + * @param dev + * Pointer to Ethernet device. + * @param mac + * MAC address to remove. + * @param index + * MAC address index. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac, + uint32_t index) +{ + struct priv *priv = dev->data->dev_private; + + BITFIELD_RESET(priv->mac_own, index); + return mlx5_nl_mac_addr_modify(dev, mac, 0); +} + +/** + * Synchronize Netlink bridge table to the internal table. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev) +{ + struct ether_addr macs[MLX5_MAX_MAC_ADDRESSES]; + int macs_n = 0; + int i; + int ret; + + ret = mlx5_nl_mac_addr_list(dev, &macs, &macs_n); + if (ret) + return; + for (i = 0; i != macs_n; ++i) { + int j; + + /* Verify the address is not in the array yet. */ + for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) + if (is_same_ether_addr(&macs[i], + &dev->data->mac_addrs[j])) + break; + if (j != MLX5_MAX_MAC_ADDRESSES) + continue; + /* Find the first entry available. */ + for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) { + if (is_zero_ether_addr(&dev->data->mac_addrs[j])) { + dev->data->mac_addrs[j] = macs[i]; + break; + } + } + } +} + +/** + * Flush all added MAC addresses. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + int i; + + for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) { + struct ether_addr *m = &dev->data->mac_addrs[i]; + + if (BITFIELD_ISSET(priv->mac_own, i)) + mlx5_nl_mac_addr_remove(dev, m, i); + } +} + +/** + * Enable promiscuous / all multicast mode through Netlink. + * + * @param dev + * Pointer to Ethernet device structure. + * @param flags + * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti. + * @param enable + * Nonzero to enable, disable otherwise. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable) +{ + struct priv *priv = dev->data->dev_private; + unsigned int iface_idx = mlx5_ifindex(dev); + struct { + struct nlmsghdr hdr; + struct ifinfomsg ifi; + } req = { + .hdr = { + .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlmsg_type = RTM_NEWLINK, + .nlmsg_flags = NLM_F_REQUEST, + }, + .ifi = { + .ifi_flags = enable ? flags : 0, + .ifi_change = flags, + .ifi_index = iface_idx, + }, + }; + int fd; + int ret; + + assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI))); + if (priv->nl_socket_route < 0) + return 0; + fd = priv->nl_socket_route; + ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++); + if (ret < 0) + return ret; + return 0; +} + +/** + * Enable promiscuous mode through Netlink. + * + * @param dev + * Pointer to Ethernet device structure. + * @param enable + * Nonzero to enable, disable otherwise. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_promisc(struct rte_eth_dev *dev, int enable) +{ + int ret = mlx5_nl_device_flags(dev, IFF_PROMISC, enable); + + if (ret) + DRV_LOG(DEBUG, + "port %u cannot %s promisc mode: Netlink error %s", + dev->data->port_id, enable ? "enable" : "disable", + strerror(rte_errno)); + return ret; +} + +/** + * Enable all multicast mode through Netlink. + * + * @param dev + * Pointer to Ethernet device structure. + * @param enable + * Nonzero to enable, disable otherwise. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable) +{ + int ret = mlx5_nl_device_flags(dev, IFF_ALLMULTI, enable); + + if (ret) + DRV_LOG(DEBUG, + "port %u cannot %s allmulti mode: Netlink error %s", + dev->data->port_id, enable ? "enable" : "disable", + strerror(rte_errno)); + return ret; +} + +/** + * Process network interface information from Netlink message. + * + * @param nh + * Pointer to Netlink message header. + * @param arg + * Opaque data pointer for this callback. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_ifindex_cb(struct nlmsghdr *nh, void *arg) +{ + struct mlx5_nl_ifindex_data *data = arg; + size_t off = NLMSG_HDRLEN; + uint32_t ibindex = 0; + uint32_t ifindex = 0; + int found = 0; + + if (nh->nlmsg_type != + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) && + nh->nlmsg_type != + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET)) + goto error; + while (off < nh->nlmsg_len) { + struct nlattr *na = (void *)((uintptr_t)nh + off); + void *payload = (void *)((uintptr_t)na + NLA_HDRLEN); + + if (na->nla_len > nh->nlmsg_len - off) + goto error; + switch (na->nla_type) { + case RDMA_NLDEV_ATTR_DEV_INDEX: + ibindex = *(uint32_t *)payload; + break; + case RDMA_NLDEV_ATTR_DEV_NAME: + if (!strcmp(payload, data->name)) + found = 1; + break; + case RDMA_NLDEV_ATTR_NDEV_INDEX: + ifindex = *(uint32_t *)payload; + break; + default: + break; + } + off += NLA_ALIGN(na->nla_len); + } + if (found) { + data->ibindex = ibindex; + data->ifindex = ifindex; + } + return 0; +error: + rte_errno = EINVAL; + return -rte_errno; +} + +/** + * Get index of network interface associated with some IB device. + * + * This is the only somewhat safe method to avoid resorting to heuristics + * when faced with port representors. Unfortunately it requires at least + * Linux 4.17. + * + * @param nl + * Netlink socket of the RDMA kind (NETLINK_RDMA). + * @param[in] name + * IB device name. + * + * @return + * A valid (nonzero) interface index on success, 0 otherwise and rte_errno + * is set. + */ +unsigned int +mlx5_nl_ifindex(int nl, const char *name) +{ + static const uint32_t pindex = 1; + uint32_t seq = random(); + struct mlx5_nl_ifindex_data data = { + .name = name, + .ibindex = 0, /* Determined during first pass. */ + .ifindex = 0, /* Determined during second pass. */ + }; + union { + struct nlmsghdr nh; + uint8_t buf[NLMSG_HDRLEN + + NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) + + NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))]; + } req = { + .nh = { + .nlmsg_len = NLMSG_LENGTH(0), + .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_GET), + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, + }, + }; + struct nlattr *na; + int ret; + + ret = mlx5_nl_send(nl, &req.nh, seq); + if (ret < 0) + return 0; + ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data); + if (ret < 0) + return 0; + if (!data.ibindex) + goto error; + ++seq; + req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_PORT_GET); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN); + na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN); + na->nla_len = NLA_HDRLEN + sizeof(data.ibindex); + na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX; + memcpy((void *)((uintptr_t)na + NLA_HDRLEN), + &data.ibindex, sizeof(data.ibindex)); + na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len)); + na->nla_len = NLA_HDRLEN + sizeof(pindex); + na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX; + memcpy((void *)((uintptr_t)na + NLA_HDRLEN), + &pindex, sizeof(pindex)); + ret = mlx5_nl_send(nl, &req.nh, seq); + if (ret < 0) + return 0; + ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data); + if (ret < 0) + return 0; + if (!data.ifindex) + goto error; + return data.ifindex; +error: + rte_errno = ENODEV; + return 0; +} + +/** + * Process switch information from Netlink message. + * + * @param nh + * Pointer to Netlink message header. + * @param arg + * Opaque data pointer for this callback. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg) +{ + struct mlx5_switch_info info = { + .master = 0, + .representor = 0, + .port_name = 0, + .switch_id = 0, + }; + size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + bool port_name_set = false; + bool switch_id_set = false; + + if (nh->nlmsg_type != RTM_NEWLINK) + goto error; + while (off < nh->nlmsg_len) { + struct rtattr *ra = (void *)((uintptr_t)nh + off); + void *payload = RTA_DATA(ra); + char *end; + unsigned int i; + + if (ra->rta_len > nh->nlmsg_len - off) + goto error; + switch (ra->rta_type) { + case IFLA_PHYS_PORT_NAME: + errno = 0; + info.port_name = strtol(payload, &end, 0); + if (errno || + (size_t)(end - (char *)payload) != strlen(payload)) + goto error; + port_name_set = true; + break; + case IFLA_PHYS_SWITCH_ID: + info.switch_id = 0; + for (i = 0; i < RTA_PAYLOAD(ra); ++i) { + info.switch_id <<= 8; + info.switch_id |= ((uint8_t *)payload)[i]; + } + switch_id_set = true; + break; + } + off += RTA_ALIGN(ra->rta_len); + } + info.master = switch_id_set && !port_name_set; + info.representor = switch_id_set && port_name_set; + memcpy(arg, &info, sizeof(info)); + return 0; +error: + rte_errno = EINVAL; + return -rte_errno; +} + +/** + * Get switch information associated with network interface. + * + * @param nl + * Netlink socket of the ROUTE kind (NETLINK_ROUTE). + * @param ifindex + * Network interface index. + * @param[out] info + * Switch information object, populated in case of success. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info) +{ + uint32_t seq = random(); + struct { + struct nlmsghdr nh; + struct ifinfomsg info; + } req = { + .nh = { + .nlmsg_len = NLMSG_LENGTH(sizeof(req.info)), + .nlmsg_type = RTM_GETLINK, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, + }, + .info = { + .ifi_family = AF_UNSPEC, + .ifi_index = ifindex, + }, + }; + int ret; + + ret = mlx5_nl_send(nl, &req.nh, seq); + if (ret >= 0) + ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info); + return ret; +} diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c new file mode 100644 index 00000000..a1c8c340 --- /dev/null +++ b/drivers/net/mlx5/mlx5_nl_flow.c @@ -0,0 +1,1248 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <assert.h> +#include <errno.h> +#include <libmnl/libmnl.h> +#include <linux/if_ether.h> +#include <linux/netlink.h> +#include <linux/pkt_cls.h> +#include <linux/pkt_sched.h> +#include <linux/rtnetlink.h> +#include <linux/tc_act/tc_gact.h> +#include <linux/tc_act/tc_mirred.h> +#include <netinet/in.h> +#include <stdalign.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <sys/socket.h> + +#include <rte_byteorder.h> +#include <rte_errno.h> +#include <rte_ether.h> +#include <rte_flow.h> + +#include "mlx5.h" +#include "mlx5_autoconf.h" + +#ifdef HAVE_TC_ACT_VLAN + +#include <linux/tc_act/tc_vlan.h> + +#else /* HAVE_TC_ACT_VLAN */ + +#define TCA_VLAN_ACT_POP 1 +#define TCA_VLAN_ACT_PUSH 2 +#define TCA_VLAN_ACT_MODIFY 3 +#define TCA_VLAN_PARMS 2 +#define TCA_VLAN_PUSH_VLAN_ID 3 +#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4 +#define TCA_VLAN_PAD 5 +#define TCA_VLAN_PUSH_VLAN_PRIORITY 6 + +struct tc_vlan { + tc_gen; + int v_action; +}; + +#endif /* HAVE_TC_ACT_VLAN */ + +/* Normally found in linux/netlink.h. */ +#ifndef NETLINK_CAP_ACK +#define NETLINK_CAP_ACK 10 +#endif + +/* Normally found in linux/pkt_sched.h. */ +#ifndef TC_H_MIN_INGRESS +#define TC_H_MIN_INGRESS 0xfff2u +#endif + +/* Normally found in linux/pkt_cls.h. */ +#ifndef TCA_CLS_FLAGS_SKIP_SW +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) +#endif +#ifndef HAVE_TCA_FLOWER_ACT +#define TCA_FLOWER_ACT 3 +#endif +#ifndef HAVE_TCA_FLOWER_FLAGS +#define TCA_FLOWER_FLAGS 22 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE +#define TCA_FLOWER_KEY_ETH_TYPE 8 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST +#define TCA_FLOWER_KEY_ETH_DST 4 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK +#define TCA_FLOWER_KEY_ETH_DST_MASK 5 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC +#define TCA_FLOWER_KEY_ETH_SRC 6 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK +#define TCA_FLOWER_KEY_ETH_SRC_MASK 7 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO +#define TCA_FLOWER_KEY_IP_PROTO 9 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC +#define TCA_FLOWER_KEY_IPV4_SRC 10 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK +#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST +#define TCA_FLOWER_KEY_IPV4_DST 12 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK +#define TCA_FLOWER_KEY_IPV4_DST_MASK 13 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC +#define TCA_FLOWER_KEY_IPV6_SRC 14 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK +#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST +#define TCA_FLOWER_KEY_IPV6_DST 16 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK +#define TCA_FLOWER_KEY_IPV6_DST_MASK 17 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC +#define TCA_FLOWER_KEY_TCP_SRC 18 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK +#define TCA_FLOWER_KEY_TCP_SRC_MASK 35 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST +#define TCA_FLOWER_KEY_TCP_DST 19 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK +#define TCA_FLOWER_KEY_TCP_DST_MASK 36 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC +#define TCA_FLOWER_KEY_UDP_SRC 20 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK +#define TCA_FLOWER_KEY_UDP_SRC_MASK 37 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST +#define TCA_FLOWER_KEY_UDP_DST 21 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK +#define TCA_FLOWER_KEY_UDP_DST_MASK 38 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID +#define TCA_FLOWER_KEY_VLAN_ID 23 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO +#define TCA_FLOWER_KEY_VLAN_PRIO 24 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE +#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25 +#endif + +/** Parser state definitions for mlx5_nl_flow_trans[]. */ +enum mlx5_nl_flow_trans { + INVALID, + BACK, + ATTR, + PATTERN, + ITEM_VOID, + ITEM_PORT_ID, + ITEM_ETH, + ITEM_VLAN, + ITEM_IPV4, + ITEM_IPV6, + ITEM_TCP, + ITEM_UDP, + ACTIONS, + ACTION_VOID, + ACTION_PORT_ID, + ACTION_DROP, + ACTION_OF_POP_VLAN, + ACTION_OF_PUSH_VLAN, + ACTION_OF_SET_VLAN_VID, + ACTION_OF_SET_VLAN_PCP, + END, +}; + +#define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, } + +#define PATTERN_COMMON \ + ITEM_VOID, ITEM_PORT_ID, ACTIONS +#define ACTIONS_COMMON \ + ACTION_VOID, ACTION_OF_POP_VLAN, ACTION_OF_PUSH_VLAN, \ + ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP +#define ACTIONS_FATE \ + ACTION_PORT_ID, ACTION_DROP + +/** Parser state transitions used by mlx5_nl_flow_transpose(). */ +static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = { + [INVALID] = NULL, + [BACK] = NULL, + [ATTR] = TRANS(PATTERN), + [PATTERN] = TRANS(ITEM_ETH, PATTERN_COMMON), + [ITEM_VOID] = TRANS(BACK), + [ITEM_PORT_ID] = TRANS(BACK), + [ITEM_ETH] = TRANS(ITEM_IPV4, ITEM_IPV6, ITEM_VLAN, PATTERN_COMMON), + [ITEM_VLAN] = TRANS(ITEM_IPV4, ITEM_IPV6, PATTERN_COMMON), + [ITEM_IPV4] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON), + [ITEM_IPV6] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON), + [ITEM_TCP] = TRANS(PATTERN_COMMON), + [ITEM_UDP] = TRANS(PATTERN_COMMON), + [ACTIONS] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), + [ACTION_VOID] = TRANS(BACK), + [ACTION_PORT_ID] = TRANS(ACTION_VOID, END), + [ACTION_DROP] = TRANS(ACTION_VOID, END), + [ACTION_OF_POP_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), + [ACTION_OF_PUSH_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), + [ACTION_OF_SET_VLAN_VID] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), + [ACTION_OF_SET_VLAN_PCP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), + [END] = NULL, +}; + +/** Empty masks for known item types. */ +static const union { + struct rte_flow_item_port_id port_id; + struct rte_flow_item_eth eth; + struct rte_flow_item_vlan vlan; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_udp udp; +} mlx5_nl_flow_mask_empty; + +/** Supported masks for known item types. */ +static const struct { + struct rte_flow_item_port_id port_id; + struct rte_flow_item_eth eth; + struct rte_flow_item_vlan vlan; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_udp udp; +} mlx5_nl_flow_mask_supported = { + .port_id = { + .id = 0xffffffff, + }, + .eth = { + .type = RTE_BE16(0xffff), + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + .vlan = { + /* PCP and VID only, no DEI. */ + .tci = RTE_BE16(0xefff), + .inner_type = RTE_BE16(0xffff), + }, + .ipv4.hdr = { + .next_proto_id = 0xff, + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + }, + .ipv6.hdr = { + .proto = 0xff, + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + }, + .tcp.hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + }, + .udp.hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + }, +}; + +/** + * Retrieve mask for pattern item. + * + * This function does basic sanity checks on a pattern item in order to + * return the most appropriate mask for it. + * + * @param[in] item + * Item specification. + * @param[in] mask_default + * Default mask for pattern item as specified by the flow API. + * @param[in] mask_supported + * Mask fields supported by the implementation. + * @param[in] mask_empty + * Empty mask to return when there is no specification. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * Either @p item->mask or one of the mask parameters on success, NULL + * otherwise and rte_errno is set. + */ +static const void * +mlx5_nl_flow_item_mask(const struct rte_flow_item *item, + const void *mask_default, + const void *mask_supported, + const void *mask_empty, + size_t mask_size, + struct rte_flow_error *error) +{ + const uint8_t *mask; + size_t i; + + /* item->last and item->mask cannot exist without item->spec. */ + if (!item->spec && (item->mask || item->last)) { + rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, + "\"mask\" or \"last\" field provided without a" + " corresponding \"spec\""); + return NULL; + } + /* No spec, no mask, no problem. */ + if (!item->spec) + return mask_empty; + mask = item->mask ? item->mask : mask_default; + assert(mask); + /* + * Single-pass check to make sure that: + * - Mask is supported, no bits are set outside mask_supported. + * - Both item->spec and item->last are included in mask. + */ + for (i = 0; i != mask_size; ++i) { + if (!mask[i]) + continue; + if ((mask[i] | ((const uint8_t *)mask_supported)[i]) != + ((const uint8_t *)mask_supported)[i]) { + rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask, "unsupported field found in \"mask\""); + return NULL; + } + if (item->last && + (((const uint8_t *)item->spec)[i] & mask[i]) != + (((const uint8_t *)item->last)[i] & mask[i])) { + rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_LAST, + item->last, + "range between \"spec\" and \"last\" not" + " comprised in \"mask\""); + return NULL; + } + } + return mask; +} + +/** + * Transpose flow rule description to rtnetlink message. + * + * This function transposes a flow rule description to a traffic control + * (TC) filter creation message ready to be sent over Netlink. + * + * Target interface is specified as the first entry of the @p ptoi table. + * Subsequent entries enable this function to resolve other DPDK port IDs + * found in the flow rule. + * + * @param[out] buf + * Output message buffer. May be NULL when @p size is 0. + * @param size + * Size of @p buf. Message may be truncated if not large enough. + * @param[in] ptoi + * DPDK port ID to network interface index translation table. This table + * is terminated by an entry with a zero ifindex value. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification. + * @param[in] actions + * Associated actions. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A positive value representing the exact size of the message in bytes + * regardless of the @p size parameter on success, a negative errno value + * otherwise and rte_errno is set. + */ +int +mlx5_nl_flow_transpose(void *buf, + size_t size, + const struct mlx5_nl_flow_ptoi *ptoi, + const struct rte_flow_attr *attr, + const struct rte_flow_item *pattern, + const struct rte_flow_action *actions, + struct rte_flow_error *error) +{ + alignas(struct nlmsghdr) + uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)]; + const struct rte_flow_item *item; + const struct rte_flow_action *action; + unsigned int n; + uint32_t act_index_cur; + bool in_port_id_set; + bool eth_type_set; + bool vlan_present; + bool vlan_eth_type_set; + bool ip_proto_set; + struct nlattr *na_flower; + struct nlattr *na_flower_act; + struct nlattr *na_vlan_id; + struct nlattr *na_vlan_priority; + const enum mlx5_nl_flow_trans *trans; + const enum mlx5_nl_flow_trans *back; + + if (!size) + goto error_nobufs; +init: + item = pattern; + action = actions; + n = 0; + act_index_cur = 0; + in_port_id_set = false; + eth_type_set = false; + vlan_present = false; + vlan_eth_type_set = false; + ip_proto_set = false; + na_flower = NULL; + na_flower_act = NULL; + na_vlan_id = NULL; + na_vlan_priority = NULL; + trans = TRANS(ATTR); + back = trans; +trans: + switch (trans[n++]) { + union { + const struct rte_flow_item_port_id *port_id; + const struct rte_flow_item_eth *eth; + const struct rte_flow_item_vlan *vlan; + const struct rte_flow_item_ipv4 *ipv4; + const struct rte_flow_item_ipv6 *ipv6; + const struct rte_flow_item_tcp *tcp; + const struct rte_flow_item_udp *udp; + } spec, mask; + union { + const struct rte_flow_action_port_id *port_id; + const struct rte_flow_action_of_push_vlan *of_push_vlan; + const struct rte_flow_action_of_set_vlan_vid * + of_set_vlan_vid; + const struct rte_flow_action_of_set_vlan_pcp * + of_set_vlan_pcp; + } conf; + struct nlmsghdr *nlh; + struct tcmsg *tcm; + struct nlattr *act_index; + struct nlattr *act; + unsigned int i; + + case INVALID: + if (item->type) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, "unsupported pattern item combination"); + else if (action->type) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + action, "unsupported action combination"); + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "flow rule lacks some kind of fate action"); + case BACK: + trans = back; + n = 0; + goto trans; + case ATTR: + /* + * Supported attributes: no groups, some priorities and + * ingress only. Don't care about transfer as it is the + * caller's problem. + */ + if (attr->group) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + attr, "groups are not supported"); + if (attr->priority > 0xfffe) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + attr, "lowest priority level is 0xfffe"); + if (!attr->ingress) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + attr, "only ingress is supported"); + if (attr->egress) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + attr, "egress is not supported"); + if (size < mnl_nlmsg_size(sizeof(*tcm))) + goto error_nobufs; + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = 0; + nlh->nlmsg_flags = 0; + nlh->nlmsg_seq = 0; + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = ptoi[0].ifindex; + /* + * Let kernel pick a handle by default. A predictable handle + * can be set by the caller on the resulting buffer through + * mlx5_nl_flow_brand(). + */ + tcm->tcm_handle = 0; + tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS); + /* + * Priority cannot be zero to prevent the kernel from + * picking one automatically. + */ + tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, + RTE_BE16(ETH_P_ALL)); + break; + case PATTERN: + if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower")) + goto error_nobufs; + na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS); + if (!na_flower) + goto error_nobufs; + if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS, + TCA_CLS_FLAGS_SKIP_SW)) + goto error_nobufs; + break; + case ITEM_VOID: + if (item->type != RTE_FLOW_ITEM_TYPE_VOID) + goto trans; + ++item; + break; + case ITEM_PORT_ID: + if (item->type != RTE_FLOW_ITEM_TYPE_PORT_ID) + goto trans; + mask.port_id = mlx5_nl_flow_item_mask + (item, &rte_flow_item_port_id_mask, + &mlx5_nl_flow_mask_supported.port_id, + &mlx5_nl_flow_mask_empty.port_id, + sizeof(mlx5_nl_flow_mask_supported.port_id), error); + if (!mask.port_id) + return -rte_errno; + if (mask.port_id == &mlx5_nl_flow_mask_empty.port_id) { + in_port_id_set = 1; + ++item; + break; + } + spec.port_id = item->spec; + if (mask.port_id->id && mask.port_id->id != 0xffffffff) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.port_id, + "no support for partial mask on" + " \"id\" field"); + if (!mask.port_id->id) + i = 0; + else + for (i = 0; ptoi[i].ifindex; ++i) + if (ptoi[i].port_id == spec.port_id->id) + break; + if (!ptoi[i].ifindex) + return rte_flow_error_set + (error, ENODEV, RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + spec.port_id, + "missing data to convert port ID to ifindex"); + tcm = mnl_nlmsg_get_payload(buf); + if (in_port_id_set && + ptoi[i].ifindex != (unsigned int)tcm->tcm_ifindex) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + spec.port_id, + "cannot match traffic for several port IDs" + " through a single flow rule"); + tcm->tcm_ifindex = ptoi[i].ifindex; + in_port_id_set = 1; + ++item; + break; + case ITEM_ETH: + if (item->type != RTE_FLOW_ITEM_TYPE_ETH) + goto trans; + mask.eth = mlx5_nl_flow_item_mask + (item, &rte_flow_item_eth_mask, + &mlx5_nl_flow_mask_supported.eth, + &mlx5_nl_flow_mask_empty.eth, + sizeof(mlx5_nl_flow_mask_supported.eth), error); + if (!mask.eth) + return -rte_errno; + if (mask.eth == &mlx5_nl_flow_mask_empty.eth) { + ++item; + break; + } + spec.eth = item->spec; + if (mask.eth->type && mask.eth->type != RTE_BE16(0xffff)) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.eth, + "no support for partial mask on" + " \"type\" field"); + if (mask.eth->type) { + if (!mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_ETH_TYPE, + spec.eth->type)) + goto error_nobufs; + eth_type_set = 1; + } + if ((!is_zero_ether_addr(&mask.eth->dst) && + (!mnl_attr_put_check(buf, size, + TCA_FLOWER_KEY_ETH_DST, + ETHER_ADDR_LEN, + spec.eth->dst.addr_bytes) || + !mnl_attr_put_check(buf, size, + TCA_FLOWER_KEY_ETH_DST_MASK, + ETHER_ADDR_LEN, + mask.eth->dst.addr_bytes))) || + (!is_zero_ether_addr(&mask.eth->src) && + (!mnl_attr_put_check(buf, size, + TCA_FLOWER_KEY_ETH_SRC, + ETHER_ADDR_LEN, + spec.eth->src.addr_bytes) || + !mnl_attr_put_check(buf, size, + TCA_FLOWER_KEY_ETH_SRC_MASK, + ETHER_ADDR_LEN, + mask.eth->src.addr_bytes)))) + goto error_nobufs; + ++item; + break; + case ITEM_VLAN: + if (item->type != RTE_FLOW_ITEM_TYPE_VLAN) + goto trans; + mask.vlan = mlx5_nl_flow_item_mask + (item, &rte_flow_item_vlan_mask, + &mlx5_nl_flow_mask_supported.vlan, + &mlx5_nl_flow_mask_empty.vlan, + sizeof(mlx5_nl_flow_mask_supported.vlan), error); + if (!mask.vlan) + return -rte_errno; + if (!eth_type_set && + !mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_ETH_TYPE, + RTE_BE16(ETH_P_8021Q))) + goto error_nobufs; + eth_type_set = 1; + vlan_present = 1; + if (mask.vlan == &mlx5_nl_flow_mask_empty.vlan) { + ++item; + break; + } + spec.vlan = item->spec; + if ((mask.vlan->tci & RTE_BE16(0xe000) && + (mask.vlan->tci & RTE_BE16(0xe000)) != RTE_BE16(0xe000)) || + (mask.vlan->tci & RTE_BE16(0x0fff) && + (mask.vlan->tci & RTE_BE16(0x0fff)) != RTE_BE16(0x0fff)) || + (mask.vlan->inner_type && + mask.vlan->inner_type != RTE_BE16(0xffff))) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.vlan, + "no support for partial masks on" + " \"tci\" (PCP and VID parts) and" + " \"inner_type\" fields"); + if (mask.vlan->inner_type) { + if (!mnl_attr_put_u16_check + (buf, size, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + spec.vlan->inner_type)) + goto error_nobufs; + vlan_eth_type_set = 1; + } + if ((mask.vlan->tci & RTE_BE16(0xe000) && + !mnl_attr_put_u8_check + (buf, size, TCA_FLOWER_KEY_VLAN_PRIO, + (rte_be_to_cpu_16(spec.vlan->tci) >> 13) & 0x7)) || + (mask.vlan->tci & RTE_BE16(0x0fff) && + !mnl_attr_put_u16_check + (buf, size, TCA_FLOWER_KEY_VLAN_ID, + rte_be_to_cpu_16(spec.vlan->tci & RTE_BE16(0x0fff))))) + goto error_nobufs; + ++item; + break; + case ITEM_IPV4: + if (item->type != RTE_FLOW_ITEM_TYPE_IPV4) + goto trans; + mask.ipv4 = mlx5_nl_flow_item_mask + (item, &rte_flow_item_ipv4_mask, + &mlx5_nl_flow_mask_supported.ipv4, + &mlx5_nl_flow_mask_empty.ipv4, + sizeof(mlx5_nl_flow_mask_supported.ipv4), error); + if (!mask.ipv4) + return -rte_errno; + if ((!eth_type_set || !vlan_eth_type_set) && + !mnl_attr_put_u16_check(buf, size, + vlan_present ? + TCA_FLOWER_KEY_VLAN_ETH_TYPE : + TCA_FLOWER_KEY_ETH_TYPE, + RTE_BE16(ETH_P_IP))) + goto error_nobufs; + eth_type_set = 1; + vlan_eth_type_set = 1; + if (mask.ipv4 == &mlx5_nl_flow_mask_empty.ipv4) { + ++item; + break; + } + spec.ipv4 = item->spec; + if (mask.ipv4->hdr.next_proto_id && + mask.ipv4->hdr.next_proto_id != 0xff) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.ipv4, + "no support for partial mask on" + " \"hdr.next_proto_id\" field"); + if (mask.ipv4->hdr.next_proto_id) { + if (!mnl_attr_put_u8_check + (buf, size, TCA_FLOWER_KEY_IP_PROTO, + spec.ipv4->hdr.next_proto_id)) + goto error_nobufs; + ip_proto_set = 1; + } + if ((mask.ipv4->hdr.src_addr && + (!mnl_attr_put_u32_check(buf, size, + TCA_FLOWER_KEY_IPV4_SRC, + spec.ipv4->hdr.src_addr) || + !mnl_attr_put_u32_check(buf, size, + TCA_FLOWER_KEY_IPV4_SRC_MASK, + mask.ipv4->hdr.src_addr))) || + (mask.ipv4->hdr.dst_addr && + (!mnl_attr_put_u32_check(buf, size, + TCA_FLOWER_KEY_IPV4_DST, + spec.ipv4->hdr.dst_addr) || + !mnl_attr_put_u32_check(buf, size, + TCA_FLOWER_KEY_IPV4_DST_MASK, + mask.ipv4->hdr.dst_addr)))) + goto error_nobufs; + ++item; + break; + case ITEM_IPV6: + if (item->type != RTE_FLOW_ITEM_TYPE_IPV6) + goto trans; + mask.ipv6 = mlx5_nl_flow_item_mask + (item, &rte_flow_item_ipv6_mask, + &mlx5_nl_flow_mask_supported.ipv6, + &mlx5_nl_flow_mask_empty.ipv6, + sizeof(mlx5_nl_flow_mask_supported.ipv6), error); + if (!mask.ipv6) + return -rte_errno; + if ((!eth_type_set || !vlan_eth_type_set) && + !mnl_attr_put_u16_check(buf, size, + vlan_present ? + TCA_FLOWER_KEY_VLAN_ETH_TYPE : + TCA_FLOWER_KEY_ETH_TYPE, + RTE_BE16(ETH_P_IPV6))) + goto error_nobufs; + eth_type_set = 1; + vlan_eth_type_set = 1; + if (mask.ipv6 == &mlx5_nl_flow_mask_empty.ipv6) { + ++item; + break; + } + spec.ipv6 = item->spec; + if (mask.ipv6->hdr.proto && mask.ipv6->hdr.proto != 0xff) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.ipv6, + "no support for partial mask on" + " \"hdr.proto\" field"); + if (mask.ipv6->hdr.proto) { + if (!mnl_attr_put_u8_check + (buf, size, TCA_FLOWER_KEY_IP_PROTO, + spec.ipv6->hdr.proto)) + goto error_nobufs; + ip_proto_set = 1; + } + if ((!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr) && + (!mnl_attr_put_check(buf, size, + TCA_FLOWER_KEY_IPV6_SRC, + sizeof(spec.ipv6->hdr.src_addr), + spec.ipv6->hdr.src_addr) || + !mnl_attr_put_check(buf, size, + TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(mask.ipv6->hdr.src_addr), + mask.ipv6->hdr.src_addr))) || + (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr) && + (!mnl_attr_put_check(buf, size, + TCA_FLOWER_KEY_IPV6_DST, + sizeof(spec.ipv6->hdr.dst_addr), + spec.ipv6->hdr.dst_addr) || + !mnl_attr_put_check(buf, size, + TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(mask.ipv6->hdr.dst_addr), + mask.ipv6->hdr.dst_addr)))) + goto error_nobufs; + ++item; + break; + case ITEM_TCP: + if (item->type != RTE_FLOW_ITEM_TYPE_TCP) + goto trans; + mask.tcp = mlx5_nl_flow_item_mask + (item, &rte_flow_item_tcp_mask, + &mlx5_nl_flow_mask_supported.tcp, + &mlx5_nl_flow_mask_empty.tcp, + sizeof(mlx5_nl_flow_mask_supported.tcp), error); + if (!mask.tcp) + return -rte_errno; + if (!ip_proto_set && + !mnl_attr_put_u8_check(buf, size, + TCA_FLOWER_KEY_IP_PROTO, + IPPROTO_TCP)) + goto error_nobufs; + if (mask.tcp == &mlx5_nl_flow_mask_empty.tcp) { + ++item; + break; + } + spec.tcp = item->spec; + if ((mask.tcp->hdr.src_port && + mask.tcp->hdr.src_port != RTE_BE16(0xffff)) || + (mask.tcp->hdr.dst_port && + mask.tcp->hdr.dst_port != RTE_BE16(0xffff))) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.tcp, + "no support for partial masks on" + " \"hdr.src_port\" and \"hdr.dst_port\"" + " fields"); + if ((mask.tcp->hdr.src_port && + (!mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_TCP_SRC, + spec.tcp->hdr.src_port) || + !mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_TCP_SRC_MASK, + mask.tcp->hdr.src_port))) || + (mask.tcp->hdr.dst_port && + (!mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_TCP_DST, + spec.tcp->hdr.dst_port) || + !mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_TCP_DST_MASK, + mask.tcp->hdr.dst_port)))) + goto error_nobufs; + ++item; + break; + case ITEM_UDP: + if (item->type != RTE_FLOW_ITEM_TYPE_UDP) + goto trans; + mask.udp = mlx5_nl_flow_item_mask + (item, &rte_flow_item_udp_mask, + &mlx5_nl_flow_mask_supported.udp, + &mlx5_nl_flow_mask_empty.udp, + sizeof(mlx5_nl_flow_mask_supported.udp), error); + if (!mask.udp) + return -rte_errno; + if (!ip_proto_set && + !mnl_attr_put_u8_check(buf, size, + TCA_FLOWER_KEY_IP_PROTO, + IPPROTO_UDP)) + goto error_nobufs; + if (mask.udp == &mlx5_nl_flow_mask_empty.udp) { + ++item; + break; + } + spec.udp = item->spec; + if ((mask.udp->hdr.src_port && + mask.udp->hdr.src_port != RTE_BE16(0xffff)) || + (mask.udp->hdr.dst_port && + mask.udp->hdr.dst_port != RTE_BE16(0xffff))) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.udp, + "no support for partial masks on" + " \"hdr.src_port\" and \"hdr.dst_port\"" + " fields"); + if ((mask.udp->hdr.src_port && + (!mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_UDP_SRC, + spec.udp->hdr.src_port) || + !mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_UDP_SRC_MASK, + mask.udp->hdr.src_port))) || + (mask.udp->hdr.dst_port && + (!mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_UDP_DST, + spec.udp->hdr.dst_port) || + !mnl_attr_put_u16_check(buf, size, + TCA_FLOWER_KEY_UDP_DST_MASK, + mask.udp->hdr.dst_port)))) + goto error_nobufs; + ++item; + break; + case ACTIONS: + if (item->type != RTE_FLOW_ITEM_TYPE_END) + goto trans; + assert(na_flower); + assert(!na_flower_act); + na_flower_act = + mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT); + if (!na_flower_act) + goto error_nobufs; + act_index_cur = 1; + break; + case ACTION_VOID: + if (action->type != RTE_FLOW_ACTION_TYPE_VOID) + goto trans; + ++action; + break; + case ACTION_PORT_ID: + if (action->type != RTE_FLOW_ACTION_TYPE_PORT_ID) + goto trans; + conf.port_id = action->conf; + if (conf.port_id->original) + i = 0; + else + for (i = 0; ptoi[i].ifindex; ++i) + if (ptoi[i].port_id == conf.port_id->id) + break; + if (!ptoi[i].ifindex) + return rte_flow_error_set + (error, ENODEV, RTE_FLOW_ERROR_TYPE_ACTION_CONF, + conf.port_id, + "missing data to convert port ID to ifindex"); + act_index = + mnl_attr_nest_start_check(buf, size, act_index_cur++); + if (!act_index || + !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "mirred")) + goto error_nobufs; + act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS); + if (!act) + goto error_nobufs; + if (!mnl_attr_put_check(buf, size, TCA_MIRRED_PARMS, + sizeof(struct tc_mirred), + &(struct tc_mirred){ + .action = TC_ACT_STOLEN, + .eaction = TCA_EGRESS_REDIR, + .ifindex = ptoi[i].ifindex, + })) + goto error_nobufs; + mnl_attr_nest_end(buf, act); + mnl_attr_nest_end(buf, act_index); + ++action; + break; + case ACTION_DROP: + if (action->type != RTE_FLOW_ACTION_TYPE_DROP) + goto trans; + act_index = + mnl_attr_nest_start_check(buf, size, act_index_cur++); + if (!act_index || + !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "gact")) + goto error_nobufs; + act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS); + if (!act) + goto error_nobufs; + if (!mnl_attr_put_check(buf, size, TCA_GACT_PARMS, + sizeof(struct tc_gact), + &(struct tc_gact){ + .action = TC_ACT_SHOT, + })) + goto error_nobufs; + mnl_attr_nest_end(buf, act); + mnl_attr_nest_end(buf, act_index); + ++action; + break; + case ACTION_OF_POP_VLAN: + if (action->type != RTE_FLOW_ACTION_TYPE_OF_POP_VLAN) + goto trans; + conf.of_push_vlan = NULL; + i = TCA_VLAN_ACT_POP; + goto action_of_vlan; + case ACTION_OF_PUSH_VLAN: + if (action->type != RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN) + goto trans; + conf.of_push_vlan = action->conf; + i = TCA_VLAN_ACT_PUSH; + goto action_of_vlan; + case ACTION_OF_SET_VLAN_VID: + if (action->type != RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) + goto trans; + conf.of_set_vlan_vid = action->conf; + if (na_vlan_id) + goto override_na_vlan_id; + i = TCA_VLAN_ACT_MODIFY; + goto action_of_vlan; + case ACTION_OF_SET_VLAN_PCP: + if (action->type != RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) + goto trans; + conf.of_set_vlan_pcp = action->conf; + if (na_vlan_priority) + goto override_na_vlan_priority; + i = TCA_VLAN_ACT_MODIFY; + goto action_of_vlan; +action_of_vlan: + act_index = + mnl_attr_nest_start_check(buf, size, act_index_cur++); + if (!act_index || + !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "vlan")) + goto error_nobufs; + act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS); + if (!act) + goto error_nobufs; + if (!mnl_attr_put_check(buf, size, TCA_VLAN_PARMS, + sizeof(struct tc_vlan), + &(struct tc_vlan){ + .action = TC_ACT_PIPE, + .v_action = i, + })) + goto error_nobufs; + if (i == TCA_VLAN_ACT_POP) { + mnl_attr_nest_end(buf, act); + mnl_attr_nest_end(buf, act_index); + ++action; + break; + } + if (i == TCA_VLAN_ACT_PUSH && + !mnl_attr_put_u16_check(buf, size, + TCA_VLAN_PUSH_VLAN_PROTOCOL, + conf.of_push_vlan->ethertype)) + goto error_nobufs; + na_vlan_id = mnl_nlmsg_get_payload_tail(buf); + if (!mnl_attr_put_u16_check(buf, size, TCA_VLAN_PAD, 0)) + goto error_nobufs; + na_vlan_priority = mnl_nlmsg_get_payload_tail(buf); + if (!mnl_attr_put_u8_check(buf, size, TCA_VLAN_PAD, 0)) + goto error_nobufs; + mnl_attr_nest_end(buf, act); + mnl_attr_nest_end(buf, act_index); + if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) { +override_na_vlan_id: + na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID; + *(uint16_t *)mnl_attr_get_payload(na_vlan_id) = + rte_be_to_cpu_16 + (conf.of_set_vlan_vid->vlan_vid); + } else if (action->type == + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) { +override_na_vlan_priority: + na_vlan_priority->nla_type = + TCA_VLAN_PUSH_VLAN_PRIORITY; + *(uint8_t *)mnl_attr_get_payload(na_vlan_priority) = + conf.of_set_vlan_pcp->vlan_pcp; + } + ++action; + break; + case END: + if (item->type != RTE_FLOW_ITEM_TYPE_END || + action->type != RTE_FLOW_ACTION_TYPE_END) + goto trans; + if (na_flower_act) + mnl_attr_nest_end(buf, na_flower_act); + if (na_flower) + mnl_attr_nest_end(buf, na_flower); + nlh = buf; + return nlh->nlmsg_len; + } + back = trans; + trans = mlx5_nl_flow_trans[trans[n - 1]]; + n = 0; + goto trans; +error_nobufs: + if (buf != buf_tmp) { + buf = buf_tmp; + size = sizeof(buf_tmp); + goto init; + } + return rte_flow_error_set + (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "generated TC message is too large"); +} + +/** + * Brand rtnetlink buffer with unique handle. + * + * This handle should be unique for a given network interface to avoid + * collisions. + * + * @param buf + * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). + * @param handle + * Unique 32-bit handle to use. + */ +void +mlx5_nl_flow_brand(void *buf, uint32_t handle) +{ + struct tcmsg *tcm = mnl_nlmsg_get_payload(buf); + + tcm->tcm_handle = handle; +} + +/** + * Send Netlink message with acknowledgment. + * + * @param nl + * Libmnl socket to use. + * @param nlh + * Message to send. This function always raises the NLM_F_ACK flag before + * sending. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh) +{ + alignas(struct nlmsghdr) + uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) + + nlh->nlmsg_len - sizeof(*nlh)]; + uint32_t seq = random(); + int ret; + + nlh->nlmsg_flags |= NLM_F_ACK; + nlh->nlmsg_seq = seq; + ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len); + if (ret != -1) + ret = mnl_socket_recvfrom(nl, ans, sizeof(ans)); + if (ret != -1) + ret = mnl_cb_run + (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL); + if (!ret) + return 0; + rte_errno = errno; + return -rte_errno; +} + +/** + * Create a Netlink flow rule. + * + * @param nl + * Libmnl socket to use. + * @param buf + * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_flow_create(struct mnl_socket *nl, void *buf, + struct rte_flow_error *error) +{ + struct nlmsghdr *nlh = buf; + + nlh->nlmsg_type = RTM_NEWTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + if (!mlx5_nl_flow_nl_ack(nl, nlh)) + return 0; + return rte_flow_error_set + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to create TC flow rule"); +} + +/** + * Destroy a Netlink flow rule. + * + * @param nl + * Libmnl socket to use. + * @param buf + * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf, + struct rte_flow_error *error) +{ + struct nlmsghdr *nlh = buf; + + nlh->nlmsg_type = RTM_DELTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST; + if (!mlx5_nl_flow_nl_ack(nl, nlh)) + return 0; + return rte_flow_error_set + (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to destroy TC flow rule"); +} + +/** + * Initialize ingress qdisc of a given network interface. + * + * @param nl + * Libmnl socket of the @p NETLINK_ROUTE kind. + * @param ifindex + * Index of network interface to initialize. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex, + struct rte_flow_error *error) +{ + struct nlmsghdr *nlh; + struct tcmsg *tcm; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)]; + + /* Destroy existing ingress qdisc and everything attached to it. */ + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_DELQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST; + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = ifindex; + tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); + tcm->tcm_parent = TC_H_INGRESS; + /* Ignore errors when qdisc is already absent. */ + if (mlx5_nl_flow_nl_ack(nl, nlh) && + rte_errno != EINVAL && rte_errno != ENOENT) + return rte_flow_error_set + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "netlink: failed to remove ingress qdisc"); + /* Create fresh ingress qdisc. */ + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_NEWQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = ifindex; + tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); + tcm->tcm_parent = TC_H_INGRESS; + mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress"); + if (mlx5_nl_flow_nl_ack(nl, nlh)) + return rte_flow_error_set + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "netlink: failed to create ingress qdisc"); + return 0; +} + +/** + * Create and configure a libmnl socket for Netlink flow rules. + * + * @return + * A valid libmnl socket object pointer on success, NULL otherwise and + * rte_errno is set. + */ +struct mnl_socket * +mlx5_nl_flow_socket_create(void) +{ + struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE); + + if (nl) { + mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 }, + sizeof(int)); + if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID)) + return nl; + } + rte_errno = errno; + if (nl) + mnl_socket_close(nl); + return NULL; +} + +/** + * Destroy a libmnl socket. + */ +void +mlx5_nl_flow_socket_destroy(struct mnl_socket *nl) +{ + mnl_socket_close(nl); +} diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h index 9eb9c15e..0870d32f 100644 --- a/drivers/net/mlx5/mlx5_prm.h +++ b/drivers/net/mlx5/mlx5_prm.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2016 6WIND S.A. - * Copyright 2016 Mellanox. + * Copyright 2016 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_PRM_H_ @@ -21,6 +21,9 @@ #include <rte_vect.h> #include "mlx5_autoconf.h" +/* RSS hash key size. */ +#define MLX5_RSS_HASH_KEY_LEN 40 + /* Get CQE owner bit. */ #define MLX5_CQE_OWNER(op_own) ((op_own) & MLX5_CQE_OWNER_MASK) @@ -107,6 +110,30 @@ /* Inner L4 checksum offload (Tunneled packets only). */ #define MLX5_ETH_WQE_L4_INNER_CSUM (1u << 5) +/* Outer L4 type is TCP. */ +#define MLX5_ETH_WQE_L4_OUTER_TCP (0u << 5) + +/* Outer L4 type is UDP. */ +#define MLX5_ETH_WQE_L4_OUTER_UDP (1u << 5) + +/* Outer L3 type is IPV4. */ +#define MLX5_ETH_WQE_L3_OUTER_IPV4 (0u << 4) + +/* Outer L3 type is IPV6. */ +#define MLX5_ETH_WQE_L3_OUTER_IPV6 (1u << 4) + +/* Inner L4 type is TCP. */ +#define MLX5_ETH_WQE_L4_INNER_TCP (0u << 1) + +/* Inner L4 type is UDP. */ +#define MLX5_ETH_WQE_L4_INNER_UDP (1u << 1) + +/* Inner L3 type is IPV4. */ +#define MLX5_ETH_WQE_L3_INNER_IPV4 (0u << 0) + +/* Inner L3 type is IPV6. */ +#define MLX5_ETH_WQE_L3_INNER_IPV6 (1u << 0) + /* Is flow mark valid. */ #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN #define MLX5_FLOW_MARK_IS_VALID(val) ((val) & 0xffffff00) @@ -195,13 +222,30 @@ struct mlx5_mpw { } data; }; +/* WQE for Multi-Packet RQ. */ +struct mlx5_wqe_mprq { + struct mlx5_wqe_srq_next_seg next_seg; + struct mlx5_wqe_data_seg dseg; +}; + +#define MLX5_MPRQ_LEN_MASK 0x000ffff +#define MLX5_MPRQ_LEN_SHIFT 0 +#define MLX5_MPRQ_STRIDE_NUM_MASK 0x3fff0000 +#define MLX5_MPRQ_STRIDE_NUM_SHIFT 16 +#define MLX5_MPRQ_FILLER_MASK 0x80000000 +#define MLX5_MPRQ_FILLER_SHIFT 31 + +#define MLX5_MPRQ_STRIDE_SHIFT_BYTE 2 + /* CQ element structure - should be equal to the cache line size */ struct mlx5_cqe { #if (RTE_CACHE_LINE_SIZE == 128) uint8_t padding[64]; #endif uint8_t pkt_info; - uint8_t rsvd0[11]; + uint8_t rsvd0; + uint16_t wqe_id; + uint8_t rsvd3[8]; uint32_t rx_hash_res; uint8_t rx_hash_type; uint8_t rsvd1[11]; @@ -246,7 +290,10 @@ struct mlx5_cqe { struct mlx5_mini_cqe8 { union { uint32_t rx_hash_result; - uint32_t checksum; + struct { + uint16_t checksum; + uint16_t stride_idx; + }; struct { uint16_t wqe_counter; uint8_t s_wqe_opcode; diff --git a/drivers/net/mlx5/mlx5_rss.c b/drivers/net/mlx5/mlx5_rss.c index d06b0bee..b95778a8 100644 --- a/drivers/net/mlx5/mlx5_rss.c +++ b/drivers/net/mlx5/mlx5_rss.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -35,35 +35,49 @@ * RSS configuration data. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rss_hash_update(struct rte_eth_dev *dev, struct rte_eth_rss_conf *rss_conf) { struct priv *priv = dev->data->dev_private; - int ret = 0; + unsigned int i; + unsigned int idx; - priv_lock(priv); if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) { - ret = -EINVAL; - goto out; + rte_errno = EINVAL; + return -rte_errno; } if (rss_conf->rss_key && rss_conf->rss_key_len) { + if (rss_conf->rss_key_len != MLX5_RSS_HASH_KEY_LEN) { + DRV_LOG(ERR, + "port %u RSS key len must be %s Bytes long", + dev->data->port_id, + RTE_STR(MLX5_RSS_HASH_KEY_LEN)); + rte_errno = EINVAL; + return -rte_errno; + } priv->rss_conf.rss_key = rte_realloc(priv->rss_conf.rss_key, rss_conf->rss_key_len, 0); if (!priv->rss_conf.rss_key) { - ret = -ENOMEM; - goto out; + rte_errno = ENOMEM; + return -rte_errno; } memcpy(priv->rss_conf.rss_key, rss_conf->rss_key, rss_conf->rss_key_len); priv->rss_conf.rss_key_len = rss_conf->rss_key_len; } priv->rss_conf.rss_hf = rss_conf->rss_hf; -out: - priv_unlock(priv); - return ret; + /* Enable the RSS hash in all Rx queues. */ + for (i = 0, idx = 0; idx != priv->rxqs_n; ++i) { + if (!(*priv->rxqs)[i]) + continue; + (*priv->rxqs)[i]->rss_hash = !!rss_conf->rss_hf && + !!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS); + ++idx; + } + return 0; } /** @@ -75,7 +89,7 @@ out: * RSS configuration data. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, @@ -83,9 +97,10 @@ mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, { struct priv *priv = dev->data->dev_private; - if (!rss_conf) - return -EINVAL; - priv_lock(priv); + if (!rss_conf) { + rte_errno = EINVAL; + return -rte_errno; + } if (rss_conf->rss_key && (rss_conf->rss_key_len >= priv->rss_conf.rss_key_len)) { memcpy(rss_conf->rss_key, priv->rss_conf.rss_key, @@ -93,24 +108,24 @@ mlx5_rss_hash_conf_get(struct rte_eth_dev *dev, } rss_conf->rss_key_len = priv->rss_conf.rss_key_len; rss_conf->rss_hf = priv->rss_conf.rss_hf; - priv_unlock(priv); return 0; } /** * Allocate/reallocate RETA index table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @praram reta_size * The size of the array to allocate. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_rss_reta_index_resize(struct priv *priv, unsigned int reta_size) +mlx5_rss_reta_index_resize(struct rte_eth_dev *dev, unsigned int reta_size) { + struct priv *priv = dev->data->dev_private; void *mem; unsigned int old_size = priv->reta_idx_n; @@ -119,11 +134,12 @@ priv_rss_reta_index_resize(struct priv *priv, unsigned int reta_size) mem = rte_realloc(priv->reta_idx, reta_size * sizeof((*priv->reta_idx)[0]), 0); - if (!mem) - return ENOMEM; + if (!mem) { + rte_errno = ENOMEM; + return -rte_errno; + } priv->reta_idx = mem; priv->reta_idx_n = reta_size; - if (old_size < reta_size) memset(&(*priv->reta_idx)[old_size], 0, (reta_size - old_size) * @@ -132,28 +148,31 @@ priv_rss_reta_index_resize(struct priv *priv, unsigned int reta_size) } /** - * Query RETA table. + * DPDK callback to get the RETA indirection table. * - * @param priv - * Pointer to private structure. - * @param[in, out] reta_conf - * Pointer to the first RETA configuration structure. + * @param dev + * Pointer to Ethernet device structure. + * @param reta_conf + * Pointer to RETA configuration structure array. * @param reta_size - * Number of entries. + * Size of the RETA table. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -priv_dev_rss_reta_query(struct priv *priv, +int +mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, struct rte_eth_rss_reta_entry64 *reta_conf, - unsigned int reta_size) + uint16_t reta_size) { + struct priv *priv = dev->data->dev_private; unsigned int idx; unsigned int i; - if (!reta_size || reta_size > priv->reta_idx_n) - return EINVAL; + if (!reta_size || reta_size > priv->reta_idx_n) { + rte_errno = EINVAL; + return -rte_errno; + } /* Fill each entry of the table even if its bit is not set. */ for (idx = 0, i = 0; (i != reta_size); ++i) { idx = i / RTE_RETA_GROUP_SIZE; @@ -164,34 +183,36 @@ priv_dev_rss_reta_query(struct priv *priv, } /** - * Update RETA table. + * DPDK callback to update the RETA indirection table. * - * @param priv - * Pointer to private structure. - * @param[in] reta_conf - * Pointer to the first RETA configuration structure. + * @param dev + * Pointer to Ethernet device structure. + * @param reta_conf + * Pointer to RETA configuration structure array. * @param reta_size - * Number of entries. + * Size of the RETA table. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -priv_dev_rss_reta_update(struct priv *priv, +int +mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, struct rte_eth_rss_reta_entry64 *reta_conf, - unsigned int reta_size) + uint16_t reta_size) { + int ret; + struct priv *priv = dev->data->dev_private; unsigned int idx; unsigned int i; unsigned int pos; - int ret; - if (!reta_size) - return EINVAL; - ret = priv_rss_reta_index_resize(priv, reta_size); + if (!reta_size) { + rte_errno = EINVAL; + return -rte_errno; + } + ret = mlx5_rss_reta_index_resize(dev, reta_size); if (ret) return ret; - for (idx = 0, i = 0; (i != reta_size); ++i) { idx = i / RTE_RETA_GROUP_SIZE; pos = i % RTE_RETA_GROUP_SIZE; @@ -200,63 +221,9 @@ priv_dev_rss_reta_update(struct priv *priv, assert(reta_conf[idx].reta[pos] < priv->rxqs_n); (*priv->reta_idx)[i] = reta_conf[idx].reta[pos]; } - return 0; -} - -/** - * DPDK callback to get the RETA indirection table. - * - * @param dev - * Pointer to Ethernet device structure. - * @param reta_conf - * Pointer to RETA configuration structure array. - * @param reta_size - * Size of the RETA table. - * - * @return - * 0 on success, negative errno value on failure. - */ -int -mlx5_dev_rss_reta_query(struct rte_eth_dev *dev, - struct rte_eth_rss_reta_entry64 *reta_conf, - uint16_t reta_size) -{ - int ret; - struct priv *priv = dev->data->dev_private; - - priv_lock(priv); - ret = priv_dev_rss_reta_query(priv, reta_conf, reta_size); - priv_unlock(priv); - return -ret; -} - -/** - * DPDK callback to update the RETA indirection table. - * - * @param dev - * Pointer to Ethernet device structure. - * @param reta_conf - * Pointer to RETA configuration structure array. - * @param reta_size - * Size of the RETA table. - * - * @return - * 0 on success, negative errno value on failure. - */ -int -mlx5_dev_rss_reta_update(struct rte_eth_dev *dev, - struct rte_eth_rss_reta_entry64 *reta_conf, - uint16_t reta_size) -{ - int ret; - struct priv *priv = dev->data->dev_private; - - priv_lock(priv); - ret = priv_dev_rss_reta_update(priv, reta_conf, reta_size); - priv_unlock(priv); if (dev->data->dev_started) { mlx5_dev_stop(dev); - mlx5_dev_start(dev); + return mlx5_dev_start(dev); } - return -ret; + return 0; } diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c index 4ffc869a..e74fdef8 100644 --- a/drivers/net/mlx5/mlx5_rxmode.c +++ b/drivers/net/mlx5/mlx5_rxmode.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -32,8 +32,23 @@ void mlx5_promiscuous_enable(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; + int ret; + dev->data->promiscuous = 1; - mlx5_traffic_restart(dev); + if (priv->isolated) { + DRV_LOG(WARNING, + "port %u cannot enable promiscuous mode" + " in flow isolation mode", + dev->data->port_id); + return; + } + if (priv->config.vf) + mlx5_nl_promisc(dev, 1); + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot enable promiscuous mode: %s", + dev->data->port_id, strerror(rte_errno)); } /** @@ -45,8 +60,16 @@ mlx5_promiscuous_enable(struct rte_eth_dev *dev) void mlx5_promiscuous_disable(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; + int ret; + dev->data->promiscuous = 0; - mlx5_traffic_restart(dev); + if (priv->config.vf) + mlx5_nl_promisc(dev, 0); + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot disable promiscuous mode: %s", + dev->data->port_id, strerror(rte_errno)); } /** @@ -58,8 +81,23 @@ mlx5_promiscuous_disable(struct rte_eth_dev *dev) void mlx5_allmulticast_enable(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; + int ret; + dev->data->all_multicast = 1; - mlx5_traffic_restart(dev); + if (priv->isolated) { + DRV_LOG(WARNING, + "port %u cannot enable allmulticast mode" + " in flow isolation mode", + dev->data->port_id); + return; + } + if (priv->config.vf) + mlx5_nl_allmulti(dev, 1); + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot enable allmulicast mode: %s", + dev->data->port_id, strerror(rte_errno)); } /** @@ -71,6 +109,14 @@ mlx5_allmulticast_enable(struct rte_eth_dev *dev) void mlx5_allmulticast_disable(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; + int ret; + dev->data->all_multicast = 0; - mlx5_traffic_restart(dev); + if (priv->config.vf) + mlx5_nl_allmulti(dev, 0); + ret = mlx5_traffic_restart(dev); + if (ret) + DRV_LOG(ERR, "port %u cannot disable allmulicast mode: %s", + dev->data->port_id, strerror(rte_errno)); } diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index ff58c492..1f7bfd44 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -52,10 +52,129 @@ uint8_t rss_hash_default_key[] = { }; /* Length of the default RSS hash key. */ -const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key); +static_assert(MLX5_RSS_HASH_KEY_LEN == + (unsigned int)sizeof(rss_hash_default_key), + "wrong RSS default key size."); /** - * Allocate RX queue elements. + * Check whether Multi-Packet RQ can be enabled for the device. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 1 if supported, negative errno value if not. + */ +inline int +mlx5_check_mprq_support(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + + if (priv->config.mprq.enabled && + priv->rxqs_n >= priv->config.mprq.min_rxqs_num) + return 1; + return -ENOTSUP; +} + +/** + * Check whether Multi-Packet RQ is enabled for the Rx queue. + * + * @param rxq + * Pointer to receive queue structure. + * + * @return + * 0 if disabled, otherwise enabled. + */ +inline int +mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq) +{ + return rxq->strd_num_n > 0; +} + +/** + * Check whether Multi-Packet RQ is enabled for the device. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 if disabled, otherwise enabled. + */ +inline int +mlx5_mprq_enabled(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + uint16_t i; + uint16_t n = 0; + + if (mlx5_check_mprq_support(dev) < 0) + return 0; + /* All the configured queues should be enabled. */ + for (i = 0; i < priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (!rxq) + continue; + if (mlx5_rxq_mprq_enabled(rxq)) + ++n; + } + /* Multi-Packet RQ can't be partially configured. */ + assert(n == 0 || n == priv->rxqs_n); + return n == priv->rxqs_n; +} + +/** + * Allocate RX queue elements for Multi-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + unsigned int wqe_n = 1 << rxq->elts_n; + unsigned int i; + int err; + + /* Iterate on segments. */ + for (i = 0; i <= wqe_n; ++i) { + struct mlx5_mprq_buf *buf; + + if (rte_mempool_get(rxq->mprq_mp, (void **)&buf) < 0) { + DRV_LOG(ERR, "port %u empty mbuf pool", rxq->port_id); + rte_errno = ENOMEM; + goto error; + } + if (i < wqe_n) + (*rxq->mprq_bufs)[i] = buf; + else + rxq->mprq_repl = buf; + } + DRV_LOG(DEBUG, + "port %u Rx queue %u allocated and configured %u segments", + rxq->port_id, rxq_ctrl->idx, wqe_n); + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + wqe_n = i; + for (i = 0; (i != wqe_n); ++i) { + if ((*rxq->mprq_bufs)[i] != NULL) + rte_mempool_put(rxq->mprq_mp, + (*rxq->mprq_bufs)[i]); + (*rxq->mprq_bufs)[i] = NULL; + } + DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + rxq->port_id, rxq_ctrl->idx); + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Allocate RX queue elements for Single-Packet RQ. * * @param rxq_ctrl * Pointer to RX queue structure. @@ -63,13 +182,13 @@ const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key); * @return * 0 on success, errno value on failure. */ -int -rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +static int +rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) { const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n; unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n; unsigned int i; - int ret = 0; + int err; /* Iterate on segments. */ for (i = 0; (i != elts_n); ++i) { @@ -77,8 +196,9 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp); if (buf == NULL) { - ERROR("%p: empty mbuf pool", (void *)rxq_ctrl); - ret = ENOMEM; + DRV_LOG(ERR, "port %u empty mbuf pool", + PORT_ID(rxq_ctrl->priv)); + rte_errno = ENOMEM; goto error; } /* Headroom is reserved by rte_pktmbuf_alloc(). */ @@ -97,7 +217,7 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) (*rxq_ctrl->rxq.elts)[i] = buf; } /* If Rx vector is activated. */ - if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) { + if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) { struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; struct rte_mbuf *mbuf_init = &rxq->fake_mbuf; int j; @@ -118,30 +238,78 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j) (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf; } - DEBUG("%p: allocated and configured %u segments (max %u packets)", - (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n)); - assert(ret == 0); + DRV_LOG(DEBUG, + "port %u Rx queue %u allocated and configured %u segments" + " (max %u packets)", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx, elts_n, + elts_n / (1 << rxq_ctrl->rxq.sges_n)); return 0; error: + err = rte_errno; /* Save rte_errno before cleanup. */ elts_n = i; for (i = 0; (i != elts_n); ++i) { if ((*rxq_ctrl->rxq.elts)[i] != NULL) rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); (*rxq_ctrl->rxq.elts)[i] = NULL; } - DEBUG("%p: failed, freed everything", (void *)rxq_ctrl); - assert(ret > 0); - return ret; + DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx); + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; } /** - * Free RX queue elements. + * Allocate RX queue elements. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + * + * @return + * 0 on success, errno value on failure. + */ +int +rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl); +} + +/** + * Free RX queue elements for Multi-Packet RQ. * * @param rxq_ctrl * Pointer to RX queue structure. */ static void -rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; + uint16_t i; + + DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs", + rxq->port_id, rxq_ctrl->idx); + if (rxq->mprq_bufs == NULL) + return; + assert(mlx5_rxq_check_vec_support(rxq) < 0); + for (i = 0; (i != (1u << rxq->elts_n)); ++i) { + if ((*rxq->mprq_bufs)[i] != NULL) + mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]); + (*rxq->mprq_bufs)[i] = NULL; + } + if (rxq->mprq_repl != NULL) { + mlx5_mprq_buf_free(rxq->mprq_repl); + rxq->mprq_repl = NULL; + } +} + +/** + * Free RX queue elements for Single-Packet RQ. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) { struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; const uint16_t q_n = (1 << rxq->elts_n); @@ -149,14 +317,15 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi); uint16_t i; - DEBUG("%p: freeing WRs", (void *)rxq_ctrl); + DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx); if (rxq->elts == NULL) return; /** * Some mbuf in the Ring belongs to the application. They cannot be * freed. */ - if (rxq_check_vec_support(rxq) > 0) { + if (mlx5_rxq_check_vec_support(rxq) > 0) { for (i = 0; i < used; ++i) (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL; rxq->rq_pi = rxq->rq_ci; @@ -169,6 +338,21 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) } /** + * Free RX queue elements. + * + * @param rxq_ctrl + * Pointer to RX queue structure. + */ +static void +rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) +{ + if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) + rxq_free_elts_mprq(rxq_ctrl); + else + rxq_free_elts_sprq(rxq_ctrl); +} + +/** * Clean up a RX queue. * * Destroy objects, free allocated memory and reset the structure for reuse. @@ -179,31 +363,35 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl) { - DEBUG("cleaning up %p", (void *)rxq_ctrl); + DRV_LOG(DEBUG, "port %u cleaning up Rx queue %u", + PORT_ID(rxq_ctrl->priv), rxq_ctrl->idx); if (rxq_ctrl->ibv) - mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv); + mlx5_rxq_ibv_release(rxq_ctrl->ibv); memset(rxq_ctrl, 0, sizeof(*rxq_ctrl)); } /** * Returns the per-queue supported offloads. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return * Supported Rx offloads. */ uint64_t -mlx5_priv_get_rx_queue_offloads(struct priv *priv) +mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_dev_config *config = &priv->config; uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER | DEV_RX_OFFLOAD_TIMESTAMP | DEV_RX_OFFLOAD_JUMBO_FRAME); + offloads |= DEV_RX_OFFLOAD_CRC_STRIP; if (config->hw_fcs_strip) - offloads |= DEV_RX_OFFLOAD_CRC_STRIP; + offloads |= DEV_RX_OFFLOAD_KEEP_CRC; + if (config->hw_csum) offloads |= (DEV_RX_OFFLOAD_IPV4_CKSUM | DEV_RX_OFFLOAD_UDP_CKSUM | @@ -217,13 +405,11 @@ mlx5_priv_get_rx_queue_offloads(struct priv *priv) /** * Returns the per-port supported offloads. * - * @param priv - * Pointer to private structure. * @return * Supported Rx offloads. */ uint64_t -mlx5_priv_get_rx_port_offloads(struct priv *priv __rte_unused) +mlx5_get_rx_port_offloads(void) { uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER; @@ -231,33 +417,6 @@ mlx5_priv_get_rx_port_offloads(struct priv *priv __rte_unused) } /** - * Checks if the per-queue offload configuration is valid. - * - * @param priv - * Pointer to private structure. - * @param offloads - * Per-queue offloads configuration. - * - * @return - * 1 if the configuration is valid, 0 otherwise. - */ -static int -priv_is_rx_queue_offloads_allowed(struct priv *priv, uint64_t offloads) -{ - uint64_t port_offloads = priv->dev->data->dev_conf.rxmode.offloads; - uint64_t queue_supp_offloads = - mlx5_priv_get_rx_queue_offloads(priv); - uint64_t port_supp_offloads = mlx5_priv_get_rx_port_offloads(priv); - - if ((offloads & (queue_supp_offloads | port_supp_offloads)) != - offloads) - return 0; - if (((port_offloads ^ offloads) & port_supp_offloads)) - return 0; - return 1; -} - -/** * * @param dev * Pointer to Ethernet device structure. @@ -273,7 +432,7 @@ priv_is_rx_queue_offloads_allowed(struct priv *priv, uint64_t offloads) * Memory pool for buffer allocations. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, @@ -284,53 +443,40 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx]; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); - int ret = 0; - priv_lock(priv); if (!rte_is_power_of_2(desc)) { desc = 1 << log2above(desc); - WARN("%p: increased number of descriptors in RX queue %u" - " to the next power of two (%d)", - (void *)dev, idx, desc); + DRV_LOG(WARNING, + "port %u increased number of descriptors in Rx queue %u" + " to the next power of two (%d)", + dev->data->port_id, idx, desc); } - DEBUG("%p: configuring queue %u for %u descriptors", - (void *)dev, idx, desc); + DRV_LOG(DEBUG, "port %u configuring Rx queue %u for %u descriptors", + dev->data->port_id, idx, desc); if (idx >= priv->rxqs_n) { - ERROR("%p: queue index out of range (%u >= %u)", - (void *)dev, idx, priv->rxqs_n); - priv_unlock(priv); - return -EOVERFLOW; - } - if (!priv_is_rx_queue_offloads_allowed(priv, conf->offloads)) { - ret = ENOTSUP; - ERROR("%p: Rx queue offloads 0x%" PRIx64 " don't match port " - "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64, - (void *)dev, conf->offloads, - dev->data->dev_conf.rxmode.offloads, - (mlx5_priv_get_rx_port_offloads(priv) | - mlx5_priv_get_rx_queue_offloads(priv))); - goto out; - } - if (!mlx5_priv_rxq_releasable(priv, idx)) { - ret = EBUSY; - ERROR("%p: unable to release queue index %u", - (void *)dev, idx); - goto out; - } - mlx5_priv_rxq_release(priv, idx); - rxq_ctrl = mlx5_priv_rxq_new(priv, idx, desc, socket, conf, mp); + DRV_LOG(ERR, "port %u Rx queue index out of range (%u >= %u)", + dev->data->port_id, idx, priv->rxqs_n); + rte_errno = EOVERFLOW; + return -rte_errno; + } + if (!mlx5_rxq_releasable(dev, idx)) { + DRV_LOG(ERR, "port %u unable to release queue index %u", + dev->data->port_id, idx); + rte_errno = EBUSY; + return -rte_errno; + } + mlx5_rxq_release(dev, idx); + rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp); if (!rxq_ctrl) { - ERROR("%p: unable to allocate queue index %u", - (void *)dev, idx); - ret = ENOMEM; - goto out; + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + rte_errno = ENOMEM; + return -rte_errno; } - DEBUG("%p: adding RX queue %p to list", - (void *)dev, (void *)rxq_ctrl); + DRV_LOG(DEBUG, "port %u adding Rx queue %u to list", + dev->data->port_id, idx); (*priv->rxqs)[idx] = &rxq_ctrl->rxq; -out: - priv_unlock(priv); - return -ret; + return 0; } /** @@ -350,45 +496,48 @@ mlx5_rx_queue_release(void *dpdk_rxq) return; rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); priv = rxq_ctrl->priv; - priv_lock(priv); - if (!mlx5_priv_rxq_releasable(priv, rxq_ctrl->rxq.stats.idx)) - rte_panic("Rx queue %p is still used by a flow and cannot be" - " removed\n", (void *)rxq_ctrl); - mlx5_priv_rxq_release(priv, rxq_ctrl->rxq.stats.idx); - priv_unlock(priv); + if (!mlx5_rxq_releasable(ETH_DEV(priv), rxq_ctrl->rxq.stats.idx)) + rte_panic("port %u Rx queue %u is still used by a flow and" + " cannot be removed\n", + PORT_ID(priv), rxq_ctrl->idx); + mlx5_rxq_release(ETH_DEV(priv), rxq_ctrl->rxq.stats.idx); } /** * Allocate queue vector and fill epoll fd list for Rx interrupts. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return - * 0 on success, negative on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_rx_intr_vec_enable(struct priv *priv) +mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; unsigned int rxqs_n = priv->rxqs_n; unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); unsigned int count = 0; - struct rte_intr_handle *intr_handle = priv->dev->intr_handle; + struct rte_intr_handle *intr_handle = dev->intr_handle; - if (!priv->dev->data->dev_conf.intr_conf.rxq) + if (!dev->data->dev_conf.intr_conf.rxq) return 0; - priv_rx_intr_vec_disable(priv); + mlx5_rx_intr_vec_disable(dev); intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0])); if (intr_handle->intr_vec == NULL) { - ERROR("failed to allocate memory for interrupt vector," - " Rx interrupts will not be supported"); - return -ENOMEM; + DRV_LOG(ERR, + "port %u failed to allocate memory for interrupt" + " vector, Rx interrupts will not be supported", + dev->data->port_id); + rte_errno = ENOMEM; + return -rte_errno; } intr_handle->type = RTE_INTR_HANDLE_EXT; for (i = 0; i != n; ++i) { /* This rxq ibv must not be released in this function. */ - struct mlx5_rxq_ibv *rxq_ibv = mlx5_priv_rxq_ibv_get(priv, i); + struct mlx5_rxq_ibv *rxq_ibv = mlx5_rxq_ibv_get(dev, i); int fd; int flags; int rc; @@ -402,27 +551,34 @@ priv_rx_intr_vec_enable(struct priv *priv) continue; } if (count >= RTE_MAX_RXTX_INTR_VEC_ID) { - ERROR("too many Rx queues for interrupt vector size" - " (%d), Rx interrupts cannot be enabled", - RTE_MAX_RXTX_INTR_VEC_ID); - priv_rx_intr_vec_disable(priv); - return -1; + DRV_LOG(ERR, + "port %u too many Rx queues for interrupt" + " vector size (%d), Rx interrupts cannot be" + " enabled", + dev->data->port_id, RTE_MAX_RXTX_INTR_VEC_ID); + mlx5_rx_intr_vec_disable(dev); + rte_errno = ENOMEM; + return -rte_errno; } fd = rxq_ibv->channel->fd; flags = fcntl(fd, F_GETFL); rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK); if (rc < 0) { - ERROR("failed to make Rx interrupt file descriptor" - " %d non-blocking for queue index %d", fd, i); - priv_rx_intr_vec_disable(priv); - return -1; + rte_errno = errno; + DRV_LOG(ERR, + "port %u failed to make Rx interrupt file" + " descriptor %d non-blocking for queue index" + " %d", + dev->data->port_id, fd, i); + mlx5_rx_intr_vec_disable(dev); + return -rte_errno; } intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count; intr_handle->efds[count] = fd; count++; } if (!count) - priv_rx_intr_vec_disable(priv); + mlx5_rx_intr_vec_disable(dev); else intr_handle->nb_efd = count; return 0; @@ -431,18 +587,19 @@ priv_rx_intr_vec_enable(struct priv *priv) /** * Clean up Rx interrupts handler. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. */ void -priv_rx_intr_vec_disable(struct priv *priv) +mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev) { - struct rte_intr_handle *intr_handle = priv->dev->intr_handle; + struct priv *priv = dev->data->dev_private; + struct rte_intr_handle *intr_handle = dev->intr_handle; unsigned int i; unsigned int rxqs_n = priv->rxqs_n; unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); - if (!priv->dev->data->dev_conf.intr_conf.rxq) + if (!dev->data->dev_conf.intr_conf.rxq) return; if (!intr_handle->intr_vec) goto free; @@ -459,7 +616,7 @@ priv_rx_intr_vec_disable(struct priv *priv) */ rxq_data = (*priv->rxqs)[i]; rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); - mlx5_priv_rxq_ibv_release(priv, rxq_ctrl->ibv); + mlx5_rxq_ibv_release(rxq_ctrl->ibv); } free: rte_intr_free_epoll_fd(intr_handle); @@ -490,7 +647,8 @@ mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq) doorbell = (uint64_t)doorbell_hi << 32; doorbell |= rxq->cqn; rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi); - rte_write64(rte_cpu_to_be_64(doorbell), cq_db_reg); + mlx5_uar_write64(rte_cpu_to_be_64(doorbell), + cq_db_reg, rxq->uar_lock_cq); } /** @@ -502,7 +660,7 @@ mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq) * Rx queue number. * * @return - * 0 on success, negative on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) @@ -510,31 +668,25 @@ mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) struct priv *priv = dev->data->dev_private; struct mlx5_rxq_data *rxq_data; struct mlx5_rxq_ctrl *rxq_ctrl; - int ret = 0; - priv_lock(priv); rxq_data = (*priv->rxqs)[rx_queue_id]; if (!rxq_data) { - ret = EINVAL; - goto exit; + rte_errno = EINVAL; + return -rte_errno; } rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); if (rxq_ctrl->irq) { struct mlx5_rxq_ibv *rxq_ibv; - rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id); + rxq_ibv = mlx5_rxq_ibv_get(dev, rx_queue_id); if (!rxq_ibv) { - ret = EINVAL; - goto exit; + rte_errno = EINVAL; + return -rte_errno; } mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn); - mlx5_priv_rxq_ibv_release(priv, rxq_ibv); + mlx5_rxq_ibv_release(rxq_ibv); } -exit: - priv_unlock(priv); - if (ret) - WARN("unable to arm interrupt on rx queue %d", rx_queue_id); - return -ret; + return 0; } /** @@ -546,7 +698,7 @@ exit: * Rx queue number. * * @return - * 0 on success, negative on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) @@ -557,53 +709,54 @@ mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) struct mlx5_rxq_ibv *rxq_ibv = NULL; struct ibv_cq *ev_cq; void *ev_ctx; - int ret = 0; + int ret; - priv_lock(priv); rxq_data = (*priv->rxqs)[rx_queue_id]; if (!rxq_data) { - ret = EINVAL; - goto exit; + rte_errno = EINVAL; + return -rte_errno; } rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); if (!rxq_ctrl->irq) - goto exit; - rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id); + return 0; + rxq_ibv = mlx5_rxq_ibv_get(dev, rx_queue_id); if (!rxq_ibv) { - ret = EINVAL; - goto exit; + rte_errno = EINVAL; + return -rte_errno; } ret = mlx5_glue->get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx); if (ret || ev_cq != rxq_ibv->cq) { - ret = EINVAL; + rte_errno = EINVAL; goto exit; } rxq_data->cq_arm_sn++; mlx5_glue->ack_cq_events(rxq_ibv->cq, 1); + return 0; exit: + ret = rte_errno; /* Save rte_errno before cleanup. */ if (rxq_ibv) - mlx5_priv_rxq_ibv_release(priv, rxq_ibv); - priv_unlock(priv); - if (ret) - WARN("unable to disable interrupt on rx queue %d", - rx_queue_id); - return -ret; + mlx5_rxq_ibv_release(rxq_ibv); + DRV_LOG(WARNING, "port %u unable to disable interrupt on Rx queue %d", + dev->data->port_id, rx_queue_id); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** * Create the Rx queue Verbs object. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * Queue index in DPDK Rx queue array * * @return - * The Verbs object initialised if it can be created. + * The Verbs object initialised, NULL otherwise and rte_errno is set. */ -struct mlx5_rxq_ibv* -mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) +struct mlx5_rxq_ibv * +mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx]; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); @@ -613,10 +766,16 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) struct ibv_cq_init_attr_ex ibv; struct mlx5dv_cq_init_attr mlx5; } cq; - struct ibv_wq_init_attr wq; + struct { + struct ibv_wq_init_attr ibv; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + struct mlx5dv_wq_init_attr mlx5; +#endif + } wq; struct ibv_cq_ex cq_attr; } attr; - unsigned int cqe_n = (1 << rxq_data->elts_n) - 1; + unsigned int cqe_n; + unsigned int wqe_n = 1 << rxq_data->elts_n; struct mlx5_rxq_ibv *tmpl; struct mlx5dv_cq cq_info; struct mlx5dv_rwq rwq; @@ -624,6 +783,7 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) int ret = 0; struct mlx5dv_obj obj; struct mlx5_dev_config *config = &priv->config; + const int mprq_en = mlx5_rxq_mprq_enabled(rxq_data); assert(rxq_data); assert(!rxq_ctrl->ibv); @@ -632,28 +792,26 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0, rxq_ctrl->socket); if (!tmpl) { - ERROR("%p: cannot allocate verbs resources", - (void *)rxq_ctrl); + DRV_LOG(ERR, + "port %u Rx queue %u cannot allocate verbs resources", + dev->data->port_id, rxq_ctrl->idx); + rte_errno = ENOMEM; goto error; } tmpl->rxq_ctrl = rxq_ctrl; - /* Use the entire RX mempool as the memory region. */ - tmpl->mr = priv_mr_get(priv, rxq_data->mp); - if (!tmpl->mr) { - tmpl->mr = priv_mr_new(priv, rxq_data->mp); - if (!tmpl->mr) { - ERROR("%p: MR creation failure", (void *)rxq_ctrl); - goto error; - } - } if (rxq_ctrl->irq) { tmpl->channel = mlx5_glue->create_comp_channel(priv->ctx); if (!tmpl->channel) { - ERROR("%p: Comp Channel creation failure", - (void *)rxq_ctrl); + DRV_LOG(ERR, "port %u: comp channel creation failure", + dev->data->port_id); + rte_errno = ENOMEM; goto error; } } + if (mprq_en) + cqe_n = wqe_n * (1 << rxq_data->strd_num_n) - 1; + else + cqe_n = wqe_n - 1; attr.cq.ibv = (struct ibv_cq_init_attr_ex){ .cqe = cqe_n, .channel = tmpl->channel, @@ -665,32 +823,43 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) if (config->cqe_comp && !rxq_data->hw_timestamp) { attr.cq.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE; +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + attr.cq.mlx5.cqe_comp_res_format = + mprq_en ? MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX : + MLX5DV_CQE_RES_FORMAT_HASH; +#else attr.cq.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH; +#endif /* * For vectorized Rx, it must not be doubled in order to * make cq_ci and rq_ci aligned. */ - if (rxq_check_vec_support(rxq_data) < 0) + if (mlx5_rxq_check_vec_support(rxq_data) < 0) attr.cq.ibv.cqe *= 2; } else if (config->cqe_comp && rxq_data->hw_timestamp) { - DEBUG("Rx CQE compression is disabled for HW timestamp"); + DRV_LOG(DEBUG, + "port %u Rx CQE compression is disabled for HW" + " timestamp", + dev->data->port_id); } tmpl->cq = mlx5_glue->cq_ex_to_cq (mlx5_glue->dv_create_cq(priv->ctx, &attr.cq.ibv, &attr.cq.mlx5)); if (tmpl->cq == NULL) { - ERROR("%p: CQ creation failure", (void *)rxq_ctrl); + DRV_LOG(ERR, "port %u Rx queue %u CQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; goto error; } - DEBUG("priv->device_attr.max_qp_wr is %d", - priv->device_attr.orig_attr.max_qp_wr); - DEBUG("priv->device_attr.max_sge is %d", - priv->device_attr.orig_attr.max_sge); - attr.wq = (struct ibv_wq_init_attr){ + DRV_LOG(DEBUG, "port %u priv->device_attr.max_qp_wr is %d", + dev->data->port_id, priv->device_attr.orig_attr.max_qp_wr); + DRV_LOG(DEBUG, "port %u priv->device_attr.max_sge is %d", + dev->data->port_id, priv->device_attr.orig_attr.max_sge); + attr.wq.ibv = (struct ibv_wq_init_attr){ .wq_context = NULL, /* Could be useful in the future. */ .wq_type = IBV_WQT_RQ, /* Max number of outstanding WRs. */ - .max_wr = (1 << rxq_data->elts_n) >> rxq_data->sges_n, + .max_wr = wqe_n >> rxq_data->sges_n, /* Max number of scatter/gather elements in a WR. */ .max_sge = 1 << rxq_data->sges_n, .pd = priv->pd, @@ -704,32 +873,54 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) }; /* By default, FCS (CRC) is stripped by hardware. */ if (rxq_data->crc_present) { - attr.wq.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS; - attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + attr.wq.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS; + attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; } #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING if (config->hw_padding) { - attr.wq.create_flags |= IBV_WQ_FLAG_RX_END_PADDING; - attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + attr.wq.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING; + attr.wq.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS; + } +#endif +#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT + attr.wq.mlx5 = (struct mlx5dv_wq_init_attr){ + .comp_mask = 0, + }; + if (mprq_en) { + struct mlx5dv_striding_rq_init_attr *mprq_attr = + &attr.wq.mlx5.striding_rq_attrs; + + attr.wq.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ; + *mprq_attr = (struct mlx5dv_striding_rq_init_attr){ + .single_stride_log_num_of_bytes = rxq_data->strd_sz_n, + .single_wqe_log_num_of_strides = rxq_data->strd_num_n, + .two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT, + }; } + tmpl->wq = mlx5_glue->dv_create_wq(priv->ctx, &attr.wq.ibv, + &attr.wq.mlx5); +#else + tmpl->wq = mlx5_glue->create_wq(priv->ctx, &attr.wq.ibv); #endif - tmpl->wq = mlx5_glue->create_wq(priv->ctx, &attr.wq); if (tmpl->wq == NULL) { - ERROR("%p: WQ creation failure", (void *)rxq_ctrl); + DRV_LOG(ERR, "port %u Rx queue %u WQ creation failure", + dev->data->port_id, idx); + rte_errno = ENOMEM; goto error; } /* * Make sure number of WRs*SGEs match expectations since a queue * cannot allocate more than "desc" buffers. */ - if (((int)attr.wq.max_wr != - ((1 << rxq_data->elts_n) >> rxq_data->sges_n)) || - ((int)attr.wq.max_sge != (1 << rxq_data->sges_n))) { - ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs", - (void *)rxq_ctrl, - ((1 << rxq_data->elts_n) >> rxq_data->sges_n), - (1 << rxq_data->sges_n), - attr.wq.max_wr, attr.wq.max_sge); + if (attr.wq.ibv.max_wr != (wqe_n >> rxq_data->sges_n) || + attr.wq.ibv.max_sge != (1u << rxq_data->sges_n)) { + DRV_LOG(ERR, + "port %u Rx queue %u requested %u*%u but got %u*%u" + " WRs*SGEs", + dev->data->port_id, idx, + wqe_n >> rxq_data->sges_n, (1 << rxq_data->sges_n), + attr.wq.ibv.max_wr, attr.wq.ibv.max_sge); + rte_errno = EINVAL; goto error; } /* Change queue state to ready. */ @@ -739,8 +930,10 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) }; ret = mlx5_glue->modify_wq(tmpl->wq, &mod); if (ret) { - ERROR("%p: WQ state to IBV_WQS_RDY failed", - (void *)rxq_ctrl); + DRV_LOG(ERR, + "port %u Rx queue %u WQ state to IBV_WQS_RDY failed", + dev->data->port_id, idx); + rte_errno = ret; goto error; } obj.cq.in = tmpl->cq; @@ -748,33 +941,53 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) obj.rwq.in = tmpl->wq; obj.rwq.out = &rwq; ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ); - if (ret != 0) + if (ret) { + rte_errno = ret; goto error; + } if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) { - ERROR("Wrong MLX5_CQE_SIZE environment variable value: " - "it should be set to %u", RTE_CACHE_LINE_SIZE); + DRV_LOG(ERR, + "port %u wrong MLX5_CQE_SIZE environment variable" + " value: it should be set to %u", + dev->data->port_id, RTE_CACHE_LINE_SIZE); + rte_errno = EINVAL; goto error; } /* Fill the rings. */ - rxq_data->wqes = (volatile struct mlx5_wqe_data_seg (*)[]) - (uintptr_t)rwq.buf; - for (i = 0; (i != (unsigned int)(1 << rxq_data->elts_n)); ++i) { - struct rte_mbuf *buf = (*rxq_data->elts)[i]; - volatile struct mlx5_wqe_data_seg *scat = &(*rxq_data->wqes)[i]; - + rxq_data->wqes = rwq.buf; + for (i = 0; (i != wqe_n); ++i) { + volatile struct mlx5_wqe_data_seg *scat; + uintptr_t addr; + uint32_t byte_count; + + if (mprq_en) { + struct mlx5_mprq_buf *buf = (*rxq_data->mprq_bufs)[i]; + + scat = &((volatile struct mlx5_wqe_mprq *) + rxq_data->wqes)[i].dseg; + addr = (uintptr_t)mlx5_mprq_buf_addr(buf); + byte_count = (1 << rxq_data->strd_sz_n) * + (1 << rxq_data->strd_num_n); + } else { + struct rte_mbuf *buf = (*rxq_data->elts)[i]; + + scat = &((volatile struct mlx5_wqe_data_seg *) + rxq_data->wqes)[i]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + byte_count = DATA_LEN(buf); + } /* scat->addr must be able to store a pointer. */ assert(sizeof(scat->addr) >= sizeof(uintptr_t)); *scat = (struct mlx5_wqe_data_seg){ - .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, - uintptr_t)), - .byte_count = rte_cpu_to_be_32(DATA_LEN(buf)), - .lkey = tmpl->mr->lkey, + .addr = rte_cpu_to_be_64(addr), + .byte_count = rte_cpu_to_be_32(byte_count), + .lkey = mlx5_rx_addr2mr(rxq_data, addr), }; } rxq_data->rq_db = rwq.dbrec; rxq_data->cqe_n = log2above(cq_info.cqe_cnt); rxq_data->cq_ci = 0; - rxq_data->rq_ci = 0; + rxq_data->consumed_strd = 0; rxq_data->rq_pi = 0; rxq_data->zip = (struct rxq_zip){ .ai = 0, @@ -785,43 +998,43 @@ mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx) rxq_data->cqn = cq_info.cqn; rxq_data->cq_arm_sn = 0; /* Update doorbell counter. */ - rxq_data->rq_ci = (1 << rxq_data->elts_n) >> rxq_data->sges_n; + rxq_data->rq_ci = wqe_n >> rxq_data->sges_n; rte_wmb(); *rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci); - DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl); + DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id, + idx, (void *)&tmpl); rte_atomic32_inc(&tmpl->refcnt); - DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv, - (void *)tmpl, rte_atomic32_read(&tmpl->refcnt)); LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next); priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; return tmpl; error: + ret = rte_errno; /* Save rte_errno before cleanup. */ if (tmpl->wq) claim_zero(mlx5_glue->destroy_wq(tmpl->wq)); if (tmpl->cq) claim_zero(mlx5_glue->destroy_cq(tmpl->cq)); if (tmpl->channel) claim_zero(mlx5_glue->destroy_comp_channel(tmpl->channel)); - if (tmpl->mr) - priv_mr_release(priv, tmpl->mr); priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + rte_errno = ret; /* Restore rte_errno. */ return NULL; } /** * Get an Rx queue Verbs object. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * Queue index in DPDK Rx queue array * * @return * The Verbs object if it exists. */ -struct mlx5_rxq_ibv* -mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx) +struct mlx5_rxq_ibv * +mlx5_rxq_ibv_get(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx]; struct mlx5_rxq_ctrl *rxq_ctrl; @@ -831,11 +1044,7 @@ mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx) return NULL; rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); if (rxq_ctrl->ibv) { - priv_mr_get(priv, rxq_data->mp); rte_atomic32_inc(&rxq_ctrl->ibv->refcnt); - DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv, - (void *)rxq_ctrl->ibv, - rte_atomic32_read(&rxq_ctrl->ibv->refcnt)); } return rxq_ctrl->ibv; } @@ -843,28 +1052,18 @@ mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx) /** * Release an Rx verbs queue object. * - * @param priv - * Pointer to private structure. * @param rxq_ibv * Verbs Rx queue object. * * @return - * 0 on success, errno value on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv) +mlx5_rxq_ibv_release(struct mlx5_rxq_ibv *rxq_ibv) { - int ret; - assert(rxq_ibv); assert(rxq_ibv->wq); assert(rxq_ibv->cq); - assert(rxq_ibv->mr); - ret = priv_mr_release(priv, rxq_ibv->mr); - if (!ret) - rxq_ibv->mr = NULL; - DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv, - (void *)rxq_ibv, rte_atomic32_read(&rxq_ibv->refcnt)); if (rte_atomic32_dec_and_test(&rxq_ibv->refcnt)) { rxq_free_elts(rxq_ibv->rxq_ctrl); claim_zero(mlx5_glue->destroy_wq(rxq_ibv->wq)); @@ -876,26 +1075,28 @@ mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv) rte_free(rxq_ibv); return 0; } - return EBUSY; + return 1; } /** * Verify the Verbs Rx queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_rxq_ibv_verify(struct priv *priv) +mlx5_rxq_ibv_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; int ret = 0; struct mlx5_rxq_ibv *rxq_ibv; LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) { - DEBUG("%p: Verbs Rx queue %p still referenced", (void *)priv, - (void *)rxq_ibv); + DRV_LOG(DEBUG, "port %u Verbs Rx queue %u still referenced", + dev->data->port_id, rxq_ibv->rxq_ctrl->idx); ++ret; } return ret; @@ -904,65 +1105,279 @@ mlx5_priv_rxq_ibv_verify(struct priv *priv) /** * Return true if a single reference exists on the object. * - * @param priv - * Pointer to private structure. * @param rxq_ibv * Verbs Rx queue object. */ int -mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv) +mlx5_rxq_ibv_releasable(struct mlx5_rxq_ibv *rxq_ibv) { - (void)priv; assert(rxq_ibv); return (rte_atomic32_read(&rxq_ibv->refcnt) == 1); } /** + * Callback function to initialize mbufs for Multi-Packet RQ. + */ +static inline void +mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg __rte_unused, + void *_m, unsigned int i __rte_unused) +{ + struct mlx5_mprq_buf *buf = _m; + + memset(_m, 0, sizeof(*buf)); + buf->mp = mp; + rte_atomic16_set(&buf->refcnt, 1); +} + +/** + * Free mempool of Multi-Packet RQ. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, negative errno value on failure. + */ +int +mlx5_mprq_free_mp(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct rte_mempool *mp = priv->mprq_mp; + unsigned int i; + + if (mp == NULL) + return 0; + DRV_LOG(DEBUG, "port %u freeing mempool (%s) for Multi-Packet RQ", + dev->data->port_id, mp->name); + /* + * If a buffer in the pool has been externally attached to a mbuf and it + * is still in use by application, destroying the Rx qeueue can spoil + * the packet. It is unlikely to happen but if application dynamically + * creates and destroys with holding Rx packets, this can happen. + * + * TODO: It is unavoidable for now because the mempool for Multi-Packet + * RQ isn't provided by application but managed by PMD. + */ + if (!rte_mempool_full(mp)) { + DRV_LOG(ERR, + "port %u mempool for Multi-Packet RQ is still in use", + dev->data->port_id); + rte_errno = EBUSY; + return -rte_errno; + } + rte_mempool_free(mp); + /* Unset mempool for each Rx queue. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + rxq->mprq_mp = NULL; + } + return 0; +} + +/** + * Allocate a mempool for Multi-Packet RQ. All configured Rx queues share the + * mempool. If already allocated, reuse it if there're enough elements. + * Otherwise, resize it. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * 0 on success, negative errno value on failure. + */ +int +mlx5_mprq_alloc_mp(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct rte_mempool *mp = priv->mprq_mp; + char name[RTE_MEMPOOL_NAMESIZE]; + unsigned int desc = 0; + unsigned int buf_len; + unsigned int obj_num; + unsigned int obj_size; + unsigned int strd_num_n = 0; + unsigned int strd_sz_n = 0; + unsigned int i; + + if (!mlx5_mprq_enabled(dev)) + return 0; + /* Count the total number of descriptors configured. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + desc += 1 << rxq->elts_n; + /* Get the max number of strides. */ + if (strd_num_n < rxq->strd_num_n) + strd_num_n = rxq->strd_num_n; + /* Get the max size of a stride. */ + if (strd_sz_n < rxq->strd_sz_n) + strd_sz_n = rxq->strd_sz_n; + } + assert(strd_num_n && strd_sz_n); + buf_len = (1 << strd_num_n) * (1 << strd_sz_n); + obj_size = buf_len + sizeof(struct mlx5_mprq_buf); + /* + * Received packets can be either memcpy'd or externally referenced. In + * case that the packet is attached to an mbuf as an external buffer, as + * it isn't possible to predict how the buffers will be queued by + * application, there's no option to exactly pre-allocate needed buffers + * in advance but to speculatively prepares enough buffers. + * + * In the data path, if this Mempool is depleted, PMD will try to memcpy + * received packets to buffers provided by application (rxq->mp) until + * this Mempool gets available again. + */ + desc *= 4; + obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * priv->rxqs_n; + /* + * rte_mempool_create_empty() has sanity check to refuse large cache + * size compared to the number of elements. + * CACHE_FLUSHTHRESH_MULTIPLIER is defined in a C file, so using a + * constant number 2 instead. + */ + obj_num = RTE_MAX(obj_num, MLX5_MPRQ_MP_CACHE_SZ * 2); + /* Check a mempool is already allocated and if it can be resued. */ + if (mp != NULL && mp->elt_size >= obj_size && mp->size >= obj_num) { + DRV_LOG(DEBUG, "port %u mempool %s is being reused", + dev->data->port_id, mp->name); + /* Reuse. */ + goto exit; + } else if (mp != NULL) { + DRV_LOG(DEBUG, "port %u mempool %s should be resized, freeing it", + dev->data->port_id, mp->name); + /* + * If failed to free, which means it may be still in use, no way + * but to keep using the existing one. On buffer underrun, + * packets will be memcpy'd instead of external buffer + * attachment. + */ + if (mlx5_mprq_free_mp(dev)) { + if (mp->elt_size >= obj_size) + goto exit; + else + return -rte_errno; + } + } + snprintf(name, sizeof(name), "%s-mprq", dev->device->name); + mp = rte_mempool_create(name, obj_num, obj_size, MLX5_MPRQ_MP_CACHE_SZ, + 0, NULL, NULL, mlx5_mprq_buf_init, NULL, + dev->device->numa_node, 0); + if (mp == NULL) { + DRV_LOG(ERR, + "port %u failed to allocate a mempool for" + " Multi-Packet RQ, count=%u, size=%u", + dev->data->port_id, obj_num, obj_size); + rte_errno = ENOMEM; + return -rte_errno; + } + priv->mprq_mp = mp; +exit: + /* Set mempool for each Rx queue. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; + + if (rxq == NULL) + continue; + rxq->mprq_mp = mp; + } + DRV_LOG(INFO, "port %u Multi-Packet RQ is configured", + dev->data->port_id); + return 0; +} + +/** * Create a DPDK Rx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx - * TX queue index. + * RX queue index. * @param desc * Number of descriptors to configure in queue. * @param socket * NUMA socket on which memory must be allocated. * * @return - * A DPDK queue object on success. + * A DPDK queue object on success, NULL otherwise and rte_errno is set. */ -struct mlx5_rxq_ctrl* -mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc, - unsigned int socket, const struct rte_eth_rxconf *conf, - struct rte_mempool *mp) +struct mlx5_rxq_ctrl * +mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp) { - struct rte_eth_dev *dev = priv->dev; + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *tmpl; unsigned int mb_len = rte_pktmbuf_data_room_size(mp); + unsigned int mprq_stride_size; struct mlx5_dev_config *config = &priv->config; /* * Always allocate extra slots, even if eventually * the vector Rx will not be used. */ - const uint16_t desc_n = + uint16_t desc_n = desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP; + uint64_t offloads = conf->offloads | + dev->data->dev_conf.rxmode.offloads; + const int mprq_en = mlx5_check_mprq_support(dev) > 0; tmpl = rte_calloc_socket("RXQ", 1, sizeof(*tmpl) + desc_n * sizeof(struct rte_mbuf *), 0, socket); - if (!tmpl) + if (!tmpl) { + rte_errno = ENOMEM; return NULL; + } + if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh, + MLX5_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } tmpl->socket = socket; - if (priv->dev->data->dev_conf.intr_conf.rxq) + if (dev->data->dev_conf.intr_conf.rxq) tmpl->irq = 1; - /* Enable scattered packets support for this queue if necessary. */ + /* + * This Rx queue can be configured as a Multi-Packet RQ if all of the + * following conditions are met: + * - MPRQ is enabled. + * - The number of descs is more than the number of strides. + * - max_rx_pkt_len plus overhead is less than the max size of a + * stride. + * Otherwise, enable Rx scatter if necessary. + */ assert(mb_len >= RTE_PKTMBUF_HEADROOM); - if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= - (mb_len - RTE_PKTMBUF_HEADROOM)) { + mprq_stride_size = + dev->data->dev_conf.rxmode.max_rx_pkt_len + + sizeof(struct rte_mbuf_ext_shared_info) + + RTE_PKTMBUF_HEADROOM; + if (mprq_en && + desc > (1U << config->mprq.stride_num_n) && + mprq_stride_size <= (1U << config->mprq.max_stride_size_n)) { + /* TODO: Rx scatter isn't supported yet. */ + tmpl->rxq.sges_n = 0; + /* Trim the number of descs needed. */ + desc >>= config->mprq.stride_num_n; + tmpl->rxq.strd_num_n = config->mprq.stride_num_n; + tmpl->rxq.strd_sz_n = RTE_MAX(log2above(mprq_stride_size), + config->mprq.min_stride_size_n); + tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT; + tmpl->rxq.mprq_max_memcpy_len = + RTE_MIN(mb_len - RTE_PKTMBUF_HEADROOM, + config->mprq.max_memcpy_len); + DRV_LOG(DEBUG, + "port %u Rx queue %u: Multi-Packet RQ is enabled" + " strd_num_n = %u, strd_sz_n = %u", + dev->data->port_id, idx, + tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n); + } else if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= + (mb_len - RTE_PKTMBUF_HEADROOM)) { tmpl->rxq.sges_n = 0; - } else if (conf->offloads & DEV_RX_OFFLOAD_SCATTER) { + } else if (offloads & DEV_RX_OFFLOAD_SCATTER) { unsigned int size = RTE_PKTMBUF_HEADROOM + dev->data->dev_conf.rxmode.max_rx_pkt_len; @@ -978,57 +1393,71 @@ mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc, size = mb_len * (1 << tmpl->rxq.sges_n); size -= RTE_PKTMBUF_HEADROOM; if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) { - ERROR("%p: too many SGEs (%u) needed to handle" - " requested maximum packet size %u", - (void *)dev, - 1 << sges_n, - dev->data->dev_conf.rxmode.max_rx_pkt_len); + DRV_LOG(ERR, + "port %u too many SGEs (%u) needed to handle" + " requested maximum packet size %u", + dev->data->port_id, + 1 << sges_n, + dev->data->dev_conf.rxmode.max_rx_pkt_len); + rte_errno = EOVERFLOW; goto error; } } else { - WARN("%p: the requested maximum Rx packet size (%u) is" - " larger than a single mbuf (%u) and scattered" - " mode has not been requested", - (void *)dev, - dev->data->dev_conf.rxmode.max_rx_pkt_len, - mb_len - RTE_PKTMBUF_HEADROOM); - } - DEBUG("%p: maximum number of segments per packet: %u", - (void *)dev, 1 << tmpl->rxq.sges_n); + DRV_LOG(WARNING, + "port %u the requested maximum Rx packet size (%u) is" + " larger than a single mbuf (%u) and scattered mode has" + " not been requested", + dev->data->port_id, + dev->data->dev_conf.rxmode.max_rx_pkt_len, + mb_len - RTE_PKTMBUF_HEADROOM); + } + if (mprq_en && !mlx5_rxq_mprq_enabled(&tmpl->rxq)) + DRV_LOG(WARNING, + "port %u MPRQ is requested but cannot be enabled" + " (requested: desc = %u, stride_sz = %u," + " supported: min_stride_num = %u, max_stride_sz = %u).", + dev->data->port_id, desc, mprq_stride_size, + (1 << config->mprq.stride_num_n), + (1 << config->mprq.max_stride_size_n)); + DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u", + dev->data->port_id, 1 << tmpl->rxq.sges_n); if (desc % (1 << tmpl->rxq.sges_n)) { - ERROR("%p: number of RX queue descriptors (%u) is not a" - " multiple of SGEs per packet (%u)", - (void *)dev, - desc, - 1 << tmpl->rxq.sges_n); + DRV_LOG(ERR, + "port %u number of Rx queue descriptors (%u) is not a" + " multiple of SGEs per packet (%u)", + dev->data->port_id, + desc, + 1 << tmpl->rxq.sges_n); + rte_errno = EINVAL; goto error; } /* Toggle RX checksum offload if hardware supports it. */ - tmpl->rxq.csum = !!(conf->offloads & DEV_RX_OFFLOAD_CHECKSUM); - tmpl->rxq.csum_l2tun = (!!(conf->offloads & DEV_RX_OFFLOAD_CHECKSUM) && - priv->config.hw_csum_l2tun); - tmpl->rxq.hw_timestamp = !!(conf->offloads & DEV_RX_OFFLOAD_TIMESTAMP); + tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM); + tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP); /* Configure VLAN stripping. */ - tmpl->rxq.vlan_strip = !!(conf->offloads & DEV_RX_OFFLOAD_VLAN_STRIP); + tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP); /* By default, FCS (CRC) is stripped by hardware. */ - if (conf->offloads & DEV_RX_OFFLOAD_CRC_STRIP) { - tmpl->rxq.crc_present = 0; - } else if (config->hw_fcs_strip) { - tmpl->rxq.crc_present = 1; - } else { - WARN("%p: CRC stripping has been disabled but will still" - " be performed by hardware, make sure MLNX_OFED and" - " firmware are up to date", - (void *)dev); - tmpl->rxq.crc_present = 0; - } - DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from" - " incoming frames to hide it", - (void *)dev, - tmpl->rxq.crc_present ? "disabled" : "enabled", - tmpl->rxq.crc_present << 2); + tmpl->rxq.crc_present = 0; + if (rte_eth_dev_must_keep_crc(offloads)) { + if (config->hw_fcs_strip) { + tmpl->rxq.crc_present = 1; + } else { + DRV_LOG(WARNING, + "port %u CRC stripping has been disabled but will" + " still be performed by hardware, make sure MLNX_OFED" + " and firmware are up to date", + dev->data->port_id); + } + } + DRV_LOG(DEBUG, + "port %u CRC stripping is %s, %u bytes will be subtracted from" + " incoming frames to hide it", + dev->data->port_id, + tmpl->rxq.crc_present ? "disabled" : "enabled", + tmpl->rxq.crc_present << 2); /* Save port ID. */ - tmpl->rxq.rss_hash = priv->rxqs_n > 1; + tmpl->rxq.rss_hash = !!priv->rss_conf.rss_hf && + (!!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS)); tmpl->rxq.port_id = dev->data->port_id; tmpl->priv = priv; tmpl->rxq.mp = mp; @@ -1036,9 +1465,11 @@ mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc, tmpl->rxq.elts_n = log2above(desc); tmpl->rxq.elts = (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1); +#ifndef RTE_ARCH_64 + tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq; +#endif + tmpl->idx = idx; rte_atomic32_inc(&tmpl->refcnt); - DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv, - (void *)tmpl, rte_atomic32_read(&tmpl->refcnt)); LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next); return tmpl; error: @@ -1049,28 +1480,26 @@ error: /** * Get a Rx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return - * A pointer to the queue if it exists. + * A pointer to the queue if it exists, NULL otherwise. */ -struct mlx5_rxq_ctrl* -mlx5_priv_rxq_get(struct priv *priv, uint16_t idx) +struct mlx5_rxq_ctrl * +mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *rxq_ctrl = NULL; if ((*priv->rxqs)[idx]) { rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); - - mlx5_priv_rxq_ibv_get(priv, idx); + mlx5_rxq_ibv_get(dev, idx); rte_atomic32_inc(&rxq_ctrl->refcnt); - DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv, - (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt)); } return rxq_ctrl; } @@ -1078,59 +1507,58 @@ mlx5_priv_rxq_get(struct priv *priv, uint16_t idx) /** * Release a Rx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return - * 0 on success, errno value on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_rxq_release(struct priv *priv, uint16_t idx) +mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *rxq_ctrl; if (!(*priv->rxqs)[idx]) return 0; rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); assert(rxq_ctrl->priv); - if (rxq_ctrl->ibv) { - int ret; - - ret = mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv); - if (!ret) - rxq_ctrl->ibv = NULL; - } - DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv, - (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt)); + if (rxq_ctrl->ibv && !mlx5_rxq_ibv_release(rxq_ctrl->ibv)) + rxq_ctrl->ibv = NULL; if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) { + mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh); LIST_REMOVE(rxq_ctrl, next); rte_free(rxq_ctrl); (*priv->rxqs)[idx] = NULL; return 0; } - return EBUSY; + return 1; } /** * Verify if the queue can be released. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return - * 1 if the queue can be released. + * 1 if the queue can be released, negative errno otherwise and rte_errno is + * set. */ int -mlx5_priv_rxq_releasable(struct priv *priv, uint16_t idx) +mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *rxq_ctrl; - if (!(*priv->rxqs)[idx]) - return -1; + if (!(*priv->rxqs)[idx]) { + rte_errno = EINVAL; + return -rte_errno; + } rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq); return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1); } @@ -1138,20 +1566,22 @@ mlx5_priv_rxq_releasable(struct priv *priv, uint16_t idx) /** * Verify the Rx Queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_rxq_verify(struct priv *priv) +mlx5_rxq_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *rxq_ctrl; int ret = 0; LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) { - DEBUG("%p: Rx Queue %p still referenced", (void *)priv, - (void *)rxq_ctrl); + DRV_LOG(DEBUG, "port %u Rx Queue %u still referenced", + dev->data->port_id, rxq_ctrl->idx); ++ret; } return ret; @@ -1160,20 +1590,21 @@ mlx5_priv_rxq_verify(struct priv *priv) /** * Create an indirection table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param queues * Queues entering in the indirection table. * @param queues_n * Number of queues in the array. * * @return - * A new indirection table. + * The Verbs object initialised, NULL otherwise and rte_errno is set. */ -struct mlx5_ind_table_ibv* -mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[], - uint16_t queues_n) +struct mlx5_ind_table_ibv * +mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues, + uint32_t queues_n) { + struct priv *priv = dev->data->dev_private; struct mlx5_ind_table_ibv *ind_tbl; const unsigned int wq_n = rte_is_power_of_2(queues_n) ? log2above(queues_n) : @@ -1184,11 +1615,12 @@ mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[], ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) + queues_n * sizeof(uint16_t), 0); - if (!ind_tbl) + if (!ind_tbl) { + rte_errno = ENOMEM; return NULL; + } for (i = 0; i != queues_n; ++i) { - struct mlx5_rxq_ctrl *rxq = - mlx5_priv_rxq_get(priv, queues[i]); + struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev, queues[i]); if (!rxq) goto error; @@ -1206,24 +1638,24 @@ mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[], .ind_tbl = wq, .comp_mask = 0, }); - if (!ind_tbl->ind_table) + if (!ind_tbl->ind_table) { + rte_errno = errno; goto error; + } rte_atomic32_inc(&ind_tbl->refcnt); LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next); - DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv, - (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt)); return ind_tbl; error: rte_free(ind_tbl); - DEBUG("%p cannot create indirection table", (void *)priv); + DEBUG("port %u cannot create indirection table", dev->data->port_id); return NULL; } /** * Get an indirection table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param queues * Queues entering in the indirection table. * @param queues_n @@ -1232,10 +1664,11 @@ error: * @return * An indirection table if found. */ -struct mlx5_ind_table_ibv* -mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[], - uint16_t queues_n) +struct mlx5_ind_table_ibv * +mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues, + uint32_t queues_n) { + struct priv *priv = dev->data->dev_private; struct mlx5_ind_table_ibv *ind_tbl; LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) { @@ -1249,10 +1682,8 @@ mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[], unsigned int i; rte_atomic32_inc(&ind_tbl->refcnt); - DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv, - (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt)); for (i = 0; i != ind_tbl->queues_n; ++i) - mlx5_priv_rxq_get(priv, ind_tbl->queues[i]); + mlx5_rxq_get(dev, ind_tbl->queues[i]); } return ind_tbl; } @@ -1260,52 +1691,53 @@ mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[], /** * Release an indirection table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param ind_table * Indirection table to release. * * @return - * 0 on success, errno value on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_ind_table_ibv_release(struct priv *priv, - struct mlx5_ind_table_ibv *ind_tbl) +mlx5_ind_table_ibv_release(struct rte_eth_dev *dev, + struct mlx5_ind_table_ibv *ind_tbl) { unsigned int i; - DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv, - (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt)); if (rte_atomic32_dec_and_test(&ind_tbl->refcnt)) claim_zero(mlx5_glue->destroy_rwq_ind_table (ind_tbl->ind_table)); for (i = 0; i != ind_tbl->queues_n; ++i) - claim_nonzero(mlx5_priv_rxq_release(priv, ind_tbl->queues[i])); + claim_nonzero(mlx5_rxq_release(dev, ind_tbl->queues[i])); if (!rte_atomic32_read(&ind_tbl->refcnt)) { LIST_REMOVE(ind_tbl, next); rte_free(ind_tbl); return 0; } - return EBUSY; + return 1; } /** * Verify the Rx Queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_ind_table_ibv_verify(struct priv *priv) +mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_ind_table_ibv *ind_tbl; int ret = 0; LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) { - DEBUG("%p: Verbs indirection table %p still referenced", - (void *)priv, (void *)ind_tbl); + DRV_LOG(DEBUG, + "port %u Verbs indirection table %p still referenced", + dev->data->port_id, (void *)ind_tbl); ++ret; } return ret; @@ -1314,8 +1746,8 @@ mlx5_priv_ind_table_ibv_verify(struct priv *priv) /** * Create an Rx Hash queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param rss_key * RSS key for the Rx hash queue. * @param rss_key_len @@ -1329,22 +1761,60 @@ mlx5_priv_ind_table_ibv_verify(struct priv *priv) * Number of queues. * * @return - * An hash Rx queue on success. + * The Verbs object initialised, NULL otherwise and rte_errno is set. */ -struct mlx5_hrxq* -mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, - uint64_t hash_fields, uint16_t queues[], uint16_t queues_n) +struct mlx5_hrxq * +mlx5_hrxq_new(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n, + int tunnel __rte_unused) { + struct priv *priv = dev->data->dev_private; struct mlx5_hrxq *hrxq; struct mlx5_ind_table_ibv *ind_tbl; struct ibv_qp *qp; + int err; queues_n = hash_fields ? queues_n : 1; - ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n); - if (!ind_tbl) - ind_tbl = mlx5_priv_ind_table_ibv_new(priv, queues, queues_n); + ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n); if (!ind_tbl) + ind_tbl = mlx5_ind_table_ibv_new(dev, queues, queues_n); + if (!ind_tbl) { + rte_errno = ENOMEM; return NULL; + } + if (!rss_key_len) { + rss_key_len = MLX5_RSS_HASH_KEY_LEN; + rss_key = rss_hash_default_key; + } +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + qp = mlx5_glue->dv_create_qp + (priv->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = rss_key_len ? rss_key_len : + MLX5_RSS_HASH_KEY_LEN, + .rx_hash_key = rss_key ? + (void *)(uintptr_t)rss_key : + rss_hash_default_key, + .rx_hash_fields_mask = hash_fields, + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->pd, + }, + &(struct mlx5dv_qp_init_attr){ + .comp_mask = tunnel ? + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS : 0, + .create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS, + }); +#else qp = mlx5_glue->create_qp_ex (priv->ctx, &(struct ibv_qp_init_attr_ex){ @@ -1355,15 +1825,21 @@ mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, IBV_QP_INIT_ATTR_RX_HASH, .rx_hash_conf = (struct ibv_rx_hash_conf){ .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, - .rx_hash_key_len = rss_key_len, - .rx_hash_key = rss_key, + .rx_hash_key_len = rss_key_len ? rss_key_len : + MLX5_RSS_HASH_KEY_LEN, + .rx_hash_key = rss_key ? + (void *)(uintptr_t)rss_key : + rss_hash_default_key, .rx_hash_fields_mask = hash_fields, }, .rwq_ind_tbl = ind_tbl->ind_table, .pd = priv->pd, }); - if (!qp) +#endif + if (!qp) { + rte_errno = errno; goto error; + } hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0); if (!hrxq) goto error; @@ -1374,21 +1850,21 @@ mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, memcpy(hrxq->rss_key, rss_key, rss_key_len); rte_atomic32_inc(&hrxq->refcnt); LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next); - DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv, - (void *)hrxq, rte_atomic32_read(&hrxq->refcnt)); return hrxq; error: - mlx5_priv_ind_table_ibv_release(priv, ind_tbl); + err = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_ind_table_ibv_release(dev, ind_tbl); if (qp) claim_zero(mlx5_glue->destroy_qp(qp)); + rte_errno = err; /* Restore rte_errno. */ return NULL; } /** * Get an Rx Hash queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param rss_conf * RSS configuration for the Rx hash queue. * @param queues @@ -1400,10 +1876,13 @@ error: * @return * An hash Rx queue on success. */ -struct mlx5_hrxq* -mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, - uint64_t hash_fields, uint16_t queues[], uint16_t queues_n) +struct mlx5_hrxq * +mlx5_hrxq_get(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n) { + struct priv *priv = dev->data->dev_private; struct mlx5_hrxq *hrxq; queues_n = hash_fields ? queues_n : 1; @@ -1416,16 +1895,14 @@ mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, continue; if (hrxq->hash_fields != hash_fields) continue; - ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n); + ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n); if (!ind_tbl) continue; if (ind_tbl != hrxq->ind_table) { - mlx5_priv_ind_table_ibv_release(priv, ind_tbl); + mlx5_ind_table_ibv_release(dev, ind_tbl); continue; } rte_atomic32_inc(&hrxq->refcnt); - DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv, - (void *)hrxq, rte_atomic32_read(&hrxq->refcnt)); return hrxq; } return NULL; @@ -1434,48 +1911,281 @@ mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len, /** * Release the hash Rx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param hrxq * Pointer to Hash Rx queue to release. * * @return - * 0 on success, errno value on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_hrxq_release(struct priv *priv, struct mlx5_hrxq *hrxq) +mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq) { - DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv, - (void *)hrxq, rte_atomic32_read(&hrxq->refcnt)); if (rte_atomic32_dec_and_test(&hrxq->refcnt)) { claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); - mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table); + mlx5_ind_table_ibv_release(dev, hrxq->ind_table); LIST_REMOVE(hrxq, next); rte_free(hrxq); return 0; } - claim_nonzero(mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table)); - return EBUSY; + claim_nonzero(mlx5_ind_table_ibv_release(dev, hrxq->ind_table)); + return 1; } /** * Verify the Rx Queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_hrxq_ibv_verify(struct priv *priv) +mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_hrxq *hrxq; int ret = 0; LIST_FOREACH(hrxq, &priv->hrxqs, next) { - DEBUG("%p: Verbs Hash Rx queue %p still referenced", - (void *)priv, (void *)hrxq); + DRV_LOG(DEBUG, + "port %u Verbs hash Rx queue %p still referenced", + dev->data->port_id, (void *)hrxq); ++ret; } return ret; } + +/** + * Create a drop Rx queue Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +struct mlx5_rxq_ibv * +mlx5_rxq_ibv_drop_new(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct ibv_cq *cq; + struct ibv_wq *wq = NULL; + struct mlx5_rxq_ibv *rxq; + + if (priv->drop_queue.rxq) + return priv->drop_queue.rxq; + cq = mlx5_glue->create_cq(priv->ctx, 1, NULL, NULL, 0); + if (!cq) { + DEBUG("port %u cannot allocate CQ for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + wq = mlx5_glue->create_wq(priv->ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = 1, + .max_sge = 1, + .pd = priv->pd, + .cq = cq, + }); + if (!wq) { + DEBUG("port %u cannot allocate WQ for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + rxq = rte_calloc(__func__, 1, sizeof(*rxq), 0); + if (!rxq) { + DEBUG("port %u cannot allocate drop Rx queue memory", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + rxq->cq = cq; + rxq->wq = wq; + priv->drop_queue.rxq = rxq; + return rxq; +error: + if (wq) + claim_zero(mlx5_glue->destroy_wq(wq)); + if (cq) + claim_zero(mlx5_glue->destroy_cq(cq)); + return NULL; +} + +/** + * Release a drop Rx queue Verbs object. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +void +mlx5_rxq_ibv_drop_release(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_rxq_ibv *rxq = priv->drop_queue.rxq; + + if (rxq->wq) + claim_zero(mlx5_glue->destroy_wq(rxq->wq)); + if (rxq->cq) + claim_zero(mlx5_glue->destroy_cq(rxq->cq)); + rte_free(rxq); + priv->drop_queue.rxq = NULL; +} + +/** + * Create a drop indirection table. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +struct mlx5_ind_table_ibv * +mlx5_ind_table_ibv_drop_new(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_ind_table_ibv *ind_tbl; + struct mlx5_rxq_ibv *rxq; + struct mlx5_ind_table_ibv tmpl; + + rxq = mlx5_rxq_ibv_drop_new(dev); + if (!rxq) + return NULL; + tmpl.ind_table = mlx5_glue->create_rwq_ind_table + (priv->ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = 0, + .ind_tbl = &rxq->wq, + .comp_mask = 0, + }); + if (!tmpl.ind_table) { + DEBUG("port %u cannot allocate indirection table for drop" + " queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl), 0); + if (!ind_tbl) { + rte_errno = ENOMEM; + goto error; + } + ind_tbl->ind_table = tmpl.ind_table; + return ind_tbl; +error: + mlx5_rxq_ibv_drop_release(dev); + return NULL; +} + +/** + * Release a drop indirection table. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_ind_table_ibv_drop_release(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_ind_table_ibv *ind_tbl = priv->drop_queue.hrxq->ind_table; + + claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl->ind_table)); + mlx5_rxq_ibv_drop_release(dev); + rte_free(ind_tbl); + priv->drop_queue.hrxq->ind_table = NULL; +} + +/** + * Create a drop Rx Hash queue. + * + * @param dev + * Pointer to Ethernet device. + * + * @return + * The Verbs object initialised, NULL otherwise and rte_errno is set. + */ +struct mlx5_hrxq * +mlx5_hrxq_drop_new(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_ind_table_ibv *ind_tbl; + struct ibv_qp *qp; + struct mlx5_hrxq *hrxq; + + if (priv->drop_queue.hrxq) { + rte_atomic32_inc(&priv->drop_queue.hrxq->refcnt); + return priv->drop_queue.hrxq; + } + ind_tbl = mlx5_ind_table_ibv_drop_new(dev); + if (!ind_tbl) + return NULL; + qp = mlx5_glue->create_qp_ex(priv->ctx, + &(struct ibv_qp_init_attr_ex){ + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH, + .rx_hash_conf = (struct ibv_rx_hash_conf){ + .rx_hash_function = + IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN, + .rx_hash_key = rss_hash_default_key, + .rx_hash_fields_mask = 0, + }, + .rwq_ind_tbl = ind_tbl->ind_table, + .pd = priv->pd + }); + if (!qp) { + DEBUG("port %u cannot allocate QP for drop queue", + dev->data->port_id); + rte_errno = errno; + goto error; + } + hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0); + if (!hrxq) { + DRV_LOG(WARNING, + "port %u cannot allocate memory for drop queue", + dev->data->port_id); + rte_errno = ENOMEM; + goto error; + } + hrxq->ind_table = ind_tbl; + hrxq->qp = qp; + priv->drop_queue.hrxq = hrxq; + rte_atomic32_set(&hrxq->refcnt, 1); + return hrxq; +error: + if (ind_tbl) + mlx5_ind_table_ibv_drop_release(dev); + return NULL; +} + +/** + * Release a drop hash Rx queue. + * + * @param dev + * Pointer to Ethernet device. + */ +void +mlx5_hrxq_drop_release(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq; + + if (rte_atomic32_dec_and_test(&hrxq->refcnt)) { + claim_zero(mlx5_glue->destroy_qp(hrxq->qp)); + mlx5_ind_table_ibv_drop_release(dev); + rte_free(hrxq); + priv->drop_queue.hrxq = NULL; + } +} diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index dc4ead93..2d14f8a6 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <assert.h> @@ -34,19 +34,29 @@ #include "mlx5_prm.h" static __rte_always_inline uint32_t -rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe); +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); static __rte_always_inline int mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, - uint16_t cqe_cnt, uint32_t *rss_hash); + uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe); static __rte_always_inline uint32_t -rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); +rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); + +static __rte_always_inline void +rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); + +static __rte_always_inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx); uint32_t mlx5_ptype_table[] __rte_cache_aligned = { [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ }; +uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; +uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; + /** * Build a table to translate Rx completion flags to packet type. * @@ -86,6 +96,14 @@ mlx5_set_ptype_table(void) RTE_PTYPE_L4_TCP; (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP; + (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; /* UDP */ (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP; @@ -104,17 +122,27 @@ mlx5_set_ptype_table(void) RTE_PTYPE_L4_TCP; (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP; + (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP; (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP; /* Tunneled - L3 */ + (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_NONFRAG; (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_NONFRAG; @@ -141,12 +169,36 @@ mlx5_set_ptype_table(void) (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP; (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L4_TCP; + (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; + (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP; /* Tunneled - UDP */ (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | @@ -163,6 +215,74 @@ mlx5_set_ptype_table(void) } /** + * Build a table to translate packet to checksum type of Verbs. + */ +void +mlx5_set_cksum_table(void) +{ + unsigned int i; + uint8_t v; + + /* + * The index should have: + * bit[0] = PKT_TX_TCP_SEG + * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM + * bit[4] = PKT_TX_IP_CKSUM + * bit[8] = PKT_TX_OUTER_IP_CKSUM + * bit[9] = tunnel + */ + for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) { + v = 0; + if (i & (1 << 9)) { + /* Tunneled packet. */ + if (i & (1 << 8)) /* Outer IP. */ + v |= MLX5_ETH_WQE_L3_CSUM; + if (i & (1 << 4)) /* Inner IP. */ + v |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ + v |= MLX5_ETH_WQE_L4_INNER_CSUM; + } else { + /* No tunnel. */ + if (i & (1 << 4)) /* IP. */ + v |= MLX5_ETH_WQE_L3_CSUM; + if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */ + v |= MLX5_ETH_WQE_L4_CSUM; + } + mlx5_cksum_table[i] = v; + } +} + +/** + * Build a table to translate packet type of mbuf to SWP type of Verbs. + */ +void +mlx5_set_swp_types_table(void) +{ + unsigned int i; + uint8_t v; + + /* + * The index should have: + * bit[0:1] = PKT_TX_L4_MASK + * bit[4] = PKT_TX_IPV6 + * bit[8] = PKT_TX_OUTER_IPV6 + * bit[9] = PKT_TX_OUTER_UDP + */ + for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) { + v = 0; + if (i & (1 << 8)) + v |= MLX5_ETH_WQE_L3_OUTER_IPV6; + if (i & (1 << 9)) + v |= MLX5_ETH_WQE_L4_OUTER_UDP; + if (i & (1 << 4)) + v |= MLX5_ETH_WQE_L3_INNER_IPV6; + if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52)) + v |= MLX5_ETH_WQE_L4_INNER_UDP; + mlx5_swp_types_table[i] = v; + } +} + +/** * Return the size of tailroom of WQ. * * @param txq @@ -219,6 +339,60 @@ mlx5_copy_to_wq(void *dst, const void *src, size_t n, } /** + * Inline TSO headers into WQE. + * + * @return + * 0 on success, negative errno value on failure. + */ +static int +inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf, + uint32_t *length, + uintptr_t *addr, + uint16_t *pkt_inline_sz, + uint8_t **raw, + uint16_t *max_wqe, + uint16_t *tso_segsz, + uint16_t *tso_header_sz) +{ + uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) + + (1 << txq->wqe_n) * MLX5_WQE_SIZE); + unsigned int copy_b; + uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0; + const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags & + PKT_TX_TUNNEL_MASK); + uint16_t n_wqe; + + *tso_segsz = buf->tso_segsz; + *tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len; + if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) { + txq->stats.oerrors++; + return -EINVAL; + } + if (tunneled) + *tso_header_sz += buf->outer_l2_len + buf->outer_l3_len; + /* First seg must contain all TSO headers. */ + if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) || + *tso_header_sz > DATA_LEN(buf)) { + txq->stats.oerrors++; + return -EINVAL; + } + copy_b = *tso_header_sz - *pkt_inline_sz; + if (!copy_b || ((end - (uintptr_t)*raw) < copy_b)) + return -EAGAIN; + n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4; + if (unlikely(*max_wqe < n_wqe)) + return -EINVAL; + *max_wqe -= n_wqe; + rte_memcpy((void *)*raw, (void *)*addr, copy_b); + *length -= copy_b; + *addr += copy_b; + copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE; + *pkt_inline_sz += copy_b; + *raw += copy_b; + return 0; +} + +/** * DPDK callback to check the status of a tx descriptor. * * @param tx_queue @@ -321,6 +495,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) volatile struct mlx5_wqe_ctrl *last_wqe = NULL; unsigned int segs_n = 0; const unsigned int max_inline = txq->max_inline; + uint64_t addr_64; if (unlikely(!pkts_n)) return 0; @@ -329,13 +504,11 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /* Start processing. */ mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); if (unlikely(!max_wqe)) return 0; do { - struct rte_mbuf *buf = NULL; + struct rte_mbuf *buf = *pkts; /* First_seg. */ uint8_t *raw; volatile struct mlx5_wqe_v *wqe = NULL; volatile rte_v128u32_t *dseg = NULL; @@ -347,14 +520,15 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) uint16_t tso_header_sz = 0; uint16_t ehdr; uint8_t cs_flags; - uint64_t tso = 0; + uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG); + uint32_t swp_offsets = 0; + uint8_t swp_types = 0; uint16_t tso_segsz = 0; #ifdef MLX5_PMD_SOFT_COUNTERS uint32_t total_length = 0; #endif + int ret; - /* first_seg */ - buf = *pkts; segs_n = buf->nb_segs; /* * Make sure there is enough room to store this packet and @@ -389,7 +563,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (pkts_n - i > 1) rte_prefetch0( rte_pktmbuf_mtod(*(pkts + 1), volatile void *)); - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); + txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types); raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; /* Replace the Ethernet type by the VLAN if necessary. */ if (buf->ol_flags & PKT_TX_VLAN_PKT) { @@ -415,54 +590,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) addr += pkt_inline_sz; } raw += MLX5_WQE_DWORD_SIZE; - tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG); if (tso) { - uintptr_t end = - (uintptr_t)(((uintptr_t)txq->wqes) + - (1 << txq->wqe_n) * MLX5_WQE_SIZE); - unsigned int copy_b; - uint8_t vlan_sz = - (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0; - const uint64_t is_tunneled = - buf->ol_flags & (PKT_TX_TUNNEL_GRE | - PKT_TX_TUNNEL_VXLAN); - - tso_header_sz = buf->l2_len + vlan_sz + - buf->l3_len + buf->l4_len; - tso_segsz = buf->tso_segsz; - if (unlikely(tso_segsz == 0)) { - txq->stats.oerrors++; - break; - } - if (is_tunneled && txq->tunnel_en) { - tso_header_sz += buf->outer_l2_len + - buf->outer_l3_len; - cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; - } else { - cs_flags |= MLX5_ETH_WQE_L4_CSUM; - } - if (unlikely(tso_header_sz > MLX5_MAX_TSO_HEADER)) { - txq->stats.oerrors++; + ret = inline_tso(txq, buf, &length, + &addr, &pkt_inline_sz, + &raw, &max_wqe, + &tso_segsz, &tso_header_sz); + if (ret == -EINVAL) { break; - } - copy_b = tso_header_sz - pkt_inline_sz; - /* First seg must contain all headers. */ - assert(copy_b <= length); - if (copy_b && ((end - (uintptr_t)raw) > copy_b)) { - uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4; - - if (unlikely(max_wqe < n)) - break; - max_wqe -= n; - rte_memcpy((void *)raw, (void *)addr, copy_b); - addr += copy_b; - length -= copy_b; - /* Include padding for TSO header. */ - copy_b = MLX5_WQE_DS(copy_b) * - MLX5_WQE_DWORD_SIZE; - pkt_inline_sz += copy_b; - raw += copy_b; - } else { + } else if (ret == -EAGAIN) { /* NOP WQE. */ wqe->ctrl = (rte_v128u32_t){ rte_cpu_to_be_32(txq->wqe_ci << 8), @@ -507,7 +642,8 @@ pkt_inline: if (unlikely(max_wqe < n)) break; max_wqe -= n; - if (tso && !inl) { + if (tso) { + assert(inl == 0); inl = rte_cpu_to_be_32(copy_b | MLX5_INLINE_SEG); rte_memcpy((void *)raw, @@ -542,8 +678,17 @@ pkt_inline: } else if (!segs_n) { goto next_pkt; } else { - raw += copy_b; - inline_room -= copy_b; + /* + * Further inline the next segment only for + * non-TSO packets. + */ + if (!tso) { + raw += copy_b; + inline_room -= copy_b; + } else { + inline_room = 0; + } + /* Move to the next segment. */ --segs_n; buf = buf->next; assert(buf); @@ -565,12 +710,12 @@ pkt_inline: ds = 3; use_dseg: /* Add the remaining packet as a simple ds. */ - addr = rte_cpu_to_be_64(addr); + addr_64 = rte_cpu_to_be_64(addr); *dseg = (rte_v128u32_t){ rte_cpu_to_be_32(length), mlx5_tx_mb2mr(txq, buf), - addr, - addr >> 32, + addr_64, + addr_64 >> 32, }; ++ds; if (!segs_n) @@ -604,12 +749,12 @@ next_seg: total_length += length; #endif /* Store segment information. */ - addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)); + addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)); *dseg = (rte_v128u32_t){ rte_cpu_to_be_32(length), mlx5_tx_mb2mr(txq, buf), - addr, - addr >> 32, + addr_64, + addr_64 >> 32, }; (*txq->elts)[++elts_head & elts_m] = buf; if (--segs_n) @@ -633,8 +778,9 @@ next_pkt: 0, }; wqe->eseg = (rte_v128u32_t){ - 0, - cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16), + swp_offsets, + cs_flags | (swp_types << 8) | + (rte_cpu_to_be_16(tso_segsz) << 16), 0, (ehdr << 16) | rte_cpu_to_be_16(tso_header_sz), }; @@ -647,8 +793,8 @@ next_pkt: 0, }; wqe->eseg = (rte_v128u32_t){ - 0, - cs_flags, + swp_offsets, + cs_flags | (swp_types << 8), 0, (ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz), }; @@ -669,14 +815,13 @@ next_wqe: /* Check whether completion threshold has been reached. */ comp = txq->elts_comp + i + j + k; if (comp >= MLX5_TX_COMP_THRESH) { + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); /* Request completion on last WQE. */ last_wqe->ctrl2 = rte_cpu_to_be_32(8); /* Save elts_head in unused "immediate" field of WQE. */ last_wqe->ctrl3 = txq->elts_head; txq->elts_comp = 0; -#ifndef NDEBUG - ++txq->cq_pi; -#endif } else { txq->elts_comp = comp; } @@ -795,8 +940,6 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /* Start processing. */ mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); if (unlikely(!max_wqe)) return 0; @@ -820,7 +963,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) } max_elts -= segs_n; --pkts_n; - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Retrieve packet information. */ length = PKT_LEN(buf); assert(length); @@ -885,14 +1028,13 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (comp >= MLX5_TX_COMP_THRESH) { volatile struct mlx5_wqe *wqe = mpw.wqe; + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); /* Request completion on last WQE. */ wqe->ctrl[2] = rte_cpu_to_be_32(8); /* Save elts_head in unused "immediate" field of WQE. */ wqe->ctrl[3] = elts_head; txq->elts_comp = 0; -#ifndef NDEBUG - ++txq->cq_pi; -#endif } else { txq->elts_comp = comp; } @@ -1024,8 +1166,6 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, /* Start processing. */ mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); do { struct rte_mbuf *buf = *(pkts++); uintptr_t addr; @@ -1052,7 +1192,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, * iteration. */ max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Retrieve packet information. */ length = PKT_LEN(buf); /* Start new session if packet differs. */ @@ -1182,14 +1322,13 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, if (comp >= MLX5_TX_COMP_THRESH) { volatile struct mlx5_wqe *wqe = mpw.wqe; + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); /* Request completion on last WQE. */ wqe->ctrl[2] = rte_cpu_to_be_32(8); /* Save elts_head in unused "immediate" field of WQE. */ wqe->ctrl[3] = elts_head; txq->elts_comp = 0; -#ifndef NDEBUG - ++txq->cq_pi; -#endif } else { txq->elts_comp = comp; } @@ -1303,6 +1442,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, unsigned int mpw_room = 0; unsigned int inl_pad = 0; uint32_t inl_hdr; + uint64_t addr_64; struct mlx5_mpw mpw = { .state = MLX5_MPW_STATE_CLOSED, }; @@ -1312,15 +1452,12 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, /* Start processing. */ mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); if (unlikely(!max_wqe)) return 0; do { struct rte_mbuf *buf = *(pkts++); uintptr_t addr; - unsigned int n; unsigned int do_inline = 0; /* Whether inline is possible. */ uint32_t length; uint8_t cs_flags; @@ -1330,7 +1467,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, /* Make sure there is enough room to store this packet. */ if (max_elts - j == 0) break; - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Retrieve packet information. */ length = PKT_LEN(buf); /* Start new session if: @@ -1382,7 +1519,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, (!txq->mpw_hdr_dseg || mpw.total_len >= MLX5_WQE_SIZE); } - if (do_inline) { + if (max_inline && do_inline) { /* Inline packet into WQE. */ unsigned int max; @@ -1440,16 +1577,13 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, ((uintptr_t)mpw.data.raw + inl_pad); (*txq->elts)[elts_head++ & elts_m] = buf; - addr = rte_pktmbuf_mtod(buf, uintptr_t); - for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++) - rte_prefetch2((void *)(addr + - n * RTE_CACHE_LINE_SIZE)); - addr = rte_cpu_to_be_64(addr); + addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, + uintptr_t)); *dseg = (rte_v128u32_t) { rte_cpu_to_be_32(length), mlx5_tx_mb2mr(txq, buf), - addr, - addr >> 32, + addr_64, + addr_64 >> 32, }; mpw.data.raw = (volatile void *)(dseg + 1); mpw.total_len += (inl_pad + sizeof(*dseg)); @@ -1473,15 +1607,14 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) { volatile struct mlx5_wqe *wqe = mpw.wqe; + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); /* Request completion on last WQE. */ wqe->ctrl[2] = rte_cpu_to_be_32(8); /* Save elts_head in unused "immediate" field of WQE. */ wqe->ctrl[3] = elts_head; txq->elts_comp = 0; txq->mpw_comp = txq->wqe_ci; -#ifndef NDEBUG - ++txq->cq_pi; -#endif } else { txq->elts_comp += j; } @@ -1541,6 +1674,8 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /** * Translate RX completion flags to packet type. * + * @param[in] rxq + * Pointer to RX queue structure. * @param[in] cqe * Pointer to CQE. * @@ -1550,7 +1685,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * Packet type for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) { uint8_t idx; uint8_t pinfo = cqe->pkt_info; @@ -1565,7 +1700,7 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) * bit[7] = outer_l3_type */ idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); - return mlx5_ptype_table[idx]; + return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); } /** @@ -1577,8 +1712,9 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) * Pointer to RX queue. * @param cqe * CQE to process. - * @param[out] rss_hash - * Packet RSS Hash result. + * @param[out] mcqe + * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not + * written. * * @return * Packet size in bytes (0 if there is none), -1 in case of completion @@ -1586,7 +1722,7 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) */ static inline int mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, - uint16_t cqe_cnt, uint32_t *rss_hash) + uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe) { struct rxq_zip *zip = &rxq->zip; uint16_t cqe_n = cqe_cnt + 1; @@ -1600,7 +1736,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info); len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); - *rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result); + *mcqe = &(*mc)[zip->ai & 7]; if ((++zip->ai & 7) == 0) { /* Invalidate consumed CQEs */ idx = zip->ca; @@ -1665,7 +1801,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; /* Get packet size to return. */ len = rte_be_to_cpu_32((*mc)[0].byte_cnt); - *rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result); + *mcqe = &(*mc)[0]; zip->ai = 1; /* Prefetch all the entries to be invalidated */ idx = zip->ca; @@ -1676,7 +1812,6 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, } } else { len = rte_be_to_cpu_32(cqe->byte_cnt); - *rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res); } /* Error while receiving packet. */ if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR)) @@ -1688,8 +1823,6 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, /** * Translate RX completion flags to offload flags. * - * @param[in] rxq - * Pointer to RX queue structure. * @param[in] cqe * Pointer to CQE. * @@ -1697,7 +1830,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, * Offload flags (ol_flags) for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) +rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) { uint32_t ol_flags = 0; uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc); @@ -1709,18 +1842,56 @@ rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) TRANSPOSE(flags, MLX5_CQE_RX_L4_HDR_VALID, PKT_RX_L4_CKSUM_GOOD); - if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) - ol_flags |= - TRANSPOSE(flags, - MLX5_CQE_RX_L3_HDR_VALID, - PKT_RX_IP_CKSUM_GOOD) | - TRANSPOSE(flags, - MLX5_CQE_RX_L4_HDR_VALID, - PKT_RX_L4_CKSUM_GOOD); return ol_flags; } /** + * Fill in mbuf fields from RX completion flags. + * Note that pkt->ol_flags should be initialized outside of this function. + * + * @param rxq + * Pointer to RX queue. + * @param pkt + * mbuf to fill. + * @param cqe + * CQE to process. + * @param rss_hash_res + * Packet RSS Hash result. + */ +static inline void +rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) +{ + /* Update packet information. */ + pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); + if (rss_hash_res && rxq->rss_hash) { + pkt->hash.rss = rss_hash_res; + pkt->ol_flags |= PKT_RX_RSS_HASH; + } + if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { + pkt->ol_flags |= PKT_RX_FDIR; + if (cqe->sop_drop_qpn != + rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { + uint32_t mark = cqe->sop_drop_qpn; + + pkt->ol_flags |= PKT_RX_FDIR_ID; + pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); + } + } + if (rxq->csum) + pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); + if (rxq->vlan_strip && + (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { + pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; + pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); + } + if (rxq->hw_timestamp) { + pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); + pkt->ol_flags |= PKT_RX_TIMESTAMP; + } +} + +/** * DPDK callback for RX. * * @param dpdk_rxq @@ -1750,9 +1921,11 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) while (pkts_n) { unsigned int idx = rq_ci & wqe_cnt; - volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; struct rte_mbuf *rep = (*rxq->elts)[idx]; - uint32_t rss_hash_res = 0; + volatile struct mlx5_mini_cqe8 *mcqe = NULL; + uint32_t rss_hash_res; if (pkt) NEXT(seg) = rep; @@ -1782,8 +1955,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) } if (!pkt) { cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt]; - len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, - &rss_hash_res); + len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe); if (!len) { rte_mbuf_raw_free(rep); break; @@ -1796,40 +1968,12 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) } pkt = seg; assert(len >= (rxq->crc_present << 2)); - /* Update packet information. */ - pkt->packet_type = rxq_cq_to_pkt_type(cqe); pkt->ol_flags = 0; - if (rss_hash_res && rxq->rss_hash) { - pkt->hash.rss = rss_hash_res; - pkt->ol_flags = PKT_RX_RSS_HASH; - } - if (rxq->mark && - MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { - pkt->ol_flags |= PKT_RX_FDIR; - if (cqe->sop_drop_qpn != - rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { - uint32_t mark = cqe->sop_drop_qpn; - - pkt->ol_flags |= PKT_RX_FDIR_ID; - pkt->hash.fdir.hi = - mlx5_flow_mark_get(mark); - } - } - if (rxq->csum | rxq->csum_l2tun) - pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe); - if (rxq->vlan_strip && - (cqe->hdr_type_etc & - rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { - pkt->ol_flags |= PKT_RX_VLAN | - PKT_RX_VLAN_STRIPPED; - pkt->vlan_tci = - rte_be_to_cpu_16(cqe->vlan_info); - } - if (rxq->hw_timestamp) { - pkt->timestamp = - rte_be_to_cpu_64(cqe->timestamp); - pkt->ol_flags |= PKT_RX_TIMESTAMP; - } + /* If compressed, take hash result from mini-CQE. */ + rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? + cqe->rx_hash_res : + mcqe->rx_hash_result); + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); if (rxq->crc_present) len -= ETHER_CRC_LEN; PKT_LEN(pkt) = len; @@ -1845,6 +1989,9 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) * changes. */ wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_mb2mr(rxq, rep); if (len > DATA_LEN(seg)) { len -= DATA_LEN(seg); ++NB_SEGS(pkt); @@ -1882,6 +2029,246 @@ skip: return i; } +void +mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) +{ + struct mlx5_mprq_buf *buf = opaque; + + if (rte_atomic16_read(&buf->refcnt) == 1) { + rte_mempool_put(buf->mp, buf); + } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { + rte_atomic16_set(&buf->refcnt, 1); + rte_mempool_put(buf->mp, buf); + } +} + +void +mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) +{ + mlx5_mprq_buf_free_cb(NULL, buf); +} + +static inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx) +{ + struct mlx5_mprq_buf *rep = rxq->mprq_repl; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; + void *addr; + + assert(rep != NULL); + /* Replace MPRQ buf. */ + (*rxq->mprq_bufs)[rq_idx] = rep; + /* Replace WQE. */ + addr = mlx5_mprq_buf_addr(rep); + wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); + /* Stash a mbuf for next replacement. */ + if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) + rxq->mprq_repl = rep; + else + rxq->mprq_repl = NULL; +} + +/** + * DPDK callback for RX with Multi-Packet RQ support. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + const unsigned int strd_n = 1 << rxq->strd_num_n; + const unsigned int strd_sz = 1 << rxq->strd_sz_n; + const unsigned int strd_shift = + MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; + const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; + const unsigned int wq_mask = (1 << rxq->elts_n) - 1; + volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; + unsigned int i = 0; + uint16_t rq_ci = rxq->rq_ci; + uint16_t consumed_strd = rxq->consumed_strd; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + + while (i < pkts_n) { + struct rte_mbuf *pkt; + void *addr; + int ret; + unsigned int len; + uint16_t strd_cnt; + uint16_t strd_idx; + uint32_t offset; + uint32_t byte_cnt; + volatile struct mlx5_mini_cqe8 *mcqe = NULL; + uint32_t rss_hash_res = 0; + + if (consumed_strd == strd_n) { + /* Replace WQE only if the buffer is still in use. */ + if (rte_atomic16_read(&buf->refcnt) > 1) { + mprq_buf_replace(rxq, rq_ci & wq_mask); + /* Release the old buffer. */ + mlx5_mprq_buf_free(buf); + } else if (unlikely(rxq->mprq_repl == NULL)) { + struct mlx5_mprq_buf *rep; + + /* + * Currently, the MPRQ mempool is out of buffer + * and doing memcpy regardless of the size of Rx + * packet. Retry allocation to get back to + * normal. + */ + if (!rte_mempool_get(rxq->mprq_mp, + (void **)&rep)) + rxq->mprq_repl = rep; + } + /* Advance to the next WQE. */ + consumed_strd = 0; + ++rq_ci; + buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + } + cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; + ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe); + if (!ret) + break; + if (unlikely(ret == -1)) { + /* RX error, packet is likely too large. */ + ++rxq->stats.idropped; + continue; + } + byte_cnt = ret; + strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> + MLX5_MPRQ_STRIDE_NUM_SHIFT; + assert(strd_cnt); + consumed_strd += strd_cnt; + if (byte_cnt & MLX5_MPRQ_FILLER_MASK) + continue; + if (mcqe == NULL) { + rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); + strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); + } else { + /* mini-CQE for MPRQ doesn't have hash result. */ + strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); + } + assert(strd_idx < strd_n); + assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask)); + /* + * Currently configured to receive a packet per a stride. But if + * MTU is adjusted through kernel interface, device could + * consume multiple strides without raising an error. In this + * case, the packet should be dropped because it is bigger than + * the max_rx_pkt_len. + */ + if (unlikely(strd_cnt > 1)) { + ++rxq->stats.idropped; + continue; + } + pkt = rte_pktmbuf_alloc(rxq->mp); + if (unlikely(pkt == NULL)) { + ++rxq->stats.rx_nombuf; + break; + } + len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; + assert((int)len >= (rxq->crc_present << 2)); + if (rxq->crc_present) + len -= ETHER_CRC_LEN; + offset = strd_idx * strd_sz + strd_shift; + addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf), offset); + /* Initialize the offload flag. */ + pkt->ol_flags = 0; + /* + * Memcpy packets to the target mbuf if: + * - The size of packet is smaller than mprq_max_memcpy_len. + * - Out of buffer in the Mempool for Multi-Packet RQ. + */ + if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { + /* + * When memcpy'ing packet due to out-of-buffer, the + * packet must be smaller than the target mbuf. + */ + if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.idropped; + continue; + } + rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); + } else { + rte_iova_t buf_iova; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = strd_cnt * strd_sz; + + /* Increment the refcnt of the whole chunk. */ + rte_atomic16_add_return(&buf->refcnt, 1); + assert((uint16_t)rte_atomic16_read(&buf->refcnt) <= + strd_n + 1); + addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); + /* + * MLX5 device doesn't use iova but it is necessary in a + * case where the Rx packet is transmitted via a + * different PMD. + */ + buf_iova = rte_mempool_virt2iova(buf) + + RTE_PTR_DIFF(addr, buf); + shinfo = rte_pktmbuf_ext_shinfo_init_helper(addr, + &buf_len, mlx5_mprq_buf_free_cb, buf); + /* + * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when + * attaching the stride to mbuf and more offload flags + * will be added below by calling rxq_cq_to_mbuf(). + * Other fields will be overwritten. + */ + rte_pktmbuf_attach_extbuf(pkt, addr, buf_iova, buf_len, + shinfo); + rte_pktmbuf_reset_headroom(pkt); + assert(pkt->ol_flags == EXT_ATTACHED_MBUF); + /* + * Prevent potential overflow due to MTU change through + * kernel interface. + */ + if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.idropped; + continue; + } + } + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); + PKT_LEN(pkt) = len; + DATA_LEN(pkt) = len; + PORT(pkt) = rxq->port_id; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment bytes counter. */ + rxq->stats.ibytes += PKT_LEN(pkt); +#endif + /* Return packet. */ + *(pkts++) = pkt; + ++i; + } + /* Update the consumer indexes. */ + rxq->consumed_strd = consumed_strd; + rte_cio_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + if (rq_ci != rxq->rq_ci) { + rxq->rq_ci = rq_ci; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment packets counter. */ + rxq->stats.ipackets += i; +#endif + return i; +} + /** * Dummy DPDK callback for TX. * @@ -1899,11 +2286,10 @@ skip: * Number of packets successfully transmitted (<= pkts_n). */ uint16_t -removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +removed_tx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_txq; - (void)pkts; - (void)pkts_n; return 0; } @@ -1924,11 +2310,10 @@ removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * Number of packets successfully received (<= pkts_n). */ uint16_t -removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +removed_rx_burst(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_rxq; - (void)pkts; - (void)pkts_n; return 0; } @@ -1940,58 +2325,49 @@ removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) */ uint16_t __attribute__((weak)) -mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_txq; - (void)pkts; - (void)pkts_n; return 0; } uint16_t __attribute__((weak)) -mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_tx_burst_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_txq; - (void)pkts; - (void)pkts_n; return 0; } uint16_t __attribute__((weak)) -mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) { - (void)dpdk_rxq; - (void)pkts; - (void)pkts_n; return 0; } int __attribute__((weak)) -priv_check_raw_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev) +mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused) { - (void)priv; - (void)dev; return -ENOTSUP; } int __attribute__((weak)) -priv_check_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev) +mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused) { - (void)priv; - (void)dev; return -ENOTSUP; } int __attribute__((weak)) -rxq_check_vec_support(struct mlx5_rxq_data *rxq) +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) { - (void)rxq; return -ENOTSUP; } int __attribute__((weak)) -priv_check_vec_rx_support(struct priv *priv) +mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) { - (void)priv; return -ENOTSUP; } diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index d7e89055..48ed2b20 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_RXTX_H_ @@ -26,13 +26,19 @@ #include <rte_common.h> #include <rte_hexdump.h> #include <rte_atomic.h> +#include <rte_spinlock.h> +#include <rte_io.h> #include "mlx5_utils.h" #include "mlx5.h" +#include "mlx5_mr.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" #include "mlx5_prm.h" +/* Support tunnel matching. */ +#define MLX5_FLOW_TUNNEL 5 + struct mlx5_rxq_stats { unsigned int idx; /**< Mapping index. */ #ifdef MLX5_PMD_SOFT_COUNTERS @@ -54,17 +60,6 @@ struct mlx5_txq_stats { struct priv; -/* Memory region queue object. */ -struct mlx5_mr { - LIST_ENTRY(mlx5_mr) next; /**< Pointer to the next element. */ - rte_atomic32_t refcnt; /*<< Reference counter. */ - uint32_t lkey; /*<< rte_cpu_to_be_32(mr->lkey) */ - uintptr_t start; /* Start address of MR */ - uintptr_t end; /* End address of MR */ - struct ibv_mr *mr; /*<< Memory Region. */ - struct rte_mempool *mp; /*<< Memory Pool. */ -}; - /* Compressed CQE context. */ struct rxq_zip { uint16_t ai; /* Array index. */ @@ -74,10 +69,19 @@ struct rxq_zip { uint32_t cqe_cnt; /* Number of CQEs. */ }; +/* Multi-Packet RQ buffer header. */ +struct mlx5_mprq_buf { + struct rte_mempool *mp; + rte_atomic16_t refcnt; /* Atomically accessed refcnt. */ + uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first packet. */ +} __rte_cache_aligned; + +/* Get pointer to the first stride. */ +#define mlx5_mprq_buf_addr(ptr) ((ptr) + 1) + /* RX queue descriptor. */ struct mlx5_rxq_data { unsigned int csum:1; /* Enable checksum offloading. */ - unsigned int csum_l2tun:1; /* Same for L2 tunnels. */ unsigned int hw_timestamp:1; /* Enable HW timestamp. */ unsigned int vlan_strip:1; /* Enable VLAN stripping. */ unsigned int crc_present:1; /* CRC must be subtracted. */ @@ -86,24 +90,41 @@ struct mlx5_rxq_data { unsigned int elts_n:4; /* Log 2 of Mbufs. */ unsigned int rss_hash:1; /* RSS hash result is enabled. */ unsigned int mark:1; /* Marked flow available on the queue. */ - unsigned int :15; /* Remaining bits. */ + unsigned int strd_num_n:5; /* Log 2 of the number of stride. */ + unsigned int strd_sz_n:4; /* Log 2 of stride size. */ + unsigned int strd_shift_en:1; /* Enable 2bytes shift on a stride. */ + unsigned int :6; /* Remaining bits. */ volatile uint32_t *rq_db; volatile uint32_t *cq_db; uint16_t port_id; uint16_t rq_ci; + uint16_t consumed_strd; /* Number of consumed strides in WQE. */ uint16_t rq_pi; uint16_t cq_ci; - volatile struct mlx5_wqe_data_seg(*wqes)[]; + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ + uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */ + volatile void *wqes; volatile struct mlx5_cqe(*cqes)[]; struct rxq_zip zip; /* Compressed context. */ - struct rte_mbuf *(*elts)[]; + RTE_STD_C11 + union { + struct rte_mbuf *(*elts)[]; + struct mlx5_mprq_buf *(*mprq_bufs)[]; + }; struct rte_mempool *mp; + struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ + struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */ struct mlx5_rxq_stats stats; uint64_t mbuf_initializer; /* Default rearm_data for vectorized Rx. */ struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */ void *cq_uar; /* CQ user access region. */ uint32_t cqn; /* CQ number. */ uint8_t cq_arm_sn; /* CQ arm seq number. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t *uar_lock_cq; + /* CQ (UAR) access lock required for 32bit implementations */ +#endif + uint32_t tunnel; /* Tunnel information. */ } __rte_cache_aligned; /* Verbs Rx queue elements. */ @@ -114,18 +135,20 @@ struct mlx5_rxq_ibv { struct ibv_cq *cq; /* Completion Queue. */ struct ibv_wq *wq; /* Work Queue. */ struct ibv_comp_channel *channel; - struct mlx5_mr *mr; /* Memory Region (for mp). */ }; /* RX queue control descriptor. */ struct mlx5_rxq_ctrl { LIST_ENTRY(mlx5_rxq_ctrl) next; /* Pointer to the next element. */ rte_atomic32_t refcnt; /* Reference counter. */ - struct priv *priv; /* Back pointer to private data. */ struct mlx5_rxq_ibv *ibv; /* Verbs elements. */ + struct priv *priv; /* Back pointer to private data. */ struct mlx5_rxq_data rxq; /* Data path structure. */ unsigned int socket; /* CPU socket ID for allocations. */ unsigned int irq:1; /* Whether IRQ is enabled. */ + uint16_t idx; /* Queue index. */ + uint32_t flow_mark_n; /* Number of Mark/Flag flows using this Queue. */ + uint32_t flow_tunnels_n[MLX5_FLOW_TUNNEL]; /* Tunnels counters. */ }; /* Indirection table. */ @@ -133,7 +156,7 @@ struct mlx5_ind_table_ibv { LIST_ENTRY(mlx5_ind_table_ibv) next; /* Pointer to the next element. */ rte_atomic32_t refcnt; /* Reference counter. */ struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */ - uint16_t queues_n; /**< Number of queues in the list. */ + uint32_t queues_n; /**< Number of queues in the list. */ uint16_t queues[]; /**< Queue list. */ }; @@ -144,7 +167,7 @@ struct mlx5_hrxq { struct mlx5_ind_table_ibv *ind_table; /* Indirection table. */ struct ibv_qp *qp; /* Verbs queue pair. */ uint64_t hash_fields; /* Verbs Hash fields. */ - uint8_t rss_key_len; /* Hash key length in bytes. */ + uint32_t rss_key_len; /* Hash key length in bytes. */ uint8_t rss_key[]; /* Hash key. */ }; @@ -167,26 +190,31 @@ struct mlx5_txq_data { uint16_t tso_en:1; /* When set hardware TSO is enabled. */ uint16_t tunnel_en:1; /* When set TX offload for tunneled packets are supported. */ + uint16_t swp_en:1; /* Whether SW parser is enabled. */ uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ uint16_t inline_max_packet_sz; /* Max packet size for inlining. */ - uint16_t mr_cache_idx; /* Index of last hit entry. */ uint32_t qp_num_8s; /* QP number shifted by 8. */ uint64_t offloads; /* Offloads for Tx Queue. */ + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ volatile void *wqes; /* Work queue (use volatile to write into). */ volatile uint32_t *qp_db; /* Work queue doorbell. */ volatile uint32_t *cq_db; /* Completion queue doorbell. */ volatile void *bf_reg; /* Blueflame register remapped. */ - struct mlx5_mr *mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MR translation table. */ struct rte_mbuf *(*elts)[]; /* TX elements. */ struct mlx5_txq_stats stats; /* TX queue counters. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t *uar_lock; + /* UAR access lock required for 32bit implementations */ +#endif } __rte_cache_aligned; /* Verbs Rx queue elements. */ struct mlx5_txq_ibv { LIST_ENTRY(mlx5_txq_ibv) next; /* Pointer to the next element. */ rte_atomic32_t refcnt; /* Reference counter. */ + struct mlx5_txq_ctrl *txq_ctrl; /* Pointer to the control queue. */ struct ibv_cq *cq; /* Completion Queue. */ struct ibv_qp *qp; /* Queue Pair. */ }; @@ -195,112 +223,203 @@ struct mlx5_txq_ibv { struct mlx5_txq_ctrl { LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */ rte_atomic32_t refcnt; /* Reference counter. */ - struct priv *priv; /* Back pointer to private data. */ unsigned int socket; /* CPU socket ID for allocations. */ unsigned int max_inline_data; /* Max inline data. */ unsigned int max_tso_header; /* Max TSO header size. */ struct mlx5_txq_ibv *ibv; /* Verbs queue object. */ + struct priv *priv; /* Back pointer to private data. */ struct mlx5_txq_data txq; /* Data path structure. */ off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */ volatile void *bf_reg_orig; /* Blueflame register from verbs. */ + uint16_t idx; /* Queue index. */ }; /* mlx5_rxq.c */ extern uint8_t rss_hash_default_key[]; -extern const size_t rss_hash_default_key_len; - -void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *); -int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, - const struct rte_eth_rxconf *, struct rte_mempool *); -void mlx5_rx_queue_release(void *); -int priv_rx_intr_vec_enable(struct priv *priv); -void priv_rx_intr_vec_disable(struct priv *priv); + +int mlx5_check_mprq_support(struct rte_eth_dev *dev); +int mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq); +int mlx5_mprq_enabled(struct rte_eth_dev *dev); +int mlx5_mprq_free_mp(struct rte_eth_dev *dev); +int mlx5_mprq_alloc_mp(struct rte_eth_dev *dev); +void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl); +int mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +void mlx5_rx_queue_release(void *dpdk_rxq); +int mlx5_rx_intr_vec_enable(struct rte_eth_dev *dev); +void mlx5_rx_intr_vec_disable(struct rte_eth_dev *dev); int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id); int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id); -struct mlx5_rxq_ibv *mlx5_priv_rxq_ibv_new(struct priv *, uint16_t); -struct mlx5_rxq_ibv *mlx5_priv_rxq_ibv_get(struct priv *, uint16_t); -int mlx5_priv_rxq_ibv_release(struct priv *, struct mlx5_rxq_ibv *); -int mlx5_priv_rxq_ibv_releasable(struct priv *, struct mlx5_rxq_ibv *); -int mlx5_priv_rxq_ibv_verify(struct priv *); -struct mlx5_rxq_ctrl *mlx5_priv_rxq_new(struct priv *, uint16_t, - uint16_t, unsigned int, - const struct rte_eth_rxconf *, - struct rte_mempool *); -struct mlx5_rxq_ctrl *mlx5_priv_rxq_get(struct priv *, uint16_t); -int mlx5_priv_rxq_release(struct priv *, uint16_t); -int mlx5_priv_rxq_releasable(struct priv *, uint16_t); -int mlx5_priv_rxq_verify(struct priv *); -int rxq_alloc_elts(struct mlx5_rxq_ctrl *); -struct mlx5_ind_table_ibv *mlx5_priv_ind_table_ibv_new(struct priv *, - uint16_t [], - uint16_t); -struct mlx5_ind_table_ibv *mlx5_priv_ind_table_ibv_get(struct priv *, - uint16_t [], - uint16_t); -int mlx5_priv_ind_table_ibv_release(struct priv *, struct mlx5_ind_table_ibv *); -int mlx5_priv_ind_table_ibv_verify(struct priv *); -struct mlx5_hrxq *mlx5_priv_hrxq_new(struct priv *, uint8_t *, uint8_t, - uint64_t, uint16_t [], uint16_t); -struct mlx5_hrxq *mlx5_priv_hrxq_get(struct priv *, uint8_t *, uint8_t, - uint64_t, uint16_t [], uint16_t); -int mlx5_priv_hrxq_release(struct priv *, struct mlx5_hrxq *); -int mlx5_priv_hrxq_ibv_verify(struct priv *); -uint64_t mlx5_priv_get_rx_port_offloads(struct priv *); -uint64_t mlx5_priv_get_rx_queue_offloads(struct priv *); +struct mlx5_rxq_ibv *mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx); +struct mlx5_rxq_ibv *mlx5_rxq_ibv_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_ibv_release(struct mlx5_rxq_ibv *rxq_ibv); +int mlx5_rxq_ibv_releasable(struct mlx5_rxq_ibv *rxq_ibv); +struct mlx5_rxq_ibv *mlx5_rxq_ibv_drop_new(struct rte_eth_dev *dev); +void mlx5_rxq_ibv_drop_release(struct rte_eth_dev *dev); +int mlx5_rxq_ibv_verify(struct rte_eth_dev *dev); +struct mlx5_rxq_ctrl *mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +struct mlx5_rxq_ctrl *mlx5_rxq_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_rxq_verify(struct rte_eth_dev *dev); +int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl); +int rxq_alloc_mprq_buf(struct mlx5_rxq_ctrl *rxq_ctrl); +struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, + const uint16_t *queues, + uint32_t queues_n); +struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, + const uint16_t *queues, + uint32_t queues_n); +int mlx5_ind_table_ibv_release(struct rte_eth_dev *dev, + struct mlx5_ind_table_ibv *ind_tbl); +int mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev); +struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_drop_new(struct rte_eth_dev *dev); +void mlx5_ind_table_ibv_drop_release(struct rte_eth_dev *dev); +struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n, + int tunnel __rte_unused); +struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev, + const uint8_t *rss_key, uint32_t rss_key_len, + uint64_t hash_fields, + const uint16_t *queues, uint32_t queues_n); +int mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hxrq); +int mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev); +struct mlx5_hrxq *mlx5_hrxq_drop_new(struct rte_eth_dev *dev); +void mlx5_hrxq_drop_release(struct rte_eth_dev *dev); +uint64_t mlx5_get_rx_port_offloads(void); +uint64_t mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev); /* mlx5_txq.c */ -int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, - const struct rte_eth_txconf *); -void mlx5_tx_queue_release(void *); -int priv_tx_uar_remap(struct priv *priv, int fd); -struct mlx5_txq_ibv *mlx5_priv_txq_ibv_new(struct priv *, uint16_t); -struct mlx5_txq_ibv *mlx5_priv_txq_ibv_get(struct priv *, uint16_t); -int mlx5_priv_txq_ibv_release(struct priv *, struct mlx5_txq_ibv *); -int mlx5_priv_txq_ibv_releasable(struct priv *, struct mlx5_txq_ibv *); -int mlx5_priv_txq_ibv_verify(struct priv *); -struct mlx5_txq_ctrl *mlx5_priv_txq_new(struct priv *, uint16_t, - uint16_t, unsigned int, - const struct rte_eth_txconf *); -struct mlx5_txq_ctrl *mlx5_priv_txq_get(struct priv *, uint16_t); -int mlx5_priv_txq_release(struct priv *, uint16_t); -int mlx5_priv_txq_releasable(struct priv *, uint16_t); -int mlx5_priv_txq_verify(struct priv *); -void txq_alloc_elts(struct mlx5_txq_ctrl *); -uint64_t mlx5_priv_get_tx_port_offloads(struct priv *); +int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf); +void mlx5_tx_queue_release(void *dpdk_txq); +int mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd); +struct mlx5_txq_ibv *mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx); +struct mlx5_txq_ibv *mlx5_txq_ibv_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv); +int mlx5_txq_ibv_releasable(struct mlx5_txq_ibv *txq_ibv); +int mlx5_txq_ibv_verify(struct rte_eth_dev *dev); +struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_txconf *conf); +struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx); +int mlx5_txq_verify(struct rte_eth_dev *dev); +void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl); +uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev); /* mlx5_rxtx.c */ extern uint32_t mlx5_ptype_table[]; +extern uint8_t mlx5_cksum_table[]; +extern uint8_t mlx5_swp_types_table[]; void mlx5_set_ptype_table(void); -uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_tx_burst_empw(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t); -uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t); -uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); -int mlx5_rx_descriptor_status(void *, uint16_t); -int mlx5_tx_descriptor_status(void *, uint16_t); +void mlx5_set_cksum_table(void); +void mlx5_set_swp_types_table(void); +uint16_t mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); +void mlx5_mprq_buf_free_cb(void *addr, void *opaque); +void mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf); +uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +int mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset); +int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset); /* Vectorized version of mlx5_rxtx.c */ -int priv_check_raw_vec_tx_support(struct priv *, struct rte_eth_dev *); -int priv_check_vec_tx_support(struct priv *, struct rte_eth_dev *); -int rxq_check_vec_support(struct mlx5_rxq_data *); -int priv_check_vec_rx_support(struct priv *); -uint16_t mlx5_tx_burst_raw_vec(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_tx_burst_vec(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_rx_burst_vec(void *, struct rte_mbuf **, uint16_t); +int mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev); +int mlx5_check_vec_tx_support(struct rte_eth_dev *dev); +int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data); +int mlx5_check_vec_rx_support(struct rte_eth_dev *dev); +uint16_t mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); /* mlx5_mr.c */ -void mlx5_mp2mr_iter(struct rte_mempool *, void *); -struct mlx5_mr *priv_txq_mp2mr_reg(struct priv *priv, struct mlx5_txq_data *, - struct rte_mempool *, unsigned int); -struct mlx5_mr *mlx5_txq_mp2mr_reg(struct mlx5_txq_data *, struct rte_mempool *, - unsigned int); +void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl); +uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr); +uint32_t mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr); + +/** + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and + * 64bit architectures. + * + * @param val + * value to write in CPU endian format. + * @param addr + * Address to write to. + * @param lock + * Address of the lock to use for that UAR access. + */ +static __rte_always_inline void +__mlx5_uar_write64_relaxed(uint64_t val, volatile void *addr, + rte_spinlock_t *lock __rte_unused) +{ +#ifdef RTE_ARCH_64 + rte_write64_relaxed(val, addr); +#else /* !RTE_ARCH_64 */ + rte_spinlock_lock(lock); + rte_write32_relaxed(val, addr); + rte_io_wmb(); + rte_write32_relaxed(val >> 32, + (volatile void *)((volatile char *)addr + 4)); + rte_spinlock_unlock(lock); +#endif +} + +/** + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and + * 64bit architectures while guaranteeing the order of execution with the + * code being executed. + * + * @param val + * value to write in CPU endian format. + * @param addr + * Address to write to. + * @param lock + * Address of the lock to use for that UAR access. + */ +static __rte_always_inline void +__mlx5_uar_write64(uint64_t val, volatile void *addr, rte_spinlock_t *lock) +{ + rte_io_wmb(); + __mlx5_uar_write64_relaxed(val, addr, lock); +} + +/* Assist macros, used instead of directly calling the functions they wrap. */ +#ifdef RTE_ARCH_64 +#define mlx5_uar_write64_relaxed(val, dst, lock) \ + __mlx5_uar_write64_relaxed(val, dst, NULL) +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, NULL) +#else +#define mlx5_uar_write64_relaxed(val, dst, lock) \ + __mlx5_uar_write64_relaxed(val, dst, lock) +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock) +#endif #ifndef NDEBUG /** @@ -316,7 +435,7 @@ static inline int check_cqe_seen(volatile struct mlx5_cqe *cqe) { static const uint8_t magic[] = "seen"; - volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0; + volatile uint8_t (*buf)[sizeof(cqe->rsvd1)] = &cqe->rsvd1; int ret = 1; unsigned int i; @@ -363,9 +482,10 @@ check_cqe(volatile struct mlx5_cqe *cqe, (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) return 0; if (!check_cqe_seen(cqe)) { - ERROR("unexpected CQE error %u (0x%02x)" - " syndrome 0x%02x", - op_code, op_code, syndrome); + DRV_LOG(ERR, + "unexpected CQE error %u (0x%02x) syndrome" + " 0x%02x", + op_code, op_code, syndrome); rte_hexdump(stderr, "MLX5 Error CQE:", (const void *)((uintptr_t)err_cqe), sizeof(*err_cqe)); @@ -374,8 +494,8 @@ check_cqe(volatile struct mlx5_cqe *cqe, } else if ((op_code != MLX5_CQE_RESP_SEND) && (op_code != MLX5_CQE_REQ)) { if (!check_cqe_seen(cqe)) { - ERROR("unexpected CQE opcode %u (0x%02x)", - op_code, op_code); + DRV_LOG(ERR, "unexpected CQE opcode %u (0x%02x)", + op_code, op_code); rte_hexdump(stderr, "MLX5 CQE:", (const void *)((uintptr_t)cqe), sizeof(*cqe)); @@ -435,7 +555,7 @@ mlx5_tx_complete(struct mlx5_txq_data *txq) if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { if (!check_cqe_seen(cqe)) { - ERROR("unexpected error CQE, TX stopped"); + DRV_LOG(ERR, "unexpected error CQE, Tx stopped"); rte_hexdump(stderr, "MLX5 TXQ:", (const void *)((uintptr_t)txq->wqes), ((1 << txq->wqe_n) * @@ -487,77 +607,65 @@ mlx5_tx_complete(struct mlx5_txq_data *txq) } /** - * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which - * the cloned mbuf is allocated is returned instead. + * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx + * as mempool is pre-configured and static. * - * @param buf - * Pointer to mbuf. + * @param rxq + * Pointer to Rx queue structure. + * @param addr + * Address to search. * * @return - * Memory pool where data is located for given mbuf. + * Searched LKey on success, UINT32_MAX on no match. */ -static struct rte_mempool * -mlx5_tx_mb2mp(struct rte_mbuf *buf) +static __rte_always_inline uint32_t +mlx5_rx_addr2mr(struct mlx5_rxq_data *rxq, uintptr_t addr) { - if (unlikely(RTE_MBUF_INDIRECT(buf))) - return rte_mbuf_from_indirect(buf)->pool; - return buf->pool; + struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; + uint32_t lkey; + + /* Linear search on MR cache array. */ + lkey = mlx5_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru, + MLX5_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half (Binary Search) on miss. */ + return mlx5_rx_addr2mr_bh(rxq, addr); } +#define mlx5_rx_mb2mr(rxq, mb) mlx5_rx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr)) + /** - * Get Memory Region (MR) <-> rte_mbuf association from txq->mp2mr[]. - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, - * remove an entry first. + * Query LKey from a packet buffer for Tx. If not found, add the mempool. * * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. + * Pointer to Tx queue structure. + * @param addr + * Address to search. * * @return - * mr->lkey on success, (uint32_t)-1 on failure. + * Searched LKey on success, UINT32_MAX on no match. */ static __rte_always_inline uint32_t -mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb) +mlx5_tx_addr2mr(struct mlx5_txq_data *txq, uintptr_t addr) { - uint16_t i = txq->mr_cache_idx; - uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t); - struct mlx5_mr *mr; - - assert(i < RTE_DIM(txq->mp2mr)); - if (likely(txq->mp2mr[i]->start <= addr && txq->mp2mr[i]->end > addr)) - return txq->mp2mr[i]->lkey; - for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { - if (unlikely(txq->mp2mr[i] == NULL || - txq->mp2mr[i]->mr == NULL)) { - /* Unknown MP, add a new MR for it. */ - break; - } - if (txq->mp2mr[i]->start <= addr && - txq->mp2mr[i]->end > addr) { - assert(txq->mp2mr[i]->lkey != (uint32_t)-1); - txq->mr_cache_idx = i; - return txq->mp2mr[i]->lkey; - } - } - mr = mlx5_txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i); - /* - * Request the reference to use in this queue, the original one is - * kept by the control plane. - */ - if (mr) { - rte_atomic32_inc(&mr->refcnt); - txq->mr_cache_idx = i >= RTE_DIM(txq->mp2mr) ? i - 1 : i; - return mr->lkey; - } else { - struct rte_mempool *mp = mlx5_tx_mb2mp(mb); - - WARN("Failed to register mempool 0x%p(%s)", - (void *)mp, mp->name); - } - return (uint32_t)-1; + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + uint32_t lkey; + + /* Check generation bit to see if there's any change on existing MRs. */ + if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen)) + mlx5_mr_flush_local_cache(mr_ctrl); + /* Linear search on MR cache array. */ + lkey = mlx5_mr_lookup_cache(mr_ctrl->cache, &mr_ctrl->mru, + MLX5_MR_CACHE_N, addr); + if (likely(lkey != UINT32_MAX)) + return lkey; + /* Take slower bottom-half (binary search) on miss. */ + return mlx5_tx_addr2mr_bh(txq, addr); } +#define mlx5_tx_mb2mr(rxq, mb) mlx5_tx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr)) + /** * Ring TX queue doorbell and flush the update if requested. * @@ -579,7 +687,7 @@ mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe, *txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci); /* Ensure ordering between DB record and BF copy. */ rte_wmb(); - *dst = *src; + mlx5_uar_write64_relaxed(*src, dst, txq->uar_lock); if (cond) rte_wmb(); } @@ -599,38 +707,100 @@ mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe) } /** - * Convert the Checksum offloads to Verbs. + * Convert mbuf to Verb SWP. * * @param txq_data * Pointer to the Tx queue. * @param buf * Pointer to the mbuf. + * @param tso + * TSO offloads enabled. + * @param vlan + * VLAN offloads enabled + * @param offsets + * Pointer to the SWP header offsets. + * @param swp_types + * Pointer to the SWP header types. + */ +static __rte_always_inline void +txq_mbuf_to_swp(struct mlx5_txq_data *txq, struct rte_mbuf *buf, + uint8_t *offsets, uint8_t *swp_types) +{ + const uint64_t vlan = buf->ol_flags & PKT_TX_VLAN_PKT; + const uint64_t tunnel = buf->ol_flags & PKT_TX_TUNNEL_MASK; + const uint64_t tso = buf->ol_flags & PKT_TX_TCP_SEG; + const uint64_t csum_flags = buf->ol_flags & PKT_TX_L4_MASK; + const uint64_t inner_ip = + buf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6); + const uint64_t ol_flags_mask = PKT_TX_L4_MASK | PKT_TX_IPV6 | + PKT_TX_OUTER_IPV6; + uint16_t idx; + uint16_t off; + + if (likely(!txq->swp_en || (tunnel != PKT_TX_TUNNEL_UDP && + tunnel != PKT_TX_TUNNEL_IP))) + return; + /* + * The index should have: + * bit[0:1] = PKT_TX_L4_MASK + * bit[4] = PKT_TX_IPV6 + * bit[8] = PKT_TX_OUTER_IPV6 + * bit[9] = PKT_TX_OUTER_UDP + */ + idx = (buf->ol_flags & ol_flags_mask) >> 52; + if (tunnel == PKT_TX_TUNNEL_UDP) + idx |= 1 << 9; + *swp_types = mlx5_swp_types_table[idx]; + /* + * Set offsets for SW parser. Since ConnectX-5, SW parser just + * complements HW parser. SW parser starts to engage only if HW parser + * can't reach a header. For the older devices, HW parser will not kick + * in if any of SWP offsets is set. Therefore, all of the L3 offsets + * should be set regardless of HW offload. + */ + off = buf->outer_l2_len + (vlan ? sizeof(struct vlan_hdr) : 0); + offsets[1] = off >> 1; /* Outer L3 offset. */ + off += buf->outer_l3_len; + if (tunnel == PKT_TX_TUNNEL_UDP) + offsets[0] = off >> 1; /* Outer L4 offset. */ + if (inner_ip) { + off += buf->l2_len; + offsets[3] = off >> 1; /* Inner L3 offset. */ + if (csum_flags == PKT_TX_TCP_CKSUM || tso || + csum_flags == PKT_TX_UDP_CKSUM) { + off += buf->l3_len; + offsets[2] = off >> 1; /* Inner L4 offset. */ + } + } +} + +/** + * Convert the Checksum offloads to Verbs. + * + * @param buf + * Pointer to the mbuf. * * @return - * the converted cs_flags. + * Converted checksum flags. */ static __rte_always_inline uint8_t -txq_ol_cksum_to_cs(struct mlx5_txq_data *txq_data, struct rte_mbuf *buf) +txq_ol_cksum_to_cs(struct rte_mbuf *buf) { - uint8_t cs_flags = 0; - - /* Should we enable HW CKSUM offload */ - if (buf->ol_flags & - (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM | - PKT_TX_OUTER_IP_CKSUM)) { - if (txq_data->tunnel_en && - (buf->ol_flags & - (PKT_TX_TUNNEL_GRE | PKT_TX_TUNNEL_VXLAN))) { - cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM | - MLX5_ETH_WQE_L4_INNER_CSUM; - if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) - cs_flags |= MLX5_ETH_WQE_L3_CSUM; - } else { - cs_flags = MLX5_ETH_WQE_L3_CSUM | - MLX5_ETH_WQE_L4_CSUM; - } - } - return cs_flags; + uint32_t idx; + uint8_t is_tunnel = !!(buf->ol_flags & PKT_TX_TUNNEL_MASK); + const uint64_t ol_flags_mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK | + PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM; + + /* + * The index should have: + * bit[0] = PKT_TX_TCP_SEG + * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM + * bit[4] = PKT_TX_IP_CKSUM + * bit[8] = PKT_TX_OUTER_IP_CKSUM + * bit[9] = tunnel + */ + idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9); + return mlx5_cksum_table[idx]; } /** diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c index b66c2916..0a4aed8f 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.c +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox Technologies, Ltd */ #include <assert.h> @@ -42,8 +42,6 @@ /** * Count the number of packets having same ol_flags and calculate cs_flags. * - * @param txq - * Pointer to TX queue structure. * @param pkts * Pointer to array of packets. * @param pkts_n @@ -55,8 +53,7 @@ * Number of packets having same ol_flags. */ static inline unsigned int -txq_calc_offload(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, - uint16_t pkts_n, uint8_t *cs_flags) +txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags) { unsigned int pos; const uint64_t ol_mask = @@ -70,7 +67,7 @@ txq_calc_offload(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, for (pos = 1; pos < pkts_n; ++pos) if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask) break; - *cs_flags = txq_ol_cksum_to_cs(txq, pkts[0]); + *cs_flags = txq_ol_cksum_to_cs(pkts[0]); return pos; } @@ -141,7 +138,7 @@ mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) n = txq_count_contig_single_seg(&pkts[nb_tx], n); if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) - n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags); + n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags); ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags); nb_tx += ret; if (!ret) @@ -223,17 +220,14 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) /** * Check Tx queue flags are set for raw vectorized Tx. * - * @param priv - * Pointer to private structure. * @param dev - * Pointer to rte_eth_dev structure. + * Pointer to Ethernet device. * * @return * 1 if supported, negative errno value if not. */ int __attribute__((cold)) -priv_check_raw_vec_tx_support(__rte_unused struct priv *priv, - struct rte_eth_dev *dev) +mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev) { uint64_t offloads = dev->data->dev_conf.txmode.offloads; @@ -246,17 +240,16 @@ priv_check_raw_vec_tx_support(__rte_unused struct priv *priv, /** * Check a device can support vectorized TX. * - * @param priv - * Pointer to private structure. * @param dev - * Pointer to rte_eth_dev structure. + * Pointer to Ethernet device. * * @return * 1 if supported, negative errno value if not. */ int __attribute__((cold)) -priv_check_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev) +mlx5_check_vec_tx_support(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; uint64_t offloads = dev->data->dev_conf.txmode.offloads; if (!priv->config.tx_vec_en || @@ -277,11 +270,13 @@ priv_check_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev) * 1 if supported, negative errno value if not. */ int __attribute__((cold)) -rxq_check_vec_support(struct mlx5_rxq_data *rxq) +mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq) { struct mlx5_rxq_ctrl *ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); + if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv))) + return -ENOTSUP; if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0) return -ENOTSUP; return 1; @@ -290,26 +285,29 @@ rxq_check_vec_support(struct mlx5_rxq_data *rxq) /** * Check a device can support vectorized RX. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return * 1 if supported, negative errno value if not. */ int __attribute__((cold)) -priv_check_vec_rx_support(struct priv *priv) +mlx5_check_vec_rx_support(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; uint16_t i; if (!priv->config.rx_vec_en) return -ENOTSUP; + if (mlx5_mprq_enabled(dev)) + return -ENOTSUP; /* All the configured queues should support. */ for (i = 0; i < priv->rxqs_n; ++i) { struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; if (!rxq) continue; - if (rxq_check_vec_support(rxq) < 0) + if (mlx5_rxq_check_vec_support(rxq) < 0) break; } if (i != priv->rxqs_n) diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h index 44856bbf..fb884f92 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_RXTX_VEC_H_ @@ -87,21 +87,26 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n) const uint16_t q_mask = q_n - 1; uint16_t elts_idx = rxq->rq_ci & q_mask; struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; - volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx]; + volatile struct mlx5_wqe_data_seg *wq = + &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx]; unsigned int i; - assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH); + assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n)); assert(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi))); - assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP); + assert(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) > MLX5_VPMD_DESCS_PER_LOOP); /* Not to cross queue end. */ n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx); if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { rxq->stats.rx_nombuf += n; return; } - for (i = 0; i < n; ++i) + for (i = 0; i < n; ++i) { wq[i].addr = rte_cpu_to_be_64((uintptr_t)elts[i]->buf_addr + RTE_PKTMBUF_HEADROOM); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]); + } rxq->rq_ci += n; /* Prevent overflowing into consumed mbufs. */ elts_idx = rxq->rq_ci & q_mask; diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h index bbe1818e..b37b7381 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_RXTX_VEC_NEON_H_ @@ -107,8 +107,6 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, assert(elts_n > pkts_n); mlx5_tx_complete(txq); - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); if (unlikely(!pkts_n)) return 0; for (n = 0; n < pkts_n; ++n) { @@ -142,7 +140,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, break; wqe = &((volatile struct mlx5_wqe64 *) txq->wqes)[wqe_ci & wq_mask].hdr; - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Title WQEBB pointer. */ t_wqe = (uint8x16_t *)wqe; dseg = (uint8_t *)(wqe + 1); @@ -167,8 +165,8 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, vst1q_u8((void *)t_wqe, ctrl); /* Fill ESEG in the header. */ vst1q_u16((void *)(t_wqe + 1), - (uint16x8_t) { 0, 0, cs_flags, rte_cpu_to_be_16(len), - 0, 0, 0, 0 }); + ((uint16x8_t) { 0, 0, cs_flags, rte_cpu_to_be_16(len), + 0, 0, 0, 0 })); txq->wqe_ci = wqe_ci; } if (!n) @@ -176,12 +174,11 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, txq->elts_comp += (uint16_t)(elts_head - txq->elts_head); txq->elts_head = elts_head; if (txq->elts_comp >= MLX5_TX_COMP_THRESH) { + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); wqe->ctrl[2] = rte_cpu_to_be_32(8); wqe->ctrl[3] = txq->elts_head; txq->elts_comp = 0; -#ifndef NDEBUG - ++txq->cq_pi; -#endif } #ifdef MLX5_PMD_SOFT_COUNTERS txq->stats.opackets += n; @@ -245,8 +242,6 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, assert(elts_n > pkts_n); mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts); if (unlikely(!pkts_n)) @@ -282,11 +277,10 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) { txq->elts_comp += pkts_n; } else { + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); /* Request a completion. */ txq->elts_comp = 0; -#ifndef NDEBUG - ++txq->cq_pi; -#endif comp_req = 8; } /* Fill CTRL in the header. */ @@ -300,10 +294,10 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, vst1q_u8((void *)t_wqe, ctrl); /* Fill ESEG in the header. */ vst1q_u8((void *)(t_wqe + 1), - (uint8x16_t) { 0, 0, 0, 0, - cs_flags, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0 }); + ((uint8x16_t) { 0, 0, 0, 0, + cs_flags, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 })); #ifdef MLX5_PMD_SOFT_COUNTERS txq->stats.opackets += pkts_n; #endif @@ -551,6 +545,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, const uint64x1_t mbuf_init = vld1_u64(&rxq->mbuf_initializer); const uint64x1_t r32_mask = vcreate_u64(0xffffffff); uint64x2_t rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; if (rxq->mark) { const uint32x4_t ft_def = vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT); @@ -583,14 +578,18 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, ptype = vshrn_n_u32(ptype_info, 10); /* Errored packets will have RTE_PTYPE_ALL_MASK. */ ptype = vorr_u16(ptype, op_err); - pkts[0]->packet_type = - mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 6)]; - pkts[1]->packet_type = - mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 4)]; - pkts[2]->packet_type = - mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 2)]; - pkts[3]->packet_type = - mlx5_ptype_table[vget_lane_u8(vreinterpret_u8_u16(ptype), 0)]; + pt_idx0 = vget_lane_u8(vreinterpret_u8_u16(ptype), 6); + pt_idx1 = vget_lane_u8(vreinterpret_u8_u16(ptype), 4); + pt_idx2 = vget_lane_u8(vreinterpret_u8_u16(ptype), 2); + pt_idx3 = vget_lane_u8(vreinterpret_u8_u16(ptype), 0); + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; /* Fill flags for checksum and VLAN. */ pinfo = vandq_u32(ptype_info, ptype_ol_mask); pinfo = vreinterpretq_u32_u8( @@ -734,7 +733,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, * N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished). */ repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); - if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH) + if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n)) mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); /* See if there're unreturned mbufs from compressed CQE. */ rcvd_pkt = rxq->cq_ci - rxq->rq_pi; diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h index c088bcb5..54b3783c 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_ @@ -107,8 +107,6 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, assert(elts_n > pkts_n); mlx5_tx_complete(txq); - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); if (unlikely(!pkts_n)) return 0; for (n = 0; n < pkts_n; ++n) { @@ -144,7 +142,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, } wqe = &((volatile struct mlx5_wqe64 *) txq->wqes)[wqe_ci & wq_mask].hdr; - cs_flags = txq_ol_cksum_to_cs(txq, buf); + cs_flags = txq_ol_cksum_to_cs(buf); /* Title WQEBB pointer. */ t_wqe = (__m128i *)wqe; dseg = (__m128i *)(wqe + 1); @@ -177,12 +175,11 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, txq->elts_comp += (uint16_t)(elts_head - txq->elts_head); txq->elts_head = elts_head; if (txq->elts_comp >= MLX5_TX_COMP_THRESH) { + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); wqe->ctrl[2] = rte_cpu_to_be_32(8); wqe->ctrl[3] = txq->elts_head; txq->elts_comp = 0; -#ifndef NDEBUG - ++txq->cq_pi; -#endif } #ifdef MLX5_PMD_SOFT_COUNTERS txq->stats.opackets += n; @@ -244,8 +241,6 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, assert(elts_n > pkts_n); mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts); assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr); @@ -283,11 +278,10 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) { txq->elts_comp += pkts_n; } else { + /* A CQE slot must always be available. */ + assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); /* Request a completion. */ txq->elts_comp = 0; -#ifndef NDEBUG - ++txq->cq_pi; -#endif comp_req = 8; } /* Fill CTRL in the header. */ @@ -542,6 +536,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], const __m128i mbuf_init = _mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer); __m128i rearm0, rearm1, rearm2, rearm3; + uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; /* Extract pkt_info field. */ pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]); @@ -595,10 +590,18 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], /* Errored packets will have RTE_PTYPE_ALL_MASK. */ op_err = _mm_srli_epi16(op_err, 8); ptype = _mm_or_si128(ptype, op_err); - pkts[0]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 0)]; - pkts[1]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 2)]; - pkts[2]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 4)]; - pkts[3]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 6)]; + pt_idx0 = _mm_extract_epi8(ptype, 0); + pt_idx1 = _mm_extract_epi8(ptype, 2); + pt_idx2 = _mm_extract_epi8(ptype, 4); + pt_idx3 = _mm_extract_epi8(ptype, 6); + pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] | + !!(pt_idx0 & (1 << 6)) * rxq->tunnel; + pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] | + !!(pt_idx1 & (1 << 6)) * rxq->tunnel; + pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] | + !!(pt_idx2 & (1 << 6)) * rxq->tunnel; + pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] | + !!(pt_idx3 & (1 << 6)) * rxq->tunnel; /* Fill flags for checksum and VLAN. */ pinfo = _mm_and_si128(pinfo, ptype_ol_mask); pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo); @@ -715,7 +718,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, * N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished). */ repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); - if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH) + if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n)) mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); /* See if there're unreturned mbufs from compressed CQE. */ rcvd_pkt = rxq->cq_ci - rxq->rq_pi; diff --git a/drivers/net/mlx5/mlx5_socket.c b/drivers/net/mlx5/mlx5_socket.c index 61c1a4a5..a3a52291 100644 --- a/drivers/net/mlx5/mlx5_socket.c +++ b/drivers/net/mlx5/mlx5_socket.c @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox Technologies, Ltd */ #define _GNU_SOURCE @@ -18,92 +19,105 @@ /** * Initialise the socket to communicate with the secondary process * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_socket_init(struct priv *priv) +mlx5_socket_init(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct sockaddr_un sun = { .sun_family = AF_UNIX, }; int ret; int flags; - struct stat file_stat; /* + * Close the last socket that was used to communicate + * with the secondary process + */ + if (priv->primary_socket) + mlx5_socket_uninit(dev); + /* * Initialise the socket to communicate with the secondary * process. */ ret = socket(AF_UNIX, SOCK_STREAM, 0); if (ret < 0) { - WARN("secondary process not supported: %s", strerror(errno)); - return ret; + rte_errno = errno; + DRV_LOG(WARNING, "port %u secondary process not supported: %s", + dev->data->port_id, strerror(errno)); + goto error; } priv->primary_socket = ret; flags = fcntl(priv->primary_socket, F_GETFL, 0); - if (flags == -1) - goto out; + if (flags == -1) { + rte_errno = errno; + goto error; + } ret = fcntl(priv->primary_socket, F_SETFL, flags | O_NONBLOCK); - if (ret < 0) - goto out; + if (ret < 0) { + rte_errno = errno; + goto error; + } snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d", MLX5_DRIVER_NAME, priv->primary_socket); - ret = stat(sun.sun_path, &file_stat); - if (!ret) - claim_zero(remove(sun.sun_path)); + remove(sun.sun_path); ret = bind(priv->primary_socket, (const struct sockaddr *)&sun, sizeof(sun)); if (ret < 0) { - WARN("cannot bind socket, secondary process not supported: %s", - strerror(errno)); + rte_errno = errno; + DRV_LOG(WARNING, + "port %u cannot bind socket, secondary process not" + " supported: %s", + dev->data->port_id, strerror(errno)); goto close; } ret = listen(priv->primary_socket, 0); if (ret < 0) { - WARN("Secondary process not supported: %s", strerror(errno)); + rte_errno = errno; + DRV_LOG(WARNING, "port %u secondary process not supported: %s", + dev->data->port_id, strerror(errno)); goto close; } - return ret; + return 0; close: remove(sun.sun_path); -out: +error: claim_zero(close(priv->primary_socket)); priv->primary_socket = 0; - return -(ret); + return -rte_errno; } /** * Un-Initialise the socket to communicate with the secondary process * - * @param[in] priv - * Pointer to private structure. - * - * @return - * 0 on success, errno value on failure. + * @param[in] dev */ -int -priv_socket_uninit(struct priv *priv) +void +mlx5_socket_uninit(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; + MKSTR(path, "/var/tmp/%s_%d", MLX5_DRIVER_NAME, priv->primary_socket); claim_zero(close(priv->primary_socket)); priv->primary_socket = 0; claim_zero(remove(path)); - return 0; } /** * Handle socket interrupts. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. */ void -priv_socket_handle(struct priv *priv) +mlx5_socket_handle(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; int conn_sock; int ret = 0; struct cmsghdr *cmsg = NULL; @@ -125,25 +139,30 @@ priv_socket_handle(struct priv *priv) /* Accept the connection from the client. */ conn_sock = accept(priv->primary_socket, NULL, NULL); if (conn_sock < 0) { - WARN("connection failed: %s", strerror(errno)); + DRV_LOG(WARNING, "port %u connection failed: %s", + dev->data->port_id, strerror(errno)); return; } ret = setsockopt(conn_sock, SOL_SOCKET, SO_PASSCRED, &(int){1}, sizeof(int)); if (ret < 0) { - WARN("cannot change socket options"); - goto out; + ret = errno; + DRV_LOG(WARNING, "port %u cannot change socket options: %s", + dev->data->port_id, strerror(rte_errno)); + goto error; } ret = recvmsg(conn_sock, &msg, MSG_WAITALL); if (ret < 0) { - WARN("received an empty message: %s", strerror(errno)); - goto out; + ret = errno; + DRV_LOG(WARNING, "port %u received an empty message: %s", + dev->data->port_id, strerror(rte_errno)); + goto error; } /* Expect to receive credentials only. */ cmsg = CMSG_FIRSTHDR(&msg); if (cmsg == NULL) { - WARN("no message"); - goto out; + DRV_LOG(WARNING, "port %u no message", dev->data->port_id); + goto error; } if ((cmsg->cmsg_type == SCM_CREDENTIALS) && (cmsg->cmsg_len >= sizeof(*cred))) { @@ -152,14 +171,16 @@ priv_socket_handle(struct priv *priv) } cmsg = CMSG_NXTHDR(&msg, cmsg); if (cmsg != NULL) { - WARN("Message wrongly formatted"); - goto out; + DRV_LOG(WARNING, "port %u message wrongly formatted", + dev->data->port_id); + goto error; } /* Make sure all the ancillary data was received and valid. */ if ((cred == NULL) || (cred->uid != getuid()) || (cred->gid != getgid())) { - WARN("wrong credentials"); - goto out; + DRV_LOG(WARNING, "port %u wrong credentials", + dev->data->port_id); + goto error; } /* Set-up the ancillary data. */ cmsg = CMSG_FIRSTHDR(&msg); @@ -171,27 +192,29 @@ priv_socket_handle(struct priv *priv) *fd = priv->ctx->cmd_fd; ret = sendmsg(conn_sock, &msg, 0); if (ret < 0) - WARN("cannot send response"); -out: + DRV_LOG(WARNING, "port %u cannot send response", + dev->data->port_id); +error: close(conn_sock); } /** * Connect to the primary process. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet structure. * * @return - * fd on success, negative errno value on failure. + * fd on success, negative errno value otherwise and rte_errno is set. */ int -priv_socket_connect(struct priv *priv) +mlx5_socket_connect(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct sockaddr_un sun = { .sun_family = AF_UNIX, }; - int socket_fd; + int socket_fd = -1; int *fd = NULL; int ret; struct ucred *cred; @@ -211,57 +234,75 @@ priv_socket_connect(struct priv *priv) ret = socket(AF_UNIX, SOCK_STREAM, 0); if (ret < 0) { - WARN("cannot connect to primary"); - return ret; + rte_errno = errno; + DRV_LOG(WARNING, "port %u cannot connect to primary", + dev->data->port_id); + goto error; } socket_fd = ret; snprintf(sun.sun_path, sizeof(sun.sun_path), "/var/tmp/%s_%d", MLX5_DRIVER_NAME, priv->primary_socket); ret = connect(socket_fd, (const struct sockaddr *)&sun, sizeof(sun)); if (ret < 0) { - WARN("cannot connect to primary"); - goto out; + rte_errno = errno; + DRV_LOG(WARNING, "port %u cannot connect to primary", + dev->data->port_id); + goto error; } cmsg = CMSG_FIRSTHDR(&msg); if (cmsg == NULL) { - DEBUG("cannot get first message"); - goto out; + rte_errno = EINVAL; + DRV_LOG(DEBUG, "port %u cannot get first message", + dev->data->port_id); + goto error; } cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_CREDENTIALS; cmsg->cmsg_len = CMSG_LEN(sizeof(*cred)); cred = (struct ucred *)CMSG_DATA(cmsg); if (cred == NULL) { - DEBUG("no credentials received"); - goto out; + rte_errno = EINVAL; + DRV_LOG(DEBUG, "port %u no credentials received", + dev->data->port_id); + goto error; } cred->pid = getpid(); cred->uid = getuid(); cred->gid = getgid(); ret = sendmsg(socket_fd, &msg, MSG_DONTWAIT); if (ret < 0) { - WARN("cannot send credentials to primary: %s", - strerror(errno)); - goto out; + rte_errno = errno; + DRV_LOG(WARNING, + "port %u cannot send credentials to primary: %s", + dev->data->port_id, strerror(errno)); + goto error; } ret = recvmsg(socket_fd, &msg, MSG_WAITALL); if (ret <= 0) { - WARN("no message from primary: %s", strerror(errno)); - goto out; + rte_errno = errno; + DRV_LOG(WARNING, "port %u no message from primary: %s", + dev->data->port_id, strerror(errno)); + goto error; } cmsg = CMSG_FIRSTHDR(&msg); if (cmsg == NULL) { - WARN("No file descriptor received"); - goto out; + rte_errno = EINVAL; + DRV_LOG(WARNING, "port %u no file descriptor received", + dev->data->port_id); + goto error; } fd = (int *)CMSG_DATA(cmsg); - if (*fd <= 0) { - WARN("no file descriptor received: %s", strerror(errno)); - ret = *fd; - goto out; + if (*fd < 0) { + DRV_LOG(WARNING, "port %u no file descriptor received: %s", + dev->data->port_id, strerror(errno)); + rte_errno = *fd; + goto error; } ret = *fd; -out: close(socket_fd); return ret; +error: + if (socket_fd != -1) + close(socket_fd); + return -rte_errno; } diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index 378472a7..91f3d474 100644 --- a/drivers/net/mlx5/mlx5_stats.c +++ b/drivers/net/mlx5/mlx5_stats.c @@ -1,10 +1,13 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ +#include <inttypes.h> #include <linux/sockios.h> #include <linux/ethtool.h> +#include <stdint.h> +#include <stdio.h> #include <rte_ethdev_driver.h> #include <rte_common.h> @@ -19,6 +22,7 @@ struct mlx5_counter_ctrl { char dpdk_name[RTE_ETH_XSTATS_NAME_SIZE]; /* Name of the counter on the device table. */ char ctr_name[RTE_ETH_XSTATS_NAME_SIZE]; + uint32_t ib:1; /**< Nonzero for IB counters. */ }; static const struct mlx5_counter_ctrl mlx5_counters_init[] = { @@ -93,6 +97,7 @@ static const struct mlx5_counter_ctrl mlx5_counters_init[] = { { .dpdk_name = "rx_out_of_buffer", .ctr_name = "out_of_buffer", + .ib = 1, }, { .dpdk_name = "tx_packets_phy", @@ -117,39 +122,56 @@ static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init); /** * Read device counters table. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[out] stats * Counters table output buffer. * * @return - * 0 on success and stats is filled, negative on error. + * 0 on success and stats is filled, negative errno value otherwise and + * rte_errno is set. */ static int -priv_read_dev_counters(struct priv *priv, uint64_t *stats) +mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats) { + struct priv *priv = dev->data->dev_private; struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; unsigned int i; struct ifreq ifr; unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t); unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz]; struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf; + int ret; et_stats->cmd = ETHTOOL_GSTATS; et_stats->n_stats = xstats_ctrl->stats_n; ifr.ifr_data = (caddr_t)et_stats; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr) != 0) { - WARN("unable to read statistic values from device"); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + if (ret) { + DRV_LOG(WARNING, + "port %u unable to read statistic values from device", + dev->data->port_id); + return ret; } for (i = 0; i != xstats_n; ++i) { - if (priv_is_ib_cntr(mlx5_counters_init[i].ctr_name)) - priv_get_cntr_sysfs(priv, - mlx5_counters_init[i].ctr_name, - &stats[i]); - else + if (mlx5_counters_init[i].ib) { + FILE *file; + MKSTR(path, "%s/ports/1/hw_counters/%s", + priv->ibdev_path, + mlx5_counters_init[i].ctr_name); + + file = fopen(path, "rb"); + if (file) { + int n = fscanf(file, "%" SCNu64, &stats[i]); + + fclose(file); + if (n != 1) + stats[i] = 0; + } + } else { stats[i] = (uint64_t) et_stats->data[xstats_ctrl->dev_table_idx[i]]; + } } return 0; } @@ -157,22 +179,26 @@ priv_read_dev_counters(struct priv *priv, uint64_t *stats) /** * Query the number of statistics provided by ETHTOOL. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return - * Number of statistics on success, -1 on error. + * Number of statistics on success, negative errno value otherwise and + * rte_errno is set. */ static int -priv_ethtool_get_stats_n(struct priv *priv) { +mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) { struct ethtool_drvinfo drvinfo; struct ifreq ifr; + int ret; drvinfo.cmd = ETHTOOL_GDRVINFO; ifr.ifr_data = (caddr_t)&drvinfo; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr) != 0) { - WARN("unable to query number of statistics"); - return -1; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + if (ret) { + DRV_LOG(WARNING, "port %u unable to query number of statistics", + dev->data->port_id); + return ret; } return drvinfo.n_stats; } @@ -180,12 +206,13 @@ priv_ethtool_get_stats_n(struct priv *priv) { /** * Init the structures to read device counters. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. */ void -priv_xstats_init(struct priv *priv) +mlx5_xstats_init(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; unsigned int i; unsigned int j; @@ -193,12 +220,15 @@ priv_xstats_init(struct priv *priv) struct ethtool_gstrings *strings = NULL; unsigned int dev_stats_n; unsigned int str_sz; + int ret; - dev_stats_n = priv_ethtool_get_stats_n(priv); - if (dev_stats_n < 1) { - WARN("no extended statistics available"); + ret = mlx5_ethtool_get_stats_n(dev); + if (ret < 0) { + DRV_LOG(WARNING, "port %u no extended statistics available", + dev->data->port_id); return; } + dev_stats_n = ret; xstats_ctrl->stats_n = dev_stats_n; /* Allocate memory to grab stat names and values. */ str_sz = dev_stats_n * ETH_GSTRING_LEN; @@ -206,15 +236,18 @@ priv_xstats_init(struct priv *priv) rte_malloc("xstats_strings", str_sz + sizeof(struct ethtool_gstrings), 0); if (!strings) { - WARN("unable to allocate memory for xstats"); + DRV_LOG(WARNING, "port %u unable to allocate memory for xstats", + dev->data->port_id); return; } strings->cmd = ETHTOOL_GSTRINGS; strings->string_set = ETH_SS_STATS; strings->len = dev_stats_n; ifr.ifr_data = (caddr_t)strings; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr) != 0) { - WARN("unable to get statistic names"); + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + if (ret) { + DRV_LOG(WARNING, "port %u unable to get statistic names", + dev->data->port_id); goto free; } for (j = 0; j != xstats_n; ++j) @@ -232,68 +265,67 @@ priv_xstats_init(struct priv *priv) } } for (j = 0; j != xstats_n; ++j) { - if (priv_is_ib_cntr(mlx5_counters_init[j].ctr_name)) + if (mlx5_counters_init[j].ib) continue; if (xstats_ctrl->dev_table_idx[j] >= dev_stats_n) { - WARN("counter \"%s\" is not recognized", - mlx5_counters_init[j].dpdk_name); + DRV_LOG(WARNING, + "port %u counter \"%s\" is not recognized", + dev->data->port_id, + mlx5_counters_init[j].dpdk_name); goto free; } } /* Copy to base at first time. */ assert(xstats_n <= MLX5_MAX_XSTATS); - priv_read_dev_counters(priv, xstats_ctrl->base); + ret = mlx5_read_dev_counters(dev, xstats_ctrl->base); + if (ret) + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); free: rte_free(strings); } /** - * Get device extended statistics. + * DPDK callback to get extended device statistics. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param[out] stats * Pointer to rte extended stats table. + * @param n + * The size of the stats table. * * @return * Number of extended stats on success and stats is filled, - * negative on error. + * negative on error and rte_errno is set. */ -static int -priv_xstats_get(struct priv *priv, struct rte_eth_xstat *stats) +int +mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + unsigned int n) { - struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + struct priv *priv = dev->data->dev_private; unsigned int i; - unsigned int n = xstats_n; uint64_t counters[n]; - if (priv_read_dev_counters(priv, counters) < 0) - return -1; - for (i = 0; i != xstats_n; ++i) { - stats[i].id = i; - stats[i].value = (counters[i] - xstats_ctrl->base[i]); - } - return n; -} - -/** - * Reset device extended statistics. - * - * @param priv - * Pointer to private structure. - */ -static void -priv_xstats_reset(struct priv *priv) -{ - struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; - unsigned int i; - unsigned int n = xstats_n; - uint64_t counters[n]; + if (n >= xstats_n && stats) { + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + int stats_n; + int ret; - if (priv_read_dev_counters(priv, counters) < 0) - return; - for (i = 0; i != n; ++i) - xstats_ctrl->base[i] = counters[i]; + stats_n = mlx5_ethtool_get_stats_n(dev); + if (stats_n < 0) + return stats_n; + if (xstats_ctrl->stats_n != stats_n) + mlx5_xstats_init(dev); + ret = mlx5_read_dev_counters(dev, counters); + if (ret) + return ret; + for (i = 0; i != xstats_n; ++i) { + stats[i].id = i; + stats[i].value = (counters[i] - xstats_ctrl->base[i]); + } + } + return xstats_n; } /** @@ -303,6 +335,10 @@ priv_xstats_reset(struct priv *priv) * Pointer to Ethernet device structure. * @param[out] stats * Stats structure output buffer. + * + * @return + * 0 on success and stats is filled, negative errno value otherwise and + * rte_errno is set. */ int mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) @@ -312,7 +348,6 @@ mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) unsigned int i; unsigned int idx; - priv_lock(priv); /* Add software counters. */ for (i = 0; (i != priv->rxqs_n); ++i) { struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; @@ -358,7 +393,6 @@ mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) /* FIXME: retrieve and add hardware counters. */ #endif *stats = tmp; - priv_unlock(priv); return 0; } @@ -375,7 +409,6 @@ mlx5_stats_reset(struct rte_eth_dev *dev) unsigned int i; unsigned int idx; - priv_lock(priv); for (i = 0; (i != priv->rxqs_n); ++i) { if ((*priv->rxqs)[i] == NULL) continue; @@ -393,45 +426,6 @@ mlx5_stats_reset(struct rte_eth_dev *dev) #ifndef MLX5_PMD_SOFT_COUNTERS /* FIXME: reset hardware counters. */ #endif - priv_unlock(priv); -} - -/** - * DPDK callback to get extended device statistics. - * - * @param dev - * Pointer to Ethernet device structure. - * @param[out] stats - * Stats table output buffer. - * @param n - * The size of the stats table. - * - * @return - * Number of xstats on success, negative on failure. - */ -int -mlx5_xstats_get(struct rte_eth_dev *dev, - struct rte_eth_xstat *stats, unsigned int n) -{ - struct priv *priv = dev->data->dev_private; - int ret = xstats_n; - - if (n >= xstats_n && stats) { - struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; - int stats_n; - - priv_lock(priv); - stats_n = priv_ethtool_get_stats_n(priv); - if (stats_n < 0) { - priv_unlock(priv); - return -1; - } - if (xstats_ctrl->stats_n != stats_n) - priv_xstats_init(priv); - ret = priv_xstats_get(priv, stats); - priv_unlock(priv); - } - return ret; } /** @@ -446,16 +440,27 @@ mlx5_xstats_reset(struct rte_eth_dev *dev) struct priv *priv = dev->data->dev_private; struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; int stats_n; + unsigned int i; + unsigned int n = xstats_n; + uint64_t counters[n]; + int ret; - priv_lock(priv); - stats_n = priv_ethtool_get_stats_n(priv); - if (stats_n < 0) - goto unlock; + stats_n = mlx5_ethtool_get_stats_n(dev); + if (stats_n < 0) { + DRV_LOG(ERR, "port %u cannot get stats: %s", dev->data->port_id, + strerror(-stats_n)); + return; + } if (xstats_ctrl->stats_n != stats_n) - priv_xstats_init(priv); - priv_xstats_reset(priv); -unlock: - priv_unlock(priv); + mlx5_xstats_init(dev); + ret = mlx5_read_dev_counters(dev, counters); + if (ret) { + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); + return; + } + for (i = 0; i != n; ++i) + xstats_ctrl->base[i] = counters[i]; } /** @@ -472,21 +477,18 @@ unlock: * Number of xstats names. */ int -mlx5_xstats_get_names(struct rte_eth_dev *dev, - struct rte_eth_xstat_name *xstats_names, unsigned int n) +mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_xstat_name *xstats_names, unsigned int n) { - struct priv *priv = dev->data->dev_private; unsigned int i; if (n >= xstats_n && xstats_names) { - priv_lock(priv); for (i = 0; i != xstats_n; ++i) { strncpy(xstats_names[i].name, mlx5_counters_init[i].dpdk_name, RTE_ETH_XSTATS_NAME_SIZE); xstats_names[i].name[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0; } - priv_unlock(priv); } return xstats_n; } diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c index f5711a99..e2a9bb70 100644 --- a/drivers/net/mlx5/mlx5_trigger.c +++ b/drivers/net/mlx5/mlx5_trigger.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <unistd.h> @@ -14,83 +14,133 @@ #include "mlx5_rxtx.h" #include "mlx5_utils.h" +/** + * Stop traffic on Tx queues. + * + * @param dev + * Pointer to Ethernet device structure. + */ static void -priv_txq_stop(struct priv *priv) +mlx5_txq_stop(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; for (i = 0; i != priv->txqs_n; ++i) - mlx5_priv_txq_release(priv, i); + mlx5_txq_release(dev, i); } +/** + * Start traffic on Tx queues. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ static int -priv_txq_start(struct priv *priv) +mlx5_txq_start(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; - int ret = 0; + int ret; - /* Add memory regions to Tx queues. */ for (i = 0; i != priv->txqs_n; ++i) { - unsigned int idx = 0; - struct mlx5_mr *mr; - struct mlx5_txq_ctrl *txq_ctrl = mlx5_priv_txq_get(priv, i); + struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i); if (!txq_ctrl) continue; - LIST_FOREACH(mr, &priv->mr, next) { - priv_txq_mp2mr_reg(priv, &txq_ctrl->txq, mr->mp, idx++); - if (idx == MLX5_PMD_TX_MP_CACHE) - break; - } txq_alloc_elts(txq_ctrl); - txq_ctrl->ibv = mlx5_priv_txq_ibv_new(priv, i); + txq_ctrl->ibv = mlx5_txq_ibv_new(dev, i); if (!txq_ctrl->ibv) { - ret = ENOMEM; + rte_errno = ENOMEM; goto error; } } - ret = priv_tx_uar_remap(priv, priv->ctx->cmd_fd); - if (ret) + ret = mlx5_tx_uar_remap(dev, priv->ctx->cmd_fd); + if (ret) { + /* Adjust index for rollback. */ + i = priv->txqs_n - 1; goto error; - return ret; + } + return 0; error: - priv_txq_stop(priv); - return ret; + ret = rte_errno; /* Save rte_errno before cleanup. */ + do { + mlx5_txq_release(dev, i); + } while (i-- != 0); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } +/** + * Stop traffic on Rx queues. + * + * @param dev + * Pointer to Ethernet device structure. + */ static void -priv_rxq_stop(struct priv *priv) +mlx5_rxq_stop(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; for (i = 0; i != priv->rxqs_n; ++i) - mlx5_priv_rxq_release(priv, i); + mlx5_rxq_release(dev, i); } +/** + * Start traffic on Rx queues. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ static int -priv_rxq_start(struct priv *priv) +mlx5_rxq_start(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; unsigned int i; int ret = 0; + /* Allocate/reuse/resize mempool for Multi-Packet RQ. */ + if (mlx5_mprq_alloc_mp(dev)) { + /* Should not release Rx queues but return immediately. */ + return -rte_errno; + } for (i = 0; i != priv->rxqs_n; ++i) { - struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_priv_rxq_get(priv, i); + struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_get(dev, i); + struct rte_mempool *mp; if (!rxq_ctrl) continue; + /* Pre-register Rx mempool. */ + mp = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + rxq_ctrl->rxq.mprq_mp : rxq_ctrl->rxq.mp; + DRV_LOG(DEBUG, + "port %u Rx queue %u registering" + " mp %s having %u chunks", + dev->data->port_id, rxq_ctrl->idx, + mp->name, mp->nb_mem_chunks); + mlx5_mr_update_mp(dev, &rxq_ctrl->rxq.mr_ctrl, mp); ret = rxq_alloc_elts(rxq_ctrl); if (ret) goto error; - rxq_ctrl->ibv = mlx5_priv_rxq_ibv_new(priv, i); - if (!rxq_ctrl->ibv) { - ret = ENOMEM; + rxq_ctrl->ibv = mlx5_rxq_ibv_new(dev, i); + if (!rxq_ctrl->ibv) goto error; - } } - return -ret; + return 0; error: - priv_rxq_stop(priv); - return -ret; + ret = rte_errno; /* Save rte_errno before cleanup. */ + do { + mlx5_rxq_release(dev, i); + } while (i-- != 0); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** @@ -102,68 +152,62 @@ error: * Pointer to Ethernet device structure. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_dev_start(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; - struct mlx5_mr *mr = NULL; - int err; + int ret; - dev->data->dev_started = 1; - priv_lock(priv); - err = priv_flow_create_drop_queue(priv); - if (err) { - ERROR("%p: Drop queue allocation failed: %s", - (void *)dev, strerror(err)); - goto error; + DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id); + ret = mlx5_txq_start(dev); + if (ret) { + DRV_LOG(ERR, "port %u Tx queue allocation failed: %s", + dev->data->port_id, strerror(rte_errno)); + return -rte_errno; } - DEBUG("%p: allocating and configuring hash RX queues", (void *)dev); - rte_mempool_walk(mlx5_mp2mr_iter, priv); - err = priv_txq_start(priv); - if (err) { - ERROR("%p: TXQ allocation failed: %s", - (void *)dev, strerror(err)); - goto error; + ret = mlx5_rxq_start(dev); + if (ret) { + DRV_LOG(ERR, "port %u Rx queue allocation failed: %s", + dev->data->port_id, strerror(rte_errno)); + mlx5_txq_stop(dev); + return -rte_errno; } - err = priv_rxq_start(priv); - if (err) { - ERROR("%p: RXQ allocation failed: %s", - (void *)dev, strerror(err)); + dev->data->dev_started = 1; + ret = mlx5_rx_intr_vec_enable(dev); + if (ret) { + DRV_LOG(ERR, "port %u Rx interrupt vector creation failed", + dev->data->port_id); goto error; } - err = priv_rx_intr_vec_enable(priv); - if (err) { - ERROR("%p: RX interrupt vector creation failed", - (void *)priv); + mlx5_xstats_init(dev); + ret = mlx5_traffic_enable(dev); + if (ret) { + DRV_LOG(DEBUG, "port %u failed to set defaults flows", + dev->data->port_id); goto error; } - priv_xstats_init(priv); - /* Update link status and Tx/Rx callbacks for the first time. */ - memset(&dev->data->dev_link, 0, sizeof(struct rte_eth_link)); - INFO("Forcing port %u link to be up", dev->data->port_id); - err = priv_force_link_status_change(priv, ETH_LINK_UP); - if (err) { - DEBUG("Failed to set port %u link to be up", - dev->data->port_id); + ret = mlx5_flow_start(dev, &priv->flows); + if (ret) { + DRV_LOG(DEBUG, "port %u failed to set flows", + dev->data->port_id); goto error; } - priv_dev_interrupt_handler_install(priv, dev); - priv_unlock(priv); + dev->tx_pkt_burst = mlx5_select_tx_function(dev); + dev->rx_pkt_burst = mlx5_select_rx_function(dev); + mlx5_dev_interrupt_handler_install(dev); return 0; error: + ret = rte_errno; /* Save rte_errno before cleanup. */ /* Rollback. */ dev->data->dev_started = 0; - for (mr = LIST_FIRST(&priv->mr); mr; mr = LIST_FIRST(&priv->mr)) - priv_mr_release(priv, mr); - priv_flow_stop(priv, &priv->flows); - priv_dev_traffic_disable(priv, dev); - priv_txq_stop(priv); - priv_rxq_stop(priv); - priv_flow_delete_drop_queue(priv); - priv_unlock(priv); - return err; + mlx5_flow_stop(dev, &priv->flows); + mlx5_traffic_disable(dev); + mlx5_txq_stop(dev); + mlx5_rxq_stop(dev); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** @@ -178,42 +222,37 @@ void mlx5_dev_stop(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; - struct mlx5_mr *mr; - priv_lock(priv); dev->data->dev_started = 0; /* Prevent crashes when queues are still in use. */ dev->rx_pkt_burst = removed_rx_burst; dev->tx_pkt_burst = removed_tx_burst; rte_wmb(); usleep(1000 * priv->rxqs_n); - DEBUG("%p: cleaning up and destroying hash RX queues", (void *)dev); - priv_flow_stop(priv, &priv->flows); - priv_dev_traffic_disable(priv, dev); - priv_rx_intr_vec_disable(priv); - priv_dev_interrupt_handler_uninstall(priv, dev); - priv_txq_stop(priv); - priv_rxq_stop(priv); - for (mr = LIST_FIRST(&priv->mr); mr; mr = LIST_FIRST(&priv->mr)) - priv_mr_release(priv, mr); - priv_flow_delete_drop_queue(priv); - priv_unlock(priv); + DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id); + mlx5_flow_stop(dev, &priv->flows); + mlx5_traffic_disable(dev); + mlx5_rx_intr_vec_disable(dev); + mlx5_dev_interrupt_handler_uninstall(dev); + mlx5_txq_stop(dev); + mlx5_rxq_stop(dev); } /** * Enable traffic flows configured by control plane * - * @param priv + * @param dev * Pointer to Ethernet device private data. * @param dev * Pointer to Ethernet device structure. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) +mlx5_traffic_enable(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct rte_flow_item_eth bcast = { .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", }; @@ -246,8 +285,9 @@ priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) .type = 0, }; - claim_zero(mlx5_ctrl_flow(dev, &promisc, &promisc)); - return 0; + ret = mlx5_ctrl_flow(dev, &promisc, &promisc); + if (ret) + goto error; } if (dev->data->all_multicast) { struct rte_flow_item_eth multicast = { @@ -256,7 +296,9 @@ priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) .type = 0, }; - claim_zero(mlx5_ctrl_flow(dev, &multicast, &multicast)); + ret = mlx5_ctrl_flow(dev, &multicast, &multicast); + if (ret) + goto error; } else { /* Add broadcast/multicast flows. */ for (i = 0; i != vlan_filter_n; ++i) { @@ -265,9 +307,8 @@ priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) struct rte_flow_item_vlan vlan_spec = { .tci = rte_cpu_to_be_16(vlan), }; - struct rte_flow_item_vlan vlan_mask = { - .tci = 0xffff, - }; + struct rte_flow_item_vlan vlan_mask = + rte_flow_item_vlan_mask; ret = mlx5_ctrl_flow_vlan(dev, &bcast, &bcast, &vlan_spec, &vlan_mask); @@ -304,9 +345,8 @@ priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) struct rte_flow_item_vlan vlan_spec = { .tci = rte_cpu_to_be_16(vlan), }; - struct rte_flow_item_vlan vlan_mask = { - .tci = 0xffff, - }; + struct rte_flow_item_vlan vlan_mask = + rte_flow_item_vlan_mask; ret = mlx5_ctrl_flow_vlan(dev, &unicast, &unicast_mask, @@ -316,74 +356,49 @@ priv_dev_traffic_enable(struct priv *priv, struct rte_eth_dev *dev) goto error; } if (!vlan_filter_n) { - ret = mlx5_ctrl_flow(dev, &unicast, - &unicast_mask); + ret = mlx5_ctrl_flow(dev, &unicast, &unicast_mask); if (ret) goto error; } } return 0; error: - return rte_errno; + ret = rte_errno; /* Save rte_errno before cleanup. */ + mlx5_flow_list_flush(dev, &priv->ctrl_flows); + rte_errno = ret; /* Restore rte_errno. */ + return -rte_errno; } /** * Disable traffic flows configured by control plane * - * @param priv - * Pointer to Ethernet device private data. * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success. - */ -int -priv_dev_traffic_disable(struct priv *priv, struct rte_eth_dev *dev) -{ - (void)dev; - priv_flow_flush(priv, &priv->ctrl_flows); - return 0; -} - -/** - * Restart traffic flows configured by control plane - * - * @param priv * Pointer to Ethernet device private data. - * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success. */ -int -priv_dev_traffic_restart(struct priv *priv, struct rte_eth_dev *dev) +void +mlx5_traffic_disable(struct rte_eth_dev *dev) { - if (dev->data->dev_started) { - priv_dev_traffic_disable(priv, dev); - priv_dev_traffic_enable(priv, dev); - } - return 0; + struct priv *priv = dev->data->dev_private; + + mlx5_flow_list_flush(dev, &priv->ctrl_flows); } /** * Restart traffic flows configured by control plane * * @param dev - * Pointer to Ethernet device structure. + * Pointer to Ethernet device private data. * * @return - * 0 on success. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_traffic_restart(struct rte_eth_dev *dev) { - struct priv *priv = dev->data->dev_private; - - priv_lock(priv); - priv_dev_traffic_restart(priv, dev); - priv_unlock(priv); + if (dev->data->dev_started) { + mlx5_traffic_disable(dev); + return mlx5_traffic_enable(dev); + } return 0; } diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index ed1c713e..f9bc4739 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -47,7 +47,8 @@ txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl) for (i = 0; (i != elts_n); ++i) (*txq_ctrl->txq.elts)[i] = NULL; - DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n); + DRV_LOG(DEBUG, "port %u Tx queue %u allocated and configured %u WRs", + PORT_ID(txq_ctrl->priv), txq_ctrl->idx, elts_n); txq_ctrl->txq.elts_head = 0; txq_ctrl->txq.elts_tail = 0; txq_ctrl->txq.elts_comp = 0; @@ -68,7 +69,8 @@ txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl) uint16_t elts_tail = txq_ctrl->txq.elts_tail; struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts; - DEBUG("%p: freeing WRs", (void *)txq_ctrl); + DRV_LOG(DEBUG, "port %u Tx queue %u freeing WRs", + PORT_ID(txq_ctrl->priv), txq_ctrl->idx); txq_ctrl->txq.elts_head = 0; txq_ctrl->txq.elts_tail = 0; txq_ctrl->txq.elts_comp = 0; @@ -91,15 +93,16 @@ txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl) /** * Returns the per-port supported offloads. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * * @return * Supported Tx offloads. */ uint64_t -mlx5_priv_get_tx_port_offloads(struct priv *priv) +mlx5_get_tx_port_offloads(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; uint64_t offloads = (DEV_TX_OFFLOAD_MULTI_SEGS | DEV_TX_OFFLOAD_VLAN_INSERT); struct mlx5_dev_config *config = &priv->config; @@ -110,6 +113,14 @@ mlx5_priv_get_tx_port_offloads(struct priv *priv) DEV_TX_OFFLOAD_TCP_CKSUM); if (config->tso) offloads |= DEV_TX_OFFLOAD_TCP_TSO; + if (config->swp) { + if (config->hw_csum) + offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + if (config->tso) + offloads |= (DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO); + } + if (config->tunnel_en) { if (config->hw_csum) offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; @@ -121,31 +132,6 @@ mlx5_priv_get_tx_port_offloads(struct priv *priv) } /** - * Checks if the per-queue offload configuration is valid. - * - * @param priv - * Pointer to private structure. - * @param offloads - * Per-queue offloads configuration. - * - * @return - * 1 if the configuration is valid, 0 otherwise. - */ -static int -priv_is_tx_queue_offloads_allowed(struct priv *priv, uint64_t offloads) -{ - uint64_t port_offloads = priv->dev->data->dev_conf.txmode.offloads; - uint64_t port_supp_offloads = mlx5_priv_get_tx_port_offloads(priv); - - /* There are no Tx offloads which are per queue. */ - if ((offloads & port_supp_offloads) != offloads) - return 0; - if ((port_offloads ^ offloads) & port_supp_offloads) - return 0; - return 1; -} - -/** * DPDK callback to configure a TX queue. * * @param dev @@ -160,7 +146,7 @@ priv_is_tx_queue_offloads_allowed(struct priv *priv, uint64_t offloads) * Thresholds parameters. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, @@ -170,64 +156,47 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct mlx5_txq_data *txq = (*priv->txqs)[idx]; struct mlx5_txq_ctrl *txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); - int ret = 0; - priv_lock(priv); - /* - * Don't verify port offloads for application which - * use the old API. - */ - if (!!(conf->txq_flags & ETH_TXQ_FLAGS_IGNORE) && - !priv_is_tx_queue_offloads_allowed(priv, conf->offloads)) { - ret = ENOTSUP; - ERROR("%p: Tx queue offloads 0x%" PRIx64 " don't match port " - "offloads 0x%" PRIx64 " or supported offloads 0x%" PRIx64, - (void *)dev, conf->offloads, - dev->data->dev_conf.txmode.offloads, - mlx5_priv_get_tx_port_offloads(priv)); - goto out; - } if (desc <= MLX5_TX_COMP_THRESH) { - WARN("%p: number of descriptors requested for TX queue %u" - " must be higher than MLX5_TX_COMP_THRESH, using" - " %u instead of %u", - (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc); + DRV_LOG(WARNING, + "port %u number of descriptors requested for Tx queue" + " %u must be higher than MLX5_TX_COMP_THRESH, using %u" + " instead of %u", + dev->data->port_id, idx, MLX5_TX_COMP_THRESH + 1, desc); desc = MLX5_TX_COMP_THRESH + 1; } if (!rte_is_power_of_2(desc)) { desc = 1 << log2above(desc); - WARN("%p: increased number of descriptors in TX queue %u" - " to the next power of two (%d)", - (void *)dev, idx, desc); + DRV_LOG(WARNING, + "port %u increased number of descriptors in Tx queue" + " %u to the next power of two (%d)", + dev->data->port_id, idx, desc); } - DEBUG("%p: configuring queue %u for %u descriptors", - (void *)dev, idx, desc); + DRV_LOG(DEBUG, "port %u configuring queue %u for %u descriptors", + dev->data->port_id, idx, desc); if (idx >= priv->txqs_n) { - ERROR("%p: queue index out of range (%u >= %u)", - (void *)dev, idx, priv->txqs_n); - priv_unlock(priv); - return -EOVERFLOW; + DRV_LOG(ERR, "port %u Tx queue index out of range (%u >= %u)", + dev->data->port_id, idx, priv->txqs_n); + rte_errno = EOVERFLOW; + return -rte_errno; } - if (!mlx5_priv_txq_releasable(priv, idx)) { - ret = EBUSY; - ERROR("%p: unable to release queue index %u", - (void *)dev, idx); - goto out; + if (!mlx5_txq_releasable(dev, idx)) { + rte_errno = EBUSY; + DRV_LOG(ERR, "port %u unable to release queue index %u", + dev->data->port_id, idx); + return -rte_errno; } - mlx5_priv_txq_release(priv, idx); - txq_ctrl = mlx5_priv_txq_new(priv, idx, desc, socket, conf); + mlx5_txq_release(dev, idx); + txq_ctrl = mlx5_txq_new(dev, idx, desc, socket, conf); if (!txq_ctrl) { - ERROR("%p: unable to allocate queue index %u", - (void *)dev, idx); - ret = ENOMEM; - goto out; + DRV_LOG(ERR, "port %u unable to allocate queue index %u", + dev->data->port_id, idx); + return -rte_errno; } - DEBUG("%p: adding TX queue %p to list", - (void *)dev, (void *)txq_ctrl); + DRV_LOG(DEBUG, "port %u adding Tx queue %u to list", + dev->data->port_id, idx); (*priv->txqs)[idx] = &txq_ctrl->txq; -out: - priv_unlock(priv); - return -ret; + return 0; } /** @@ -248,15 +217,13 @@ mlx5_tx_queue_release(void *dpdk_txq) return; txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); priv = txq_ctrl->priv; - priv_lock(priv); for (i = 0; (i != priv->txqs_n); ++i) if ((*priv->txqs)[i] == txq) { - DEBUG("%p: removing TX queue %p from list", - (void *)priv->dev, (void *)txq_ctrl); - mlx5_priv_txq_release(priv, i); + mlx5_txq_release(ETH_DEV(priv), i); + DRV_LOG(DEBUG, "port %u removing Tx queue %u from list", + PORT_ID(priv), txq_ctrl->idx); break; } - priv_unlock(priv); } @@ -265,17 +232,18 @@ mlx5_tx_queue_release(void *dpdk_txq) * Both primary and secondary process do mmap to make UAR address * aligned. * - * @param[in] priv - * Pointer to private structure. + * @param[in] dev + * Pointer to Ethernet device. * @param fd * Verbs file descriptor to map UAR pages. * * @return - * 0 on success, errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -priv_tx_uar_remap(struct priv *priv, int fd) +mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd) { + struct priv *priv = dev->data->dev_private; unsigned int i, j; uintptr_t pages[priv->txqs_n]; unsigned int pages_n = 0; @@ -287,7 +255,9 @@ priv_tx_uar_remap(struct priv *priv, int fd) struct mlx5_txq_ctrl *txq_ctrl; int already_mapped; size_t page_size = sysconf(_SC_PAGESIZE); - int r; +#ifndef RTE_ARCH_64 + unsigned int lock_idx; +#endif memset(pages, 0, priv->txqs_n * sizeof(uintptr_t)); /* @@ -300,6 +270,7 @@ priv_tx_uar_remap(struct priv *priv, int fd) continue; txq = (*priv->txqs)[i]; txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); + assert(txq_ctrl->idx == (uint16_t)i); /* UAR addr form verbs used to find dup and offset in page. */ uar_va = (uintptr_t)txq_ctrl->bf_reg_orig; off = uar_va & (page_size - 1); /* offset in page. */ @@ -313,7 +284,7 @@ priv_tx_uar_remap(struct priv *priv, int fd) } /* new address in reserved UAR address space. */ addr = RTE_PTR_ADD(priv->uar_base, - uar_va & (MLX5_UAR_SIZE - 1)); + uar_va & (uintptr_t)(MLX5_UAR_SIZE - 1)); if (!already_mapped) { pages[pages_n++] = uar_va; /* fixed mmap to specified address in reserved @@ -324,10 +295,12 @@ priv_tx_uar_remap(struct priv *priv, int fd) txq_ctrl->uar_mmap_offset); if (ret != addr) { /* fixed mmap have to return same address */ - ERROR("call to mmap failed on UAR for txq %d\n", - i); - r = ENXIO; - return r; + DRV_LOG(ERR, + "port %u call to mmap failed on UAR" + " for txq %u", + dev->data->port_id, txq_ctrl->idx); + rte_errno = ENXIO; + return -rte_errno; } } if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once */ @@ -335,6 +308,12 @@ priv_tx_uar_remap(struct priv *priv, int fd) else assert(txq_ctrl->txq.bf_reg == RTE_PTR_ADD((void *)addr, off)); +#ifndef RTE_ARCH_64 + /* Assign a UAR lock according to UAR page number */ + lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & + MLX5_UAR_PAGE_NUM_MASK; + txq->uar_lock = &priv->uar_lock[lock_idx]; +#endif } return 0; } @@ -361,17 +340,18 @@ is_empw_burst_func(eth_tx_burst_t tx_pkt_burst) /** * Create the Tx queue Verbs object. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * Queue index in DPDK Rx queue array * * @return - * The Verbs object initialised if it can be created. + * The Verbs object initialised, NULL otherwise and rte_errno is set. */ -struct mlx5_txq_ibv* -mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) +struct mlx5_txq_ibv * +mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_data *txq_data = (*priv->txqs)[idx]; struct mlx5_txq_ctrl *txq_ctrl = container_of(txq_data, struct mlx5_txq_ctrl, txq); @@ -388,18 +368,20 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) struct mlx5dv_cq cq_info; struct mlx5dv_obj obj; const int desc = 1 << txq_data->elts_n; - eth_tx_burst_t tx_pkt_burst = priv_select_tx_function(priv, priv->dev); + eth_tx_burst_t tx_pkt_burst = mlx5_select_tx_function(dev); int ret = 0; assert(txq_data); priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_TX_QUEUE; priv->verbs_alloc_ctx.obj = txq_ctrl; if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) { - ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set"); - goto error; + DRV_LOG(ERR, + "port %u MLX5_ENABLE_CQE_COMPRESSION must never be set", + dev->data->port_id); + rte_errno = EINVAL; + return NULL; } memset(&tmpl, 0, sizeof(struct mlx5_txq_ibv)); - /* MRs will be registered in mp2mr[] later. */ attr.cq = (struct ibv_cq_init_attr_ex){ .comp_mask = 0, }; @@ -409,7 +391,9 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV; tmpl.cq = mlx5_glue->create_cq(priv->ctx, cqe_n, NULL, NULL, 0); if (tmpl.cq == NULL) { - ERROR("%p: CQ creation failure", (void *)txq_ctrl); + DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure", + dev->data->port_id, idx); + rte_errno = errno; goto error; } attr.init = (struct ibv_qp_init_attr_ex){ @@ -450,19 +434,24 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) } tmpl.qp = mlx5_glue->create_qp_ex(priv->ctx, &attr.init); if (tmpl.qp == NULL) { - ERROR("%p: QP creation failure", (void *)txq_ctrl); + DRV_LOG(ERR, "port %u Tx queue %u QP creation failure", + dev->data->port_id, idx); + rte_errno = errno; goto error; } attr.mod = (struct ibv_qp_attr){ /* Move the QP to this state. */ .qp_state = IBV_QPS_INIT, /* Primary port number. */ - .port_num = priv->port + .port_num = 1, }; ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, (IBV_QP_STATE | IBV_QP_PORT)); if (ret) { - ERROR("%p: QP state to IBV_QPS_INIT failed", (void *)txq_ctrl); + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_INIT failed", + dev->data->port_id, idx); + rte_errno = errno; goto error; } attr.mod = (struct ibv_qp_attr){ @@ -470,19 +459,27 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) }; ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE); if (ret) { - ERROR("%p: QP state to IBV_QPS_RTR failed", (void *)txq_ctrl); + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_RTR failed", + dev->data->port_id, idx); + rte_errno = errno; goto error; } attr.mod.qp_state = IBV_QPS_RTS; ret = mlx5_glue->modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE); if (ret) { - ERROR("%p: QP state to IBV_QPS_RTS failed", (void *)txq_ctrl); + DRV_LOG(ERR, + "port %u Tx queue %u QP state to IBV_QPS_RTS failed", + dev->data->port_id, idx); + rte_errno = errno; goto error; } txq_ibv = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_txq_ibv), 0, txq_ctrl->socket); if (!txq_ibv) { - ERROR("%p: cannot allocate memory", (void *)txq_ctrl); + DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory", + dev->data->port_id, idx); + rte_errno = ENOMEM; goto error; } obj.cq.in = tmpl.cq; @@ -490,11 +487,16 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) obj.qp.in = tmpl.qp; obj.qp.out = &qp; ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP); - if (ret != 0) + if (ret != 0) { + rte_errno = errno; goto error; + } if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) { - ERROR("Wrong MLX5_CQE_SIZE environment variable value: " - "it should be set to %u", RTE_CACHE_LINE_SIZE); + DRV_LOG(ERR, + "port %u wrong MLX5_CQE_SIZE environment variable" + " value: it should be set to %u", + dev->data->port_id, RTE_CACHE_LINE_SIZE); + rte_errno = EINVAL; goto error; } txq_data->cqe_n = log2above(cq_info.cqe_cnt); @@ -518,38 +520,46 @@ mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx) rte_atomic32_inc(&txq_ibv->refcnt); if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) { txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset; + DRV_LOG(DEBUG, "port %u: uar_mmap_offset 0x%lx", + dev->data->port_id, txq_ctrl->uar_mmap_offset); } else { - ERROR("Failed to retrieve UAR info, invalid libmlx5.so version"); + DRV_LOG(ERR, + "port %u failed to retrieve UAR info, invalid" + " libmlx5.so", + dev->data->port_id); + rte_errno = EINVAL; goto error; } - DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv, - (void *)txq_ibv, rte_atomic32_read(&txq_ibv->refcnt)); LIST_INSERT_HEAD(&priv->txqsibv, txq_ibv, next); + txq_ibv->txq_ctrl = txq_ctrl; priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; return txq_ibv; error: + ret = rte_errno; /* Save rte_errno before cleanup. */ if (tmpl.cq) claim_zero(mlx5_glue->destroy_cq(tmpl.cq)); if (tmpl.qp) claim_zero(mlx5_glue->destroy_qp(tmpl.qp)); priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; + rte_errno = ret; /* Restore rte_errno. */ return NULL; } /** * Get an Tx queue Verbs object. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * Queue index in DPDK Rx queue array * * @return * The Verbs object if it exists. */ -struct mlx5_txq_ibv* -mlx5_priv_txq_ibv_get(struct priv *priv, uint16_t idx) +struct mlx5_txq_ibv * +mlx5_txq_ibv_get(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *txq_ctrl; if (idx >= priv->txqs_n) @@ -557,33 +567,24 @@ mlx5_priv_txq_ibv_get(struct priv *priv, uint16_t idx) if (!(*priv->txqs)[idx]) return NULL; txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); - if (txq_ctrl->ibv) { + if (txq_ctrl->ibv) rte_atomic32_inc(&txq_ctrl->ibv->refcnt); - DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv, - (void *)txq_ctrl->ibv, - rte_atomic32_read(&txq_ctrl->ibv->refcnt)); - } return txq_ctrl->ibv; } /** * Release an Tx verbs queue object. * - * @param priv - * Pointer to private structure. * @param txq_ibv * Verbs Tx queue object. * * @return - * 0 on success, errno on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_txq_ibv_release(struct priv *priv, struct mlx5_txq_ibv *txq_ibv) +mlx5_txq_ibv_release(struct mlx5_txq_ibv *txq_ibv) { - (void)priv; assert(txq_ibv); - DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv, - (void *)txq_ibv, rte_atomic32_read(&txq_ibv->refcnt)); if (rte_atomic32_dec_and_test(&txq_ibv->refcnt)) { claim_zero(mlx5_glue->destroy_qp(txq_ibv->qp)); claim_zero(mlx5_glue->destroy_cq(txq_ibv->cq)); @@ -591,21 +592,18 @@ mlx5_priv_txq_ibv_release(struct priv *priv, struct mlx5_txq_ibv *txq_ibv) rte_free(txq_ibv); return 0; } - return EBUSY; + return 1; } /** * Return true if a single reference exists on the object. * - * @param priv - * Pointer to private structure. * @param txq_ibv * Verbs Tx queue object. */ int -mlx5_priv_txq_ibv_releasable(struct priv *priv, struct mlx5_txq_ibv *txq_ibv) +mlx5_txq_ibv_releasable(struct mlx5_txq_ibv *txq_ibv) { - (void)priv; assert(txq_ibv); return (rte_atomic32_read(&txq_ibv->refcnt) == 1); } @@ -613,20 +611,22 @@ mlx5_priv_txq_ibv_releasable(struct priv *priv, struct mlx5_txq_ibv *txq_ibv) /** * Verify the Verbs Tx queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_txq_ibv_verify(struct priv *priv) +mlx5_txq_ibv_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; int ret = 0; struct mlx5_txq_ibv *txq_ibv; LIST_FOREACH(txq_ibv, &priv->txqsibv, next) { - DEBUG("%p: Verbs Tx queue %p still referenced", (void *)priv, - (void *)txq_ibv); + DRV_LOG(DEBUG, "port %u Verbs Tx queue %u still referenced", + dev->data->port_id, txq_ibv->txq_ctrl->idx); ++ret; } return ret; @@ -649,9 +649,14 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) unsigned int txq_inline; unsigned int txqs_inline; unsigned int inline_max_packet_sz; - eth_tx_burst_t tx_pkt_burst = priv_select_tx_function(priv, priv->dev); + eth_tx_burst_t tx_pkt_burst = + mlx5_select_tx_function(ETH_DEV(priv)); int is_empw_func = is_empw_burst_func(tx_pkt_burst); - int tso = !!(txq_ctrl->txq.offloads & DEV_TX_OFFLOAD_TCP_TSO); + int tso = !!(txq_ctrl->txq.offloads & (DEV_TX_OFFLOAD_TCP_TSO | + DEV_TX_OFFLOAD_VXLAN_TNL_TSO | + DEV_TX_OFFLOAD_GRE_TNL_TSO | + DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO)); txq_inline = (config->txq_inline == MLX5_ARG_UNSET) ? 0 : config->txq_inline; @@ -685,18 +690,6 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) inline_max_packet_sz) + (RTE_CACHE_LINE_SIZE - 1)) / RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE; - } else if (tso) { - int inline_diff = txq_ctrl->txq.max_inline - - max_tso_inline; - - /* - * Adjust inline value as Verbs aggregates - * tso_inline and txq_inline fields. - */ - txq_ctrl->max_inline_data = inline_diff > 0 ? - inline_diff * - RTE_CACHE_LINE_SIZE : - 0; } else { txq_ctrl->max_inline_data = txq_ctrl->txq.max_inline * RTE_CACHE_LINE_SIZE; @@ -716,9 +709,10 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) max_inline = max_inline - (max_inline % RTE_CACHE_LINE_SIZE); - WARN("txq inline is too large (%d) setting it to " - "the maximum possible: %d\n", - txq_inline, max_inline); + DRV_LOG(WARNING, + "port %u txq inline is too large (%d) setting" + " it to the maximum possible: %d\n", + PORT_ID(priv), txq_inline, max_inline); txq_ctrl->txq.max_inline = max_inline / RTE_CACHE_LINE_SIZE; } @@ -729,14 +723,18 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) max_tso_inline); txq_ctrl->txq.tso_en = 1; } - txq_ctrl->txq.tunnel_en = config->tunnel_en; + txq_ctrl->txq.tunnel_en = config->tunnel_en | config->swp; + txq_ctrl->txq.swp_en = ((DEV_TX_OFFLOAD_IP_TNL_TSO | + DEV_TX_OFFLOAD_UDP_TNL_TSO | + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM) & + txq_ctrl->txq.offloads) && config->swp; } /** * Create a DPDK Tx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * @param desc @@ -747,76 +745,75 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) * Thresholds parameters. * * @return - * A DPDK queue object on success. + * A DPDK queue object on success, NULL otherwise and rte_errno is set. */ -struct mlx5_txq_ctrl* -mlx5_priv_txq_new(struct priv *priv, uint16_t idx, uint16_t desc, - unsigned int socket, - const struct rte_eth_txconf *conf) +struct mlx5_txq_ctrl * +mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *tmpl; tmpl = rte_calloc_socket("TXQ", 1, sizeof(*tmpl) + desc * sizeof(struct rte_mbuf *), 0, socket); - if (!tmpl) + if (!tmpl) { + rte_errno = ENOMEM; return NULL; + } + if (mlx5_mr_btree_init(&tmpl->txq.mr_ctrl.cache_bh, + MLX5_MR_BTREE_CACHE_N, socket)) { + /* rte_errno is already set. */ + goto error; + } + /* Save pointer of global generation number to check memory event. */ + tmpl->txq.mr_ctrl.dev_gen_ptr = &priv->mr.dev_gen; assert(desc > MLX5_TX_COMP_THRESH); - tmpl->txq.offloads = conf->offloads; + tmpl->txq.offloads = conf->offloads | + dev->data->dev_conf.txmode.offloads; tmpl->priv = priv; tmpl->socket = socket; tmpl->txq.elts_n = log2above(desc); + tmpl->idx = idx; txq_set_params(tmpl); - /* MRs will be registered in mp2mr[] later. */ - DEBUG("priv->device_attr.max_qp_wr is %d", - priv->device_attr.orig_attr.max_qp_wr); - DEBUG("priv->device_attr.max_sge is %d", - priv->device_attr.orig_attr.max_sge); + DRV_LOG(DEBUG, "port %u priv->device_attr.max_qp_wr is %d", + dev->data->port_id, priv->device_attr.orig_attr.max_qp_wr); + DRV_LOG(DEBUG, "port %u priv->device_attr.max_sge is %d", + dev->data->port_id, priv->device_attr.orig_attr.max_sge); tmpl->txq.elts = (struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])(tmpl + 1); tmpl->txq.stats.idx = idx; rte_atomic32_inc(&tmpl->refcnt); - DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv, - (void *)tmpl, rte_atomic32_read(&tmpl->refcnt)); LIST_INSERT_HEAD(&priv->txqsctrl, tmpl, next); return tmpl; +error: + rte_free(tmpl); + return NULL; } /** * Get a Tx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return * A pointer to the queue if it exists. */ -struct mlx5_txq_ctrl* -mlx5_priv_txq_get(struct priv *priv, uint16_t idx) +struct mlx5_txq_ctrl * +mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *ctrl = NULL; if ((*priv->txqs)[idx]) { ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); - unsigned int i; - - mlx5_priv_txq_ibv_get(priv, idx); - for (i = 0; i != MLX5_PMD_TX_MP_CACHE; ++i) { - struct mlx5_mr *mr = NULL; - - (void)mr; - if (ctrl->txq.mp2mr[i]) { - mr = priv_mr_get(priv, ctrl->txq.mp2mr[i]->mp); - assert(mr); - } - } + mlx5_txq_ibv_get(dev, idx); rte_atomic32_inc(&ctrl->refcnt); - DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv, - (void *)ctrl, rte_atomic32_read(&ctrl->refcnt)); } return ctrl; } @@ -824,57 +821,45 @@ mlx5_priv_txq_get(struct priv *priv, uint16_t idx) /** * Release a Tx queue. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * * @return - * 0 on success, errno on failure. + * 1 while a reference on it exists, 0 when freed. */ int -mlx5_priv_txq_release(struct priv *priv, uint16_t idx) +mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx) { - unsigned int i; + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *txq; size_t page_size = sysconf(_SC_PAGESIZE); if (!(*priv->txqs)[idx]) return 0; txq = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq); - DEBUG("%p: Tx queue %p: refcnt %d", (void *)priv, - (void *)txq, rte_atomic32_read(&txq->refcnt)); - if (txq->ibv) { - int ret; - - ret = mlx5_priv_txq_ibv_release(priv, txq->ibv); - if (!ret) - txq->ibv = NULL; - } - for (i = 0; i != MLX5_PMD_TX_MP_CACHE; ++i) { - if (txq->txq.mp2mr[i]) { - priv_mr_release(priv, txq->txq.mp2mr[i]); - txq->txq.mp2mr[i] = NULL; - } - } + if (txq->ibv && !mlx5_txq_ibv_release(txq->ibv)) + txq->ibv = NULL; if (priv->uar_base) munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg, page_size), page_size); if (rte_atomic32_dec_and_test(&txq->refcnt)) { txq_free_elts(txq); + mlx5_mr_btree_free(&txq->txq.mr_ctrl.cache_bh); LIST_REMOVE(txq, next); rte_free(txq); (*priv->txqs)[idx] = NULL; return 0; } - return EBUSY; + return 1; } /** * Verify if the queue can be released. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * @param idx * TX queue index. * @@ -882,8 +867,9 @@ mlx5_priv_txq_release(struct priv *priv, uint16_t idx) * 1 if the queue can be released. */ int -mlx5_priv_txq_releasable(struct priv *priv, uint16_t idx) +mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *txq; if (!(*priv->txqs)[idx]) @@ -895,20 +881,22 @@ mlx5_priv_txq_releasable(struct priv *priv, uint16_t idx) /** * Verify the Tx Queue list is empty * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device. * - * @return the number of object not released. + * @return + * The number of object not released. */ int -mlx5_priv_txq_verify(struct priv *priv) +mlx5_txq_verify(struct rte_eth_dev *dev) { + struct priv *priv = dev->data->dev_private; struct mlx5_txq_ctrl *txq; int ret = 0; LIST_FOREACH(txq, &priv->txqsctrl, next) { - DEBUG("%p: Tx Queue %p still referenced", (void *)priv, - (void *)txq); + DRV_LOG(DEBUG, "port %u Tx queue %u still referenced", + dev->data->port_id, txq->idx); ++ret; } return ret; diff --git a/drivers/net/mlx5/mlx5_utils.h b/drivers/net/mlx5/mlx5_utils.h index e1bfb9cd..886f60e6 100644 --- a/drivers/net/mlx5/mlx5_utils.h +++ b/drivers/net/mlx5/mlx5_utils.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #ifndef RTE_PMD_MLX5_UTILS_H_ @@ -61,14 +61,21 @@ pmd_drv_log_basename(const char *s) return s; } +extern int mlx5_logtype; + +#define PMD_DRV_LOG___(level, ...) \ + rte_log(RTE_LOG_ ## level, \ + mlx5_logtype, \ + RTE_FMT(MLX5_DRIVER_NAME ": " \ + RTE_FMT_HEAD(__VA_ARGS__,), \ + RTE_FMT_TAIL(__VA_ARGS__,))) + /* * When debugging is enabled (NDEBUG not defined), file, line and function * information replace the driver name (MLX5_DRIVER_NAME) in log messages. */ #ifndef NDEBUG -#define PMD_DRV_LOG___(level, ...) \ - ERRNO_SAFE(RTE_LOG(level, PMD, __VA_ARGS__)) #define PMD_DRV_LOG__(level, ...) \ PMD_DRV_LOG___(level, "%s:%u: %s(): " __VA_ARGS__) #define PMD_DRV_LOG_(level, s, ...) \ @@ -80,9 +87,6 @@ pmd_drv_log_basename(const char *s) __VA_ARGS__) #else /* NDEBUG */ - -#define PMD_DRV_LOG___(level, ...) \ - ERRNO_SAFE(RTE_LOG(level, PMD, MLX5_DRIVER_NAME ": " __VA_ARGS__)) #define PMD_DRV_LOG__(level, ...) \ PMD_DRV_LOG___(level, __VA_ARGS__) #define PMD_DRV_LOG_(level, s, ...) \ @@ -91,18 +95,15 @@ pmd_drv_log_basename(const char *s) #endif /* NDEBUG */ /* Generic printf()-like logging macro with automatic line feed. */ -#define PMD_DRV_LOG(level, ...) \ +#define DRV_LOG(level, ...) \ PMD_DRV_LOG_(level, \ __VA_ARGS__ PMD_DRV_LOG_STRIP PMD_DRV_LOG_OPAREN, \ PMD_DRV_LOG_CPAREN) -/* - * Like assert(), DEBUG() becomes a no-op and claim_zero() does not perform - * any check when debugging is disabled. - */ +/* claim_zero() does not perform any check when debugging is disabled. */ #ifndef NDEBUG -#define DEBUG(...) PMD_DRV_LOG(DEBUG, __VA_ARGS__) +#define DEBUG(...) DRV_LOG(DEBUG, __VA_ARGS__) #define claim_zero(...) assert((__VA_ARGS__) == 0) #define claim_nonzero(...) assert((__VA_ARGS__) != 0) @@ -114,9 +115,9 @@ pmd_drv_log_basename(const char *s) #endif /* NDEBUG */ -#define INFO(...) PMD_DRV_LOG(INFO, __VA_ARGS__) -#define WARN(...) PMD_DRV_LOG(WARNING, __VA_ARGS__) -#define ERROR(...) PMD_DRV_LOG(ERR, __VA_ARGS__) +#define INFO(...) DRV_LOG(INFO, __VA_ARGS__) +#define WARN(...) DRV_LOG(WARNING, __VA_ARGS__) +#define ERROR(...) DRV_LOG(ERR, __VA_ARGS__) /* Convenience macros for accessing mbuf fields. */ #define NEXT(m) ((m)->next) diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c index 75c34562..c91d08be 100644 --- a/drivers/net/mlx5/mlx5_vlan.c +++ b/drivers/net/mlx5/mlx5_vlan.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2015 6WIND S.A. - * Copyright 2015 Mellanox. + * Copyright 2015 Mellanox Technologies, Ltd */ #include <stddef.h> @@ -8,6 +8,12 @@ #include <assert.h> #include <stdint.h> +/* + * Not needed by this file; included to work around the lack of off_t + * definition for mlx5dv.h with unpatched rdma-core versions. + */ +#include <sys/types.h> + /* Verbs headers do not support -pedantic. */ #ifdef PEDANTIC #pragma GCC diagnostic ignored "-Wpedantic" @@ -37,26 +43,24 @@ * Toggle filter. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) { struct priv *priv = dev->data->dev_private; unsigned int i; - int ret = 0; - priv_lock(priv); - DEBUG("%p: %s VLAN filter ID %" PRIu16, - (void *)dev, (on ? "enable" : "disable"), vlan_id); + DRV_LOG(DEBUG, "port %u %s VLAN filter ID %" PRIu16, + dev->data->port_id, (on ? "enable" : "disable"), vlan_id); assert(priv->vlan_filter_n <= RTE_DIM(priv->vlan_filter)); for (i = 0; (i != priv->vlan_filter_n); ++i) if (priv->vlan_filter[i] == vlan_id) break; /* Check if there's room for another VLAN filter. */ if (i == RTE_DIM(priv->vlan_filter)) { - ret = -ENOMEM; - goto out; + rte_errno = ENOMEM; + return -rte_errno; } if (i < priv->vlan_filter_n) { assert(priv->vlan_filter_n != 0); @@ -79,37 +83,49 @@ mlx5_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) priv->vlan_filter[priv->vlan_filter_n] = vlan_id; ++priv->vlan_filter_n; } - if (dev->data->dev_started) - priv_dev_traffic_restart(priv, dev); out: - priv_unlock(priv); - return ret; + if (dev->data->dev_started) + return mlx5_traffic_restart(dev); + return 0; } /** - * Set/reset VLAN stripping for a specific queue. + * Callback to set/reset VLAN stripping for a specific queue. * - * @param priv - * Pointer to private structure. - * @param idx + * @param dev + * Pointer to Ethernet device structure. + * @param queue * RX queue index. * @param on * Enable/disable VLAN stripping. */ -static void -priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on) +void +mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on) { - struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx]; + struct priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq = (*priv->rxqs)[queue]; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); struct ibv_wq_attr mod; uint16_t vlan_offloads = (on ? IBV_WQ_FLAGS_CVLAN_STRIPPING : 0) | 0; - int err; + int ret; - DEBUG("set VLAN offloads 0x%x for port %d queue %d", - vlan_offloads, rxq->port_id, idx); + /* Validate hw support */ + if (!priv->config.hw_vlan_strip) { + DRV_LOG(ERR, "port %u VLAN stripping is not supported", + dev->data->port_id); + return; + } + /* Validate queue number */ + if (queue >= priv->rxqs_n) { + DRV_LOG(ERR, "port %u VLAN stripping, invalid queue number %d", + dev->data->port_id, queue); + return; + } + DRV_LOG(DEBUG, "port %u set VLAN offloads 0x%x for port %uqueue %d", + dev->data->port_id, vlan_offloads, rxq->port_id, queue); if (!rxq_ctrl->ibv) { /* Update related bits in RX queue. */ rxq->vlan_strip = !!on; @@ -120,57 +136,26 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on) .flags_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING, .flags = vlan_offloads, }; - - err = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod); - if (err) { - ERROR("%p: failed to modified stripping mode: %s", - (void *)priv, strerror(err)); + ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod); + if (ret) { + DRV_LOG(ERR, "port %u failed to modified stripping mode: %s", + dev->data->port_id, strerror(rte_errno)); return; } - /* Update related bits in RX queue. */ rxq->vlan_strip = !!on; } /** - * Callback to set/reset VLAN stripping for a specific queue. - * - * @param dev - * Pointer to Ethernet device structure. - * @param queue - * RX queue index. - * @param on - * Enable/disable VLAN stripping. - */ -void -mlx5_vlan_strip_queue_set(struct rte_eth_dev *dev, uint16_t queue, int on) -{ - struct priv *priv = dev->data->dev_private; - - /* Validate hw support */ - if (!priv->config.hw_vlan_strip) { - ERROR("VLAN stripping is not supported"); - return; - } - - /* Validate queue number */ - if (queue >= priv->rxqs_n) { - ERROR("VLAN stripping, invalid queue number %d", queue); - return; - } - - priv_lock(priv); - priv_vlan_strip_queue_set(priv, queue, on); - priv_unlock(priv); -} - -/** * Callback to set/reset VLAN offloads for a port. * * @param dev * Pointer to Ethernet device structure. * @param mask * VLAN offload bit mask. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask) @@ -183,16 +168,13 @@ mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask) DEV_RX_OFFLOAD_VLAN_STRIP); if (!priv->config.hw_vlan_strip) { - ERROR("VLAN stripping is not supported"); + DRV_LOG(ERR, "port %u VLAN stripping is not supported", + dev->data->port_id); return 0; } - /* Run on every RX queue and set/reset VLAN stripping. */ - priv_lock(priv); for (i = 0; (i != priv->rxqs_n); i++) - priv_vlan_strip_queue_set(priv, i, hw_vlan_strip); - priv_unlock(priv); + mlx5_vlan_strip_queue_set(dev, i, hw_vlan_strip); } - return 0; } |