diff options
Diffstat (limited to 'drivers/net/mlx5')
-rw-r--r-- | drivers/net/mlx5/Makefile | 65 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5.c | 200 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5.h | 14 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_defs.h | 26 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_ethdev.c | 249 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_fdir.c | 20 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_mac.c | 1 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_mr.c | 283 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_prm.h | 163 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_rxmode.c | 16 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_rxq.c | 786 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_rxtx.c | 2200 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_rxtx.h | 188 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_txq.c | 370 | ||||
-rw-r--r-- | drivers/net/mlx5/mlx5_vlan.c | 8 |
15 files changed, 2722 insertions, 1867 deletions
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index 92bfa070..f6d39388 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -47,18 +47,22 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_fdir.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c # Dependencies. DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_ether DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_mbuf DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_eal DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_mempool +DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_kvargs # Basic CFLAGS. CFLAGS += -O3 CFLAGS += -std=gnu99 -Wall -Wextra CFLAGS += -g CFLAGS += -I. +CFLAGS += -D_BSD_SOURCE +CFLAGS += -D_DEFAULT_SOURCE CFLAGS += -D_XOPEN_SOURCE=600 CFLAGS += $(WERROR_FLAGS) CFLAGS += -Wno-strict-prototypes @@ -83,14 +87,6 @@ else CFLAGS += -DNDEBUG -UPEDANTIC endif -ifdef CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N -CFLAGS += -DMLX5_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N) -endif - -ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE -CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE) -endif - ifdef CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE CFLAGS += -DMLX5_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE) endif @@ -106,50 +102,31 @@ ifndef V AUTOCONF_OUTPUT := >/dev/null endif -mlx5_autoconf.h: $(RTE_SDK)/scripts/auto-config-h.sh +mlx5_autoconf.h.new: FORCE + +mlx5_autoconf.h.new: $(RTE_SDK)/scripts/auto-config-h.sh $Q $(RM) -f -- '$@' $Q sh -- '$<' '$@' \ - HAVE_EXP_QUERY_DEVICE \ - infiniband/verbs.h \ - type 'struct ibv_exp_device_attr' $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - HAVE_FLOW_SPEC_IPV6 \ - infiniband/verbs.h \ - type 'struct ibv_exp_flow_spec_ipv6' $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - HAVE_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \ - infiniband/verbs.h \ - enum IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \ - $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS \ - infiniband/verbs.h \ - enum IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS \ + HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \ + infiniband/verbs_exp.h \ + enum IBV_EXP_CQ_COMPRESSED_CQE \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ - HAVE_EXP_CQ_RX_TCP_PACKET \ - infiniband/verbs.h \ - enum IBV_EXP_CQ_RX_TCP_PACKET \ - $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - HAVE_VERBS_FCS \ - infiniband/verbs.h \ - enum IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS \ - $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - HAVE_VERBS_RX_END_PADDING \ - infiniband/verbs.h \ - enum IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING \ - $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - HAVE_VERBS_VLAN_INSERTION \ - infiniband/verbs.h \ - enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \ + HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE \ + infiniband/mlx5_hw.h \ + enum MLX5_ETH_VLAN_INLINE_HEADER_SIZE \ $(AUTOCONF_OUTPUT) +# Create mlx5_autoconf.h or update it in case it differs from the new one. + +mlx5_autoconf.h: mlx5_autoconf.h.new + $Q [ -f '$@' ] && \ + cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \ + mv '$<' '$@' + $(SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD):.c=.o): mlx5_autoconf.h clean_mlx5: FORCE - $Q rm -f -- mlx5_autoconf.h + $Q rm -f -- mlx5_autoconf.h mlx5_autoconf.h.new clean: clean_mlx5 diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 041cfc33..5aa4adc6 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -37,6 +37,7 @@ #include <assert.h> #include <stdint.h> #include <stdlib.h> +#include <errno.h> #include <net/if.h> /* Verbs header. */ @@ -57,6 +58,7 @@ #include <rte_ethdev.h> #include <rte_pci.h> #include <rte_common.h> +#include <rte_kvargs.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -67,6 +69,21 @@ #include "mlx5_autoconf.h" #include "mlx5_defs.h" +/* Device parameter to enable RX completion queue compression. */ +#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" + +/* Device parameter to configure inline send. */ +#define MLX5_TXQ_INLINE "txq_inline" + +/* + * Device parameter to configure the number of TX queues threshold for + * enabling inline send. + */ +#define MLX5_TXQS_MIN_INLINE "txqs_min_inline" + +/* Device parameter to enable multi-packet send WQEs. */ +#define MLX5_TXQ_MPW_EN "txq_mpw_en" + /** * Retrieve integer value from environment variable. * @@ -98,7 +115,6 @@ static void mlx5_dev_close(struct rte_eth_dev *dev) { struct priv *priv = mlx5_get_priv(dev); - void *tmp; unsigned int i; priv_lock(priv); @@ -122,12 +138,15 @@ mlx5_dev_close(struct rte_eth_dev *dev) /* XXX race condition if mlx5_rx_burst() is still running. */ usleep(1000); for (i = 0; (i != priv->rxqs_n); ++i) { - tmp = (*priv->rxqs)[i]; - if (tmp == NULL) + struct rxq *rxq = (*priv->rxqs)[i]; + struct rxq_ctrl *rxq_ctrl; + + if (rxq == NULL) continue; + rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); (*priv->rxqs)[i] = NULL; - rxq_cleanup(tmp); - rte_free(tmp); + rxq_cleanup(rxq_ctrl); + rte_free(rxq_ctrl); } priv->rxqs_n = 0; priv->rxqs = NULL; @@ -136,12 +155,15 @@ mlx5_dev_close(struct rte_eth_dev *dev) /* XXX race condition if mlx5_tx_burst() is still running. */ usleep(1000); for (i = 0; (i != priv->txqs_n); ++i) { - tmp = (*priv->txqs)[i]; - if (tmp == NULL) + struct txq *txq = (*priv->txqs)[i]; + struct txq_ctrl *txq_ctrl; + + if (txq == NULL) continue; + txq_ctrl = container_of(txq, struct txq_ctrl, txq); (*priv->txqs)[i] = NULL; - txq_cleanup(tmp); - rte_free(tmp); + txq_cleanup(txq_ctrl); + rte_free(txq_ctrl); } priv->txqs_n = 0; priv->txqs = NULL; @@ -190,17 +212,13 @@ static const struct eth_dev_ops mlx5_dev_ops = { .mac_addr_add = mlx5_mac_addr_add, .mac_addr_set = mlx5_mac_addr_set, .mtu_set = mlx5_dev_set_mtu, -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, .vlan_offload_set = mlx5_vlan_offload_set, -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ .reta_update = mlx5_dev_rss_reta_update, .reta_query = mlx5_dev_rss_reta_query, .rss_hash_update = mlx5_rss_hash_update, .rss_hash_conf_get = mlx5_rss_hash_conf_get, -#ifdef MLX5_FDIR_SUPPORT .filter_ctrl = mlx5_dev_filter_ctrl, -#endif /* MLX5_FDIR_SUPPORT */ }; static struct { @@ -236,6 +254,90 @@ mlx5_dev_idx(struct rte_pci_addr *pci_addr) return ret; } +/** + * Verify and store value for device argument. + * + * @param[in] key + * Key argument to verify. + * @param[in] val + * Value associated with key. + * @param opaque + * User data. + * + * @return + * 0 on success, negative errno value on failure. + */ +static int +mlx5_args_check(const char *key, const char *val, void *opaque) +{ + struct priv *priv = opaque; + unsigned long tmp; + + errno = 0; + tmp = strtoul(val, NULL, 0); + if (errno) { + WARN("%s: \"%s\" is not a valid integer", key, val); + return errno; + } + if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { + priv->cqe_comp = !!tmp; + } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) { + priv->txq_inline = tmp; + } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { + priv->txqs_inline = tmp; + } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { + priv->mps = !!tmp; + } else { + WARN("%s: unknown parameter", key); + return -EINVAL; + } + return 0; +} + +/** + * Parse device parameters. + * + * @param priv + * Pointer to private structure. + * @param devargs + * Device arguments structure. + * + * @return + * 0 on success, errno value on failure. + */ +static int +mlx5_args(struct priv *priv, struct rte_devargs *devargs) +{ + const char **params = (const char *[]){ + MLX5_RXQ_CQE_COMP_EN, + MLX5_TXQ_INLINE, + MLX5_TXQS_MIN_INLINE, + MLX5_TXQ_MPW_EN, + NULL, + }; + struct rte_kvargs *kvlist; + int ret = 0; + int i; + + if (devargs == NULL) + return 0; + /* Following UGLY cast is done to pass checkpatch. */ + kvlist = rte_kvargs_parse(devargs->args, params); + if (kvlist == NULL) + return 0; + /* Process parameters. */ + for (i = 0; (params[i] != NULL); ++i) { + if (rte_kvargs_count(kvlist, params[i])) { + ret = rte_kvargs_process(kvlist, params[i], + mlx5_args_check, priv); + if (ret != 0) + return ret; + } + } + rte_kvargs_free(kvlist); + return 0; +} + static struct eth_driver mlx5_driver; /** @@ -260,7 +362,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) int err = 0; struct ibv_context *attr_ctx = NULL; struct ibv_device_attr device_attr; - unsigned int vf; + unsigned int sriov; unsigned int mps; int idx; int i; @@ -303,17 +405,17 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) (pci_dev->addr.devid != pci_addr.devid) || (pci_dev->addr.function != pci_addr.function)) continue; - vf = ((pci_dev->id.device_id == + sriov = ((pci_dev->id.device_id == PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || (pci_dev->id.device_id == PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)); /* Multi-packet send is only supported by ConnectX-4 Lx PF. */ mps = (pci_dev->id.device_id == PCI_DEVICE_ID_MELLANOX_CONNECTX4LX); - INFO("PCI information matches, using device \"%s\" (VF: %s," - " MPS: %s)", + INFO("PCI information matches, using device \"%s\"" + " (SR-IOV: %s, MPS: %s)", list[i]->name, - vf ? "true" : "false", + sriov ? "true" : "false", mps ? "true" : "false"); attr_ctx = ibv_open_device(list[i]); err = errno; @@ -347,23 +449,16 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) struct ibv_pd *pd = NULL; struct priv *priv = NULL; struct rte_eth_dev *eth_dev; -#ifdef HAVE_EXP_QUERY_DEVICE struct ibv_exp_device_attr exp_device_attr; -#endif /* HAVE_EXP_QUERY_DEVICE */ struct ether_addr mac; + uint16_t num_vfs = 0; -#ifdef HAVE_EXP_QUERY_DEVICE exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS | IBV_EXP_DEVICE_ATTR_RX_HASH | -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS | -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ -#ifdef HAVE_VERBS_RX_END_PADDING IBV_EXP_DEVICE_ATTR_RX_PAD_END_ALIGN | -#endif /* HAVE_VERBS_RX_END_PADDING */ 0; -#endif /* HAVE_EXP_QUERY_DEVICE */ DEBUG("using port %u (%08" PRIx32 ")", port, test); @@ -414,7 +509,14 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) priv->port = port; priv->pd = pd; priv->mtu = ETHER_MTU; -#ifdef HAVE_EXP_QUERY_DEVICE + priv->mps = mps; /* Enable MPW by default if supported. */ + priv->cqe_comp = 1; /* Enable compression by default. */ + err = mlx5_args(priv, pci_dev->devargs); + if (err) { + ERROR("failed to process device arguments: %s", + strerror(err)); + goto port_error; + } if (ibv_exp_query_device(ctx, &exp_device_attr)) { ERROR("ibv_exp_query_device() failed"); goto port_error; @@ -440,32 +542,28 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE; DEBUG("maximum RX indirection table size is %u", priv->ind_table_max_size); -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS priv->hw_vlan_strip = !!(exp_device_attr.wq_vlan_offloads_cap & IBV_EXP_RECEIVE_WQ_CVLAN_STRIP); -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ DEBUG("VLAN stripping is %ssupported", (priv->hw_vlan_strip ? "" : "not ")); -#ifdef HAVE_VERBS_FCS priv->hw_fcs_strip = !!(exp_device_attr.exp_device_cap_flags & IBV_EXP_DEVICE_SCATTER_FCS); -#endif /* HAVE_VERBS_FCS */ DEBUG("FCS stripping configuration is %ssupported", (priv->hw_fcs_strip ? "" : "not ")); -#ifdef HAVE_VERBS_RX_END_PADDING priv->hw_padding = !!exp_device_attr.rx_pad_end_addr_align; -#endif /* HAVE_VERBS_RX_END_PADDING */ DEBUG("hardware RX end alignment padding is %ssupported", (priv->hw_padding ? "" : "not ")); -#else /* HAVE_EXP_QUERY_DEVICE */ - priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE; -#endif /* HAVE_EXP_QUERY_DEVICE */ - - priv->vf = vf; - priv->mps = mps; + priv_get_num_vfs(priv, &num_vfs); + priv->sriov = (num_vfs || sriov); + if (priv->mps && !mps) { + ERROR("multi-packet send not supported on this device" + " (" MLX5_TXQ_MPW_EN ")"); + err = ENOTSUP; + goto port_error; + } /* Allocate and register default RSS hash keys. */ priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n, sizeof((*priv->rss_conf)[0]), 0); @@ -608,28 +706,20 @@ error: static const struct rte_pci_id mlx5_pci_id_map[] = { { - .vendor_id = PCI_VENDOR_ID_MELLANOX, - .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4, - .subsystem_vendor_id = PCI_ANY_ID, - .subsystem_device_id = PCI_ANY_ID + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4) }, { - .vendor_id = PCI_VENDOR_ID_MELLANOX, - .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4VF, - .subsystem_vendor_id = PCI_ANY_ID, - .subsystem_device_id = PCI_ANY_ID + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) }, { - .vendor_id = PCI_VENDOR_ID_MELLANOX, - .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4LX, - .subsystem_vendor_id = PCI_ANY_ID, - .subsystem_device_id = PCI_ANY_ID + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) }, { - .vendor_id = PCI_VENDOR_ID_MELLANOX, - .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF, - .subsystem_vendor_id = PCI_ANY_ID, - .subsystem_device_id = PCI_ANY_ID + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) }, { .vendor_id = 0 diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 24876625..3a866098 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -39,7 +39,6 @@ #include <limits.h> #include <net/if.h> #include <netinet/in.h> -#include <linux/if.h> /* Verbs header. */ /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ @@ -69,6 +68,11 @@ #include "mlx5_autoconf.h" #include "mlx5_defs.h" +#if !defined(HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE) || \ + !defined(HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE) +#error Mellanox OFED >= 3.3 is required, please refer to the documentation. +#endif + enum { PCI_VENDOR_ID_MELLANOX = 0x15b3, }; @@ -105,9 +109,12 @@ struct priv { unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */ unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */ unsigned int hw_padding:1; /* End alignment padding is supported. */ - unsigned int vf:1; /* This is a VF device. */ + unsigned int sriov:1; /* This is a VF or PF with VF devices. */ unsigned int mps:1; /* Whether multi-packet send is supported. */ + unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */ unsigned int pending_alarm:1; /* An alarm is pending. */ + unsigned int txq_inline; /* Maximum packet size for inlining. */ + unsigned int txqs_inline; /* Queue number threshold for inlining. */ /* RX/TX queues. */ unsigned int rxqs_n; /* RX queues array size. */ unsigned int txqs_n; /* TX queues array size. */ @@ -173,6 +180,7 @@ struct priv *mlx5_get_priv(struct rte_eth_dev *dev); int mlx5_is_secondary(void); int priv_get_ifname(const struct priv *, char (*)[IF_NAMESIZE]); int priv_ifreq(const struct priv *, int req, struct ifreq *); +int priv_get_num_vfs(struct priv *, uint16_t *); int priv_get_mtu(struct priv *, uint16_t *); int priv_set_flags(struct priv *, unsigned int, unsigned int); int mlx5_dev_configure(struct rte_eth_dev *); @@ -191,6 +199,8 @@ void priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *); int mlx5_set_link_down(struct rte_eth_dev *dev); int mlx5_set_link_up(struct rte_eth_dev *dev); struct priv *mlx5_secondary_data_setup(struct priv *priv); +void priv_select_tx_function(struct priv *); +void priv_select_rx_function(struct priv *); /* mlx5_mac.c */ diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index 09207d9c..cc2a6f3e 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -48,22 +48,15 @@ /* Maximum number of special flows. */ #define MLX5_MAX_SPECIAL_FLOWS 4 -/* Request send completion once in every 64 sends, might be less. */ -#define MLX5_PMD_TX_PER_COMP_REQ 64 +/* + * Request TX completion every time descriptors reach this threshold since + * the previous request. Must be a power of two for performance reasons. + */ +#define MLX5_TX_COMP_THRESH 32 /* RSS Indirection table size. */ #define RSS_INDIRECTION_TABLE_SIZE 256 -/* Maximum number of Scatter/Gather Elements per Work Request. */ -#ifndef MLX5_PMD_SGE_WR_N -#define MLX5_PMD_SGE_WR_N 4 -#endif - -/* Maximum size for inline data. */ -#ifndef MLX5_PMD_MAX_INLINE -#define MLX5_PMD_MAX_INLINE 0 -#endif - /* * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP * from which buffers are to be transmitted will have to be mapped by this @@ -86,13 +79,4 @@ /* Alarm timeout. */ #define MLX5_ALARM_TIMEOUT_US 100000 -/* - * Extended flow priorities necessary to support flow director are available - * since MLNX_OFED 3.2. Considering this version adds support for VLAN - * offloads as well, their availability means flow director can be used. - */ -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS -#define MLX5_FDIR_SUPPORT 1 -#endif - #endif /* RTE_PMD_MLX5_DEFS_H_ */ diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 36b369e7..0e7ed019 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -44,7 +44,6 @@ #include <sys/ioctl.h> #include <sys/socket.h> #include <netinet/in.h> -#include <linux/if.h> #include <linux/ethtool.h> #include <linux/sockios.h> #include <fcntl.h> @@ -363,6 +362,38 @@ priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) } /** + * Return the number of active VFs for the current device. + * + * @param[in] priv + * Pointer to private structure. + * @param[out] num_vfs + * Number of active VFs. + * + * @return + * 0 on success, -1 on failure and errno is set. + */ +int +priv_get_num_vfs(struct priv *priv, uint16_t *num_vfs) +{ + /* The sysfs entry name depends on the operating system. */ + const char **name = (const char *[]){ + "device/sriov_numvfs", + "device/mlx5_num_vfs", + NULL, + }; + int ret; + + do { + unsigned long ulong_num_vfs; + + ret = priv_get_sysfs_ulong(priv, *name, &ulong_num_vfs); + if (!ret) + *num_vfs = ulong_num_vfs; + } while (*(++name) && ret); + return ret; +} + +/** * Get device MTU. * * @param priv @@ -398,7 +429,15 @@ priv_get_mtu(struct priv *priv, uint16_t *mtu) static int priv_set_mtu(struct priv *priv, uint16_t mtu) { - return priv_set_sysfs_ulong(priv, "mtu", mtu); + uint16_t new_mtu; + + if (priv_set_sysfs_ulong(priv, "mtu", mtu) || + priv_get_mtu(priv, &new_mtu)) + return -1; + if (new_mtu == mtu) + return 0; + errno = EINVAL; + return -1; } /** @@ -545,7 +584,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) DEV_RX_OFFLOAD_UDP_CKSUM | DEV_RX_OFFLOAD_TCP_CKSUM) : 0); - info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT; + if (!priv->mps) + info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT; if (priv->hw_csum) info->tx_offload_capa |= (DEV_TX_OFFLOAD_IPV4_CKSUM | @@ -584,8 +624,7 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) }; - if (dev->rx_pkt_burst == mlx5_rx_burst || - dev->rx_pkt_burst == mlx5_rx_burst_sp) + if (dev->rx_pkt_burst == mlx5_rx_burst) return ptypes; return NULL; } @@ -617,7 +656,7 @@ mlx5_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) memset(&dev_link, 0, sizeof(dev_link)); dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING)); - ifr.ifr_data = &edata; + ifr.ifr_data = (void *)&edata; if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", strerror(errno)); @@ -686,6 +725,9 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) unsigned int i; uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = mlx5_rx_burst; + unsigned int max_frame_len; + int rehash; + int restart = priv->started; if (mlx5_is_secondary()) return -E_RTE_SECONDARY; @@ -699,7 +741,6 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) goto out; } else DEBUG("adapter port %u MTU set to %u", priv->port, mtu); - priv->mtu = mtu; /* Temporarily replace RX handler with a fake one, assuming it has not * been copied elsewhere. */ dev->rx_pkt_burst = removed_rx_burst; @@ -707,33 +748,94 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) * removed_rx_burst() instead. */ rte_wmb(); usleep(1000); + /* MTU does not include header and CRC. */ + max_frame_len = ETHER_HDR_LEN + mtu + ETHER_CRC_LEN; + /* Check if at least one queue is going to need a SGE update. */ + for (i = 0; i != priv->rxqs_n; ++i) { + struct rxq *rxq = (*priv->rxqs)[i]; + unsigned int mb_len; + unsigned int size = RTE_PKTMBUF_HEADROOM + max_frame_len; + unsigned int sges_n; + + if (rxq == NULL) + continue; + mb_len = rte_pktmbuf_data_room_size(rxq->mp); + assert(mb_len >= RTE_PKTMBUF_HEADROOM); + /* + * Determine the number of SGEs needed for a full packet + * and round it to the next power of two. + */ + sges_n = log2above((size / mb_len) + !!(size % mb_len)); + if (sges_n != rxq->sges_n) + break; + } + /* + * If all queues have the right number of SGEs, a simple rehash + * of their buffers is enough, otherwise SGE information can only + * be updated in a queue by recreating it. All resources that depend + * on queues (flows, indirection tables) must be recreated as well in + * that case. + */ + rehash = (i == priv->rxqs_n); + if (!rehash) { + /* Clean up everything as with mlx5_dev_stop(). */ + priv_special_flow_disable_all(priv); + priv_mac_addrs_disable(priv); + priv_destroy_hash_rxqs(priv); + priv_fdir_disable(priv); + priv_dev_interrupt_handler_uninstall(priv, dev); + } +recover: /* Reconfigure each RX queue. */ for (i = 0; (i != priv->rxqs_n); ++i) { struct rxq *rxq = (*priv->rxqs)[i]; - unsigned int max_frame_len; + struct rxq_ctrl *rxq_ctrl = + container_of(rxq, struct rxq_ctrl, rxq); int sp; + unsigned int mb_len; + unsigned int tmp; if (rxq == NULL) continue; - /* Calculate new maximum frame length according to MTU and - * toggle scattered support (sp) if necessary. */ - max_frame_len = (priv->mtu + ETHER_HDR_LEN + - (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); - sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM)); + mb_len = rte_pktmbuf_data_room_size(rxq->mp); + assert(mb_len >= RTE_PKTMBUF_HEADROOM); + /* Toggle scattered support (sp) if necessary. */ + sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM)); /* Provide new values to rxq_setup(). */ dev->data->dev_conf.rxmode.jumbo_frame = sp; dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; - ret = rxq_rehash(dev, rxq); - if (ret) { - /* Force SP RX if that queue requires it and abort. */ - if (rxq->sp) - rx_func = mlx5_rx_burst_sp; - break; + if (rehash) + ret = rxq_rehash(dev, rxq_ctrl); + else + ret = rxq_ctrl_setup(dev, rxq_ctrl, rxq->elts_n, + rxq_ctrl->socket, NULL, rxq->mp); + if (!ret) + continue; + /* Attempt to roll back in case of error. */ + tmp = (mb_len << rxq->sges_n) - RTE_PKTMBUF_HEADROOM; + if (max_frame_len != tmp) { + max_frame_len = tmp; + goto recover; } - /* Scattered burst function takes priority. */ - if (rxq->sp) - rx_func = mlx5_rx_burst_sp; + /* Double fault, disable RX. */ + break; + } + /* + * Use a safe RX burst function in case of error, otherwise mimic + * mlx5_dev_start(). + */ + if (ret) { + ERROR("unable to reconfigure RX queues, RX disabled"); + rx_func = removed_rx_burst; + } else if (restart && + !rehash && + !priv_create_hash_rxqs(priv) && + !priv_rehash_flows(priv)) { + if (dev->data->dev_conf.fdir_conf.mode == RTE_FDIR_MODE_NONE) + priv_fdir_enable(priv); + priv_dev_interrupt_handler_install(priv, dev); } + priv->mtu = mtu; /* Burst functions can now be called again. */ rte_wmb(); dev->rx_pkt_burst = rx_func; @@ -767,7 +869,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) if (mlx5_is_secondary()) return -E_RTE_SECONDARY; - ifr.ifr_data = ðpause; + ifr.ifr_data = (void *)ðpause; priv_lock(priv); if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { ret = errno; @@ -818,7 +920,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) if (mlx5_is_secondary()) return -E_RTE_SECONDARY; - ifr.ifr_data = ðpause; + ifr.ifr_data = (void *)ðpause; ethpause.autoneg = fc_conf->autoneg; if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || (fc_conf->mode & RTE_FC_RX_PAUSE)) @@ -1012,7 +1114,7 @@ priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev); priv->pending_alarm = 0; priv->intr_handle.fd = 0; - priv->intr_handle.type = 0; + priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; } /** @@ -1061,23 +1163,13 @@ priv_set_link(struct priv *priv, int up) { struct rte_eth_dev *dev = priv->dev; int err; - unsigned int i; if (up) { err = priv_set_flags(priv, ~IFF_UP, IFF_UP); if (err) return err; - for (i = 0; i < priv->rxqs_n; i++) - if ((*priv->rxqs)[i]->sp) - break; - /* Check if an sp queue exists. - * Note: Some old frames might be received. - */ - if (i == priv->rxqs_n) - dev->rx_pkt_burst = mlx5_rx_burst; - else - dev->rx_pkt_burst = mlx5_rx_burst_sp; - dev->tx_pkt_burst = mlx5_tx_burst; + priv_select_tx_function(priv); + priv_select_rx_function(priv); } else { err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP); if (err) @@ -1209,34 +1301,40 @@ mlx5_secondary_data_setup(struct priv *priv) /* TX queues. */ for (i = 0; i != nb_tx_queues; ++i) { struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; - struct txq *txq; + struct txq_ctrl *primary_txq_ctrl; + struct txq_ctrl *txq_ctrl; if (primary_txq == NULL) continue; - txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, - primary_txq->socket); - if (txq != NULL) { - if (txq_setup(priv->dev, - txq, - primary_txq->elts_n * MLX5_PMD_SGE_WR_N, - primary_txq->socket, - NULL) == 0) { - txq->stats.idx = primary_txq->stats.idx; - tx_queues[i] = txq; + primary_txq_ctrl = container_of(primary_txq, + struct txq_ctrl, txq); + txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0, + primary_txq_ctrl->socket); + if (txq_ctrl != NULL) { + if (txq_ctrl_setup(priv->dev, + primary_txq_ctrl, + primary_txq->elts_n, + primary_txq_ctrl->socket, + NULL) == 0) { + txq_ctrl->txq.stats.idx = + primary_txq->stats.idx; + tx_queues[i] = &txq_ctrl->txq; continue; } - rte_free(txq); + rte_free(txq_ctrl); } while (i) { - txq = tx_queues[--i]; - txq_cleanup(txq); - rte_free(txq); + txq_ctrl = tx_queues[--i]; + txq_cleanup(txq_ctrl); + rte_free(txq_ctrl); } goto error; } /* RX queues. */ for (i = 0; i != nb_rx_queues; ++i) { - struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; + struct rxq_ctrl *primary_rxq = + container_of((*sd->primary_priv->rxqs)[i], + struct rxq_ctrl, rxq); if (primary_rxq == NULL) continue; @@ -1263,13 +1361,11 @@ mlx5_secondary_data_setup(struct priv *priv) rte_mb(); priv->dev->data = &sd->data; rte_mb(); - priv->dev->tx_pkt_burst = mlx5_tx_burst; - priv->dev->rx_pkt_burst = removed_rx_burst; + priv_select_tx_function(priv); + priv_select_rx_function(priv); priv_unlock(priv); end: /* More sanity checks. */ - assert(priv->dev->tx_pkt_burst == mlx5_tx_burst); - assert(priv->dev->rx_pkt_burst == removed_rx_burst); assert(priv->dev->data == &sd->data); rte_spinlock_unlock(&sd->lock); return priv; @@ -1280,3 +1376,42 @@ error: rte_spinlock_unlock(&sd->lock); return NULL; } + +/** + * Configure the TX function to use. + * + * @param priv + * Pointer to private structure. + */ +void +priv_select_tx_function(struct priv *priv) +{ + priv->dev->tx_pkt_burst = mlx5_tx_burst; + /* Display warning for unsupported configurations. */ + if (priv->sriov && priv->mps) + WARN("multi-packet send WQE cannot be used on a SR-IOV setup"); + /* Select appropriate TX function. */ + if ((priv->sriov == 0) && priv->mps && priv->txq_inline) { + priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline; + DEBUG("selected MPW inline TX function"); + } else if ((priv->sriov == 0) && priv->mps) { + priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw; + DEBUG("selected MPW TX function"); + } else if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) { + priv->dev->tx_pkt_burst = mlx5_tx_burst_inline; + DEBUG("selected inline TX function (%u >= %u queues)", + priv->txqs_n, priv->txqs_inline); + } +} + +/** + * Configure the RX function to use. + * + * @param priv + * Pointer to private structure. + */ +void +priv_select_rx_function(struct priv *priv) +{ + priv->dev->rx_pkt_burst = mlx5_rx_burst; +} diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c index 63e43ad9..73eb00ec 100644 --- a/drivers/net/mlx5/mlx5_fdir.c +++ b/drivers/net/mlx5/mlx5_fdir.c @@ -122,7 +122,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter, case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER: desc->type = HASH_RXQ_IPV4; break; -#ifdef HAVE_FLOW_SPEC_IPV6 case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: desc->type = HASH_RXQ_UDPV6; break; @@ -132,7 +131,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter, case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER: desc->type = HASH_RXQ_IPV6; break; -#endif /* HAVE_FLOW_SPEC_IPV6 */ default: break; } @@ -147,7 +145,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter, desc->src_ip[0] = fdir_filter->input.flow.ip4_flow.src_ip; desc->dst_ip[0] = fdir_filter->input.flow.ip4_flow.dst_ip; break; -#ifdef HAVE_FLOW_SPEC_IPV6 case RTE_ETH_FLOW_NONFRAG_IPV6_UDP: case RTE_ETH_FLOW_NONFRAG_IPV6_TCP: desc->src_port = fdir_filter->input.flow.udp6_flow.src_port; @@ -161,7 +158,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter, fdir_filter->input.flow.ipv6_flow.dst_ip, sizeof(desc->dst_ip)); break; -#endif /* HAVE_FLOW_SPEC_IPV6 */ default: break; } @@ -211,7 +207,6 @@ priv_fdir_overlap(const struct priv *priv, (desc2->dst_ip[0] & mask->ipv4_mask.dst_ip))) return 0; break; -#ifdef HAVE_FLOW_SPEC_IPV6 case HASH_RXQ_IPV6: case HASH_RXQ_UDPV6: case HASH_RXQ_TCPV6: @@ -222,7 +217,6 @@ priv_fdir_overlap(const struct priv *priv, (desc2->dst_ip[i] & mask->ipv6_mask.dst_ip[i]))) return 0; break; -#endif /* HAVE_FLOW_SPEC_IPV6 */ default: break; } @@ -258,9 +252,7 @@ priv_fdir_flow_add(struct priv *priv, uintptr_t spec_offset = (uintptr_t)&data->spec; struct ibv_exp_flow_spec_eth *spec_eth; struct ibv_exp_flow_spec_ipv4 *spec_ipv4; -#ifdef HAVE_FLOW_SPEC_IPV6 struct ibv_exp_flow_spec_ipv6 *spec_ipv6; -#endif /* HAVE_FLOW_SPEC_IPV6 */ struct ibv_exp_flow_spec_tcp_udp *spec_tcp_udp; struct mlx5_fdir_filter *iter_fdir_filter; unsigned int i; @@ -334,7 +326,6 @@ priv_fdir_flow_add(struct priv *priv, spec_offset += spec_ipv4->size; break; -#ifdef HAVE_FLOW_SPEC_IPV6 case HASH_RXQ_IPV6: case HASH_RXQ_UDPV6: case HASH_RXQ_TCPV6: @@ -368,7 +359,6 @@ priv_fdir_flow_add(struct priv *priv, spec_offset += spec_ipv6->size; break; -#endif /* HAVE_FLOW_SPEC_IPV6 */ default: ERROR("invalid flow attribute type"); return EINVAL; @@ -424,7 +414,9 @@ create_flow: static struct fdir_queue * priv_get_fdir_queue(struct priv *priv, uint16_t idx) { - struct fdir_queue *fdir_queue = &(*priv->rxqs)[idx]->fdir_queue; + struct rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], struct rxq_ctrl, rxq); + struct fdir_queue *fdir_queue = &rxq_ctrl->fdir_queue; struct ibv_exp_rwq_ind_table *ind_table = NULL; struct ibv_qp *qp = NULL; struct ibv_exp_rwq_ind_table_init_attr ind_init_attr; @@ -439,7 +431,7 @@ priv_get_fdir_queue(struct priv *priv, uint16_t idx) ind_init_attr = (struct ibv_exp_rwq_ind_table_init_attr){ .pd = priv->pd, .log_ind_tbl_size = 0, - .ind_tbl = &((*priv->rxqs)[idx]->wq), + .ind_tbl = &rxq_ctrl->wq, .comp_mask = 0, }; @@ -629,8 +621,10 @@ priv_fdir_disable(struct priv *priv) /* Run on every RX queue to destroy related flow director QP and * indirection table. */ for (i = 0; (i != priv->rxqs_n); i++) { - fdir_queue = &(*priv->rxqs)[i]->fdir_queue; + struct rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[i], struct rxq_ctrl, rxq); + fdir_queue = &rxq_ctrl->fdir_queue; if (fdir_queue->qp != NULL) { claim_zero(ibv_destroy_qp(fdir_queue->qp)); fdir_queue->qp = NULL; diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index c9cea485..f6b27bb8 100644 --- a/drivers/net/mlx5/mlx5_mac.c +++ b/drivers/net/mlx5/mlx5_mac.c @@ -38,7 +38,6 @@ #include <inttypes.h> #include <errno.h> #include <netinet/in.h> -#include <linux/if.h> #include <sys/ioctl.h> #include <arpa/inet.h> diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c new file mode 100644 index 00000000..67dfefa8 --- /dev/null +++ b/drivers/net/mlx5/mlx5_mr.c @@ -0,0 +1,283 @@ +/*- + * BSD LICENSE + * + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-pedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-pedantic" +#endif + +/* DPDK headers don't like -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-pedantic" +#endif +#include <rte_mempool.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-pedantic" +#endif + +#include "mlx5.h" +#include "mlx5_rxtx.h" + +struct mlx5_check_mempool_data { + int ret; + char *start; + char *end; +}; + +/* Called by mlx5_check_mempool() when iterating the memory chunks. */ +static void +mlx5_check_mempool_cb(struct rte_mempool *mp, + void *opaque, struct rte_mempool_memhdr *memhdr, + unsigned int mem_idx) +{ + struct mlx5_check_mempool_data *data = opaque; + + (void)mp; + (void)mem_idx; + + /* It already failed, skip the next chunks. */ + if (data->ret != 0) + return; + /* It is the first chunk. */ + if (data->start == NULL && data->end == NULL) { + data->start = memhdr->addr; + data->end = data->start + memhdr->len; + return; + } + if (data->end == memhdr->addr) { + data->end += memhdr->len; + return; + } + if (data->start == (char *)memhdr->addr + memhdr->len) { + data->start -= memhdr->len; + return; + } + /* Error, mempool is not virtually contiguous. */ + data->ret = -1; +} + +/** + * Check if a mempool can be used: it must be virtually contiguous. + * + * @param[in] mp + * Pointer to memory pool. + * @param[out] start + * Pointer to the start address of the mempool virtual memory area + * @param[out] end + * Pointer to the end address of the mempool virtual memory area + * + * @return + * 0 on success (mempool is virtually contiguous), -1 on error. + */ +static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start, + uintptr_t *end) +{ + struct mlx5_check_mempool_data data; + + memset(&data, 0, sizeof(data)); + rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data); + *start = (uintptr_t)data.start; + *end = (uintptr_t)data.end; + + return data.ret; +} + +/** + * Register mempool as a memory region. + * + * @param pd + * Pointer to protection domain. + * @param mp + * Pointer to memory pool. + * + * @return + * Memory region pointer, NULL in case of error. + */ +struct ibv_mr * +mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + uintptr_t start; + uintptr_t end; + unsigned int i; + + if (mlx5_check_mempool(mp, &start, &end) != 0) { + ERROR("mempool %p: not virtually contiguous", + (void *)mp); + return NULL; + } + + DEBUG("mempool %p area start=%p end=%p size=%zu", + (void *)mp, (void *)start, (void *)end, + (size_t)(end - start)); + /* Round start and end to page boundary if found in memory segments. */ + for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) { + uintptr_t addr = (uintptr_t)ms[i].addr; + size_t len = ms[i].len; + unsigned int align = ms[i].hugepage_sz; + + if ((start > addr) && (start < addr + len)) + start = RTE_ALIGN_FLOOR(start, align); + if ((end > addr) && (end < addr + len)) + end = RTE_ALIGN_CEIL(end, align); + } + DEBUG("mempool %p using start=%p end=%p size=%zu for MR", + (void *)mp, (void *)start, (void *)end, + (size_t)(end - start)); + return ibv_reg_mr(pd, + (void *)start, + end - start, + IBV_ACCESS_LOCAL_WRITE); +} + +/** + * Register a Memory Region (MR) <-> Memory Pool (MP) association in + * txq->mp2mr[]. If mp2mr[] is full, remove an entry first. + * + * This function should only be called by txq_mp2mr(). + * + * @param txq + * Pointer to TX queue structure. + * @param[in] mp + * Memory Pool for which a Memory Region lkey must be returned. + * @param idx + * Index of the next available entry. + * + * @return + * mr->lkey on success, (uint32_t)-1 on failure. + */ +uint32_t +txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx) +{ + struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq); + struct ibv_mr *mr; + + /* Add a new entry, register MR first. */ + DEBUG("%p: discovered new memory pool \"%s\" (%p)", + (void *)txq_ctrl, mp->name, (void *)mp); + mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp); + if (unlikely(mr == NULL)) { + DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.", + (void *)txq_ctrl); + return (uint32_t)-1; + } + if (unlikely(idx == RTE_DIM(txq_ctrl->txq.mp2mr))) { + /* Table is full, remove oldest entry. */ + DEBUG("%p: MR <-> MP table full, dropping oldest entry.", + (void *)txq_ctrl); + --idx; + claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[0].mr)); + memmove(&txq_ctrl->txq.mp2mr[0], &txq_ctrl->txq.mp2mr[1], + (sizeof(txq_ctrl->txq.mp2mr) - + sizeof(txq_ctrl->txq.mp2mr[0]))); + } + /* Store the new entry. */ + txq_ctrl->txq.mp2mr[idx].mp = mp; + txq_ctrl->txq.mp2mr[idx].mr = mr; + txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey); + DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, + (void *)txq_ctrl, mp->name, (void *)mp, + txq_ctrl->txq.mp2mr[idx].lkey); + return txq_ctrl->txq.mp2mr[idx].lkey; +} + +struct txq_mp2mr_mbuf_check_data { + int ret; +}; + +/** + * Callback function for rte_mempool_obj_iter() to check whether a given + * mempool object looks like a mbuf. + * + * @param[in] mp + * The mempool pointer + * @param[in] arg + * Context data (struct txq_mp2mr_mbuf_check_data). Contains the + * return value. + * @param[in] obj + * Object address. + * @param index + * Object index, unused. + */ +static void +txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj, + uint32_t index __rte_unused) +{ + struct txq_mp2mr_mbuf_check_data *data = arg; + struct rte_mbuf *buf = obj; + + /* + * Check whether mbuf structure fits element size and whether mempool + * pointer is valid. + */ + if (sizeof(*buf) > mp->elt_size || buf->pool != mp) + data->ret = -1; +} + +/** + * Iterator function for rte_mempool_walk() to register existing mempools and + * fill the MP to MR cache of a TX queue. + * + * @param[in] mp + * Memory Pool to register. + * @param *arg + * Pointer to TX queue structure. + */ +void +txq_mp2mr_iter(struct rte_mempool *mp, void *arg) +{ + struct txq_ctrl *txq_ctrl = arg; + struct txq_mp2mr_mbuf_check_data data = { + .ret = 0, + }; + unsigned int i; + + /* Register mempool only if the first element looks like a mbuf. */ + if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 || + data.ret == -1) + return; + for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) { + if (unlikely(txq_ctrl->txq.mp2mr[i].mp == NULL)) { + /* Unknown MP, add a new MR for it. */ + break; + } + if (txq_ctrl->txq.mp2mr[i].mp == mp) + return; + } + txq_mp2mr_reg(&txq_ctrl->txq, mp, i); +} diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h new file mode 100644 index 00000000..5db219b3 --- /dev/null +++ b/drivers/net/mlx5/mlx5_prm.h @@ -0,0 +1,163 @@ +/*- + * BSD LICENSE + * + * Copyright 2016 6WIND S.A. + * Copyright 2016 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RTE_PMD_MLX5_PRM_H_ +#define RTE_PMD_MLX5_PRM_H_ + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-pedantic" +#endif +#include <infiniband/mlx5_hw.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-pedantic" +#endif + +/* Get CQE owner bit. */ +#define MLX5_CQE_OWNER(op_own) ((op_own) & MLX5_CQE_OWNER_MASK) + +/* Get CQE format. */ +#define MLX5_CQE_FORMAT(op_own) (((op_own) & MLX5E_CQE_FORMAT_MASK) >> 2) + +/* Get CQE opcode. */ +#define MLX5_CQE_OPCODE(op_own) (((op_own) & 0xf0) >> 4) + +/* Get CQE solicited event. */ +#define MLX5_CQE_SE(op_own) (((op_own) >> 1) & 1) + +/* Invalidate a CQE. */ +#define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4) + +/* CQE value to inform that VLAN is stripped. */ +#define MLX5_CQE_VLAN_STRIPPED 0x1 + +/* Maximum number of packets a multi-packet WQE can handle. */ +#define MLX5_MPW_DSEG_MAX 5 + +/* Room for inline data in regular work queue element. */ +#define MLX5_WQE64_INL_DATA 12 + +/* Room for inline data in multi-packet WQE. */ +#define MLX5_MWQE64_INL_DATA 28 + +/* Subset of struct mlx5_wqe_eth_seg. */ +struct mlx5_wqe_eth_seg_small { + uint32_t rsvd0; + uint8_t cs_flags; + uint8_t rsvd1; + uint16_t mss; + uint32_t rsvd2; + uint16_t inline_hdr_sz; +}; + +/* Regular WQE. */ +struct mlx5_wqe_regular { + union { + struct mlx5_wqe_ctrl_seg ctrl; + uint32_t data[4]; + } ctrl; + struct mlx5_wqe_eth_seg eseg; + struct mlx5_wqe_data_seg dseg; +} __rte_aligned(64); + +/* Inline WQE. */ +struct mlx5_wqe_inl { + union { + struct mlx5_wqe_ctrl_seg ctrl; + uint32_t data[4]; + } ctrl; + struct mlx5_wqe_eth_seg eseg; + uint32_t byte_cnt; + uint8_t data[MLX5_WQE64_INL_DATA]; +} __rte_aligned(64); + +/* Multi-packet WQE. */ +struct mlx5_wqe_mpw { + union { + struct mlx5_wqe_ctrl_seg ctrl; + uint32_t data[4]; + } ctrl; + struct mlx5_wqe_eth_seg_small eseg; + struct mlx5_wqe_data_seg dseg[2]; +} __rte_aligned(64); + +/* Multi-packet WQE with inline. */ +struct mlx5_wqe_mpw_inl { + union { + struct mlx5_wqe_ctrl_seg ctrl; + uint32_t data[4]; + } ctrl; + struct mlx5_wqe_eth_seg_small eseg; + uint32_t byte_cnt; + uint8_t data[MLX5_MWQE64_INL_DATA]; +} __rte_aligned(64); + +/* Union of all WQE types. */ +union mlx5_wqe { + struct mlx5_wqe_regular wqe; + struct mlx5_wqe_inl inl; + struct mlx5_wqe_mpw mpw; + struct mlx5_wqe_mpw_inl mpw_inl; + uint8_t data[64]; +}; + +/* MPW session status. */ +enum mlx5_mpw_state { + MLX5_MPW_STATE_OPENED, + MLX5_MPW_INL_STATE_OPENED, + MLX5_MPW_STATE_CLOSED, +}; + +/* MPW session descriptor. */ +struct mlx5_mpw { + enum mlx5_mpw_state state; + unsigned int pkts_n; + unsigned int len; + unsigned int total_len; + volatile union mlx5_wqe *wqe; + union { + volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX]; + volatile uint8_t *raw; + } data; +}; + +/* CQ element structure - should be equal to the cache line size */ +struct mlx5_cqe { +#if (RTE_CACHE_LINE_SIZE == 128) + uint8_t padding[64]; +#endif + struct mlx5_cqe64 cqe64; +}; + +#endif /* RTE_PMD_MLX5_PRM_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c index 3a55f633..8b585554 100644 --- a/drivers/net/mlx5/mlx5_rxmode.c +++ b/drivers/net/mlx5/mlx5_rxmode.c @@ -67,11 +67,9 @@ static const struct special_flow_init special_flow_init[] = { 1 << HASH_RXQ_TCPV4 | 1 << HASH_RXQ_UDPV4 | 1 << HASH_RXQ_IPV4 | -#ifdef HAVE_FLOW_SPEC_IPV6 1 << HASH_RXQ_TCPV6 | 1 << HASH_RXQ_UDPV6 | 1 << HASH_RXQ_IPV6 | -#endif /* HAVE_FLOW_SPEC_IPV6 */ 1 << HASH_RXQ_ETH | 0, .per_vlan = 0, @@ -82,10 +80,8 @@ static const struct special_flow_init special_flow_init[] = { .hash_types = 1 << HASH_RXQ_UDPV4 | 1 << HASH_RXQ_IPV4 | -#ifdef HAVE_FLOW_SPEC_IPV6 1 << HASH_RXQ_UDPV6 | 1 << HASH_RXQ_IPV6 | -#endif /* HAVE_FLOW_SPEC_IPV6 */ 1 << HASH_RXQ_ETH | 0, .per_vlan = 0, @@ -96,15 +92,12 @@ static const struct special_flow_init special_flow_init[] = { .hash_types = 1 << HASH_RXQ_UDPV4 | 1 << HASH_RXQ_IPV4 | -#ifdef HAVE_FLOW_SPEC_IPV6 1 << HASH_RXQ_UDPV6 | 1 << HASH_RXQ_IPV6 | -#endif /* HAVE_FLOW_SPEC_IPV6 */ 1 << HASH_RXQ_ETH | 0, .per_vlan = 1, }, -#ifdef HAVE_FLOW_SPEC_IPV6 [HASH_RXQ_FLOW_TYPE_IPV6MULTI] = { .dst_mac_val = "\x33\x33\x00\x00\x00\x00", .dst_mac_mask = "\xff\xff\x00\x00\x00\x00", @@ -115,7 +108,6 @@ static const struct special_flow_init special_flow_init[] = { 0, .per_vlan = 1, }, -#endif /* HAVE_FLOW_SPEC_IPV6 */ }; /** @@ -355,7 +347,9 @@ priv_special_flow_enable_all(struct priv *priv) { enum hash_rxq_flow_type flow_type; - for (flow_type = 0; flow_type != HASH_RXQ_FLOW_TYPE_MAC; ++flow_type) { + for (flow_type = HASH_RXQ_FLOW_TYPE_PROMISC; + flow_type != HASH_RXQ_FLOW_TYPE_MAC; + ++flow_type) { int ret; ret = priv_special_flow_enable(priv, flow_type); @@ -380,7 +374,9 @@ priv_special_flow_disable_all(struct priv *priv) { enum hash_rxq_flow_type flow_type; - for (flow_type = 0; flow_type != HASH_RXQ_FLOW_TYPE_MAC; ++flow_type) + for (flow_type = HASH_RXQ_FLOW_TYPE_PROMISC; + flow_type != HASH_RXQ_FLOW_TYPE_MAC; + ++flow_type) priv_special_flow_disable(priv, flow_type); } diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index cbb017bb..29c137cd 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -43,6 +43,8 @@ #pragma GCC diagnostic ignored "-pedantic" #endif #include <infiniband/verbs.h> +#include <infiniband/arch.h> +#include <infiniband/mlx5_hw.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -105,7 +107,6 @@ const struct hash_rxq_init hash_rxq_init[] = { }, .underlayer = &hash_rxq_init[HASH_RXQ_ETH], }, -#ifdef HAVE_FLOW_SPEC_IPV6 [HASH_RXQ_TCPV6] = { .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 | IBV_EXP_RX_HASH_DST_IPV6 | @@ -144,7 +145,6 @@ const struct hash_rxq_init hash_rxq_init[] = { }, .underlayer = &hash_rxq_init[HASH_RXQ_ETH], }, -#endif /* HAVE_FLOW_SPEC_IPV6 */ [HASH_RXQ_ETH] = { .hash_fields = 0, .dpdk_rss_hf = 0, @@ -168,17 +168,11 @@ static const struct ind_table_init ind_table_init[] = { 1 << HASH_RXQ_TCPV4 | 1 << HASH_RXQ_UDPV4 | 1 << HASH_RXQ_IPV4 | -#ifdef HAVE_FLOW_SPEC_IPV6 1 << HASH_RXQ_TCPV6 | 1 << HASH_RXQ_UDPV6 | 1 << HASH_RXQ_IPV6 | -#endif /* HAVE_FLOW_SPEC_IPV6 */ 0, -#ifdef HAVE_FLOW_SPEC_IPV6 .hash_types_n = 6, -#else /* HAVE_FLOW_SPEC_IPV6 */ - .hash_types_n = 3, -#endif /* HAVE_FLOW_SPEC_IPV6 */ }, { .max_size = 1, @@ -243,12 +237,8 @@ priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr, init = &hash_rxq_init[type]; *flow_attr = (struct ibv_exp_flow_attr){ .type = IBV_EXP_FLOW_ATTR_NORMAL, -#ifdef MLX5_FDIR_SUPPORT /* Priorities < 3 are reserved for flow director. */ .priority = init->flow_priority + 3, -#else /* MLX5_FDIR_SUPPORT */ - .priority = init->flow_priority, -#endif /* MLX5_FDIR_SUPPORT */ .num_of_specs = 0, .port = priv->port, .flags = 0, @@ -279,7 +269,7 @@ priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr, static enum hash_rxq_type hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos) { - enum hash_rxq_type type = 0; + enum hash_rxq_type type = HASH_RXQ_TCPV4; assert(pos < table->hash_types_n); do { @@ -385,8 +375,13 @@ priv_create_hash_rxqs(struct priv *priv) DEBUG("indirection table extended to assume %u WQs", priv->reta_idx_n); } - for (i = 0; (i != priv->reta_idx_n); ++i) - wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq; + for (i = 0; (i != priv->reta_idx_n); ++i) { + struct rxq_ctrl *rxq_ctrl; + + rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]], + struct rxq_ctrl, rxq); + wqs[i] = rxq_ctrl->wq; + } /* Get number of hash RX queues to configure. */ for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i) hash_rxqs_n += ind_table_init[i].hash_types_n; @@ -589,9 +584,7 @@ priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type) case HASH_RXQ_FLOW_TYPE_ALLMULTI: return !!priv->allmulti_req; case HASH_RXQ_FLOW_TYPE_BROADCAST: -#ifdef HAVE_FLOW_SPEC_IPV6 case HASH_RXQ_FLOW_TYPE_IPV6MULTI: -#endif /* HAVE_FLOW_SPEC_IPV6 */ /* If allmulti is enabled, broadcast and ipv6multi * are unnecessary. */ return !priv->allmulti_req; @@ -616,9 +609,11 @@ priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type) int priv_rehash_flows(struct priv *priv) { - unsigned int i; + enum hash_rxq_flow_type i; - for (i = 0; (i != RTE_DIM((*priv->hash_rxqs)[0].special_flow)); ++i) + for (i = HASH_RXQ_FLOW_TYPE_PROMISC; + i != RTE_DIM((*priv->hash_rxqs)[0].special_flow); + ++i) if (!priv_allow_flow_type(priv, i)) { priv_special_flow_disable(priv, i); } else { @@ -634,148 +629,9 @@ priv_rehash_flows(struct priv *priv) } /** - * Allocate RX queue elements with scattered packets support. - * - * @param rxq - * Pointer to RX queue structure. - * @param elts_n - * Number of elements to allocate. - * @param[in] pool - * If not NULL, fetch buffers from this array instead of allocating them - * with rte_pktmbuf_alloc(). - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n, - struct rte_mbuf **pool) -{ - unsigned int i; - struct rxq_elt_sp (*elts)[elts_n] = - rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, - rxq->socket); - int ret = 0; - - if (elts == NULL) { - ERROR("%p: can't allocate packets array", (void *)rxq); - ret = ENOMEM; - goto error; - } - /* For each WR (packet). */ - for (i = 0; (i != elts_n); ++i) { - unsigned int j; - struct rxq_elt_sp *elt = &(*elts)[i]; - struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges; - - /* These two arrays must have the same size. */ - assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs)); - /* For each SGE (segment). */ - for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) { - struct ibv_sge *sge = &(*sges)[j]; - struct rte_mbuf *buf; - - if (pool != NULL) { - buf = *(pool++); - assert(buf != NULL); - rte_pktmbuf_reset(buf); - } else - buf = rte_pktmbuf_alloc(rxq->mp); - if (buf == NULL) { - assert(pool == NULL); - ERROR("%p: empty mbuf pool", (void *)rxq); - ret = ENOMEM; - goto error; - } - elt->bufs[j] = buf; - /* Headroom is reserved by rte_pktmbuf_alloc(). */ - assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); - /* Buffer is supposed to be empty. */ - assert(rte_pktmbuf_data_len(buf) == 0); - assert(rte_pktmbuf_pkt_len(buf) == 0); - /* sge->addr must be able to store a pointer. */ - assert(sizeof(sge->addr) >= sizeof(uintptr_t)); - if (j == 0) { - /* The first SGE keeps its headroom. */ - sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); - sge->length = (buf->buf_len - - RTE_PKTMBUF_HEADROOM); - } else { - /* Subsequent SGEs lose theirs. */ - assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); - SET_DATA_OFF(buf, 0); - sge->addr = (uintptr_t)buf->buf_addr; - sge->length = buf->buf_len; - } - sge->lkey = rxq->mr->lkey; - /* Redundant check for tailroom. */ - assert(sge->length == rte_pktmbuf_tailroom(buf)); - } - } - DEBUG("%p: allocated and configured %u WRs (%zu segments)", - (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges))); - rxq->elts_n = elts_n; - rxq->elts_head = 0; - rxq->elts.sp = elts; - assert(ret == 0); - return 0; -error: - if (elts != NULL) { - assert(pool == NULL); - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - unsigned int j; - struct rxq_elt_sp *elt = &(*elts)[i]; - - for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) { - struct rte_mbuf *buf = elt->bufs[j]; - - if (buf != NULL) - rte_pktmbuf_free_seg(buf); - } - } - rte_free(elts); - } - DEBUG("%p: failed, freed everything", (void *)rxq); - assert(ret > 0); - return ret; -} - -/** - * Free RX queue elements with scattered packets support. - * - * @param rxq - * Pointer to RX queue structure. - */ -static void -rxq_free_elts_sp(struct rxq *rxq) -{ - unsigned int i; - unsigned int elts_n = rxq->elts_n; - struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp; - - DEBUG("%p: freeing WRs", (void *)rxq); - rxq->elts_n = 0; - rxq->elts.sp = NULL; - if (elts == NULL) - return; - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - unsigned int j; - struct rxq_elt_sp *elt = &(*elts)[i]; - - for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) { - struct rte_mbuf *buf = elt->bufs[j]; - - if (buf != NULL) - rte_pktmbuf_free_seg(buf); - } - } - rte_free(elts); -} - -/** * Allocate RX queue elements. * - * @param rxq + * @param rxq_ctrl * Pointer to RX queue structure. * @param elts_n * Number of elements to allocate. @@ -787,73 +643,67 @@ rxq_free_elts_sp(struct rxq *rxq) * 0 on success, errno value on failure. */ static int -rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool) +rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n, + struct rte_mbuf *(*pool)[]) { + const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n; unsigned int i; - struct rxq_elt (*elts)[elts_n] = - rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, - rxq->socket); int ret = 0; - if (elts == NULL) { - ERROR("%p: can't allocate packets array", (void *)rxq); - ret = ENOMEM; - goto error; - } - /* For each WR (packet). */ + /* Iterate on segments. */ for (i = 0; (i != elts_n); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct ibv_sge *sge = &(*elts)[i].sge; struct rte_mbuf *buf; + volatile struct mlx5_wqe_data_seg *scat = + &(*rxq_ctrl->rxq.wqes)[i]; if (pool != NULL) { - buf = *(pool++); + buf = (*pool)[i]; assert(buf != NULL); rte_pktmbuf_reset(buf); + rte_pktmbuf_refcnt_update(buf, 1); } else - buf = rte_pktmbuf_alloc(rxq->mp); + buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp); if (buf == NULL) { assert(pool == NULL); - ERROR("%p: empty mbuf pool", (void *)rxq); + ERROR("%p: empty mbuf pool", (void *)rxq_ctrl); ret = ENOMEM; goto error; } - elt->buf = buf; /* Headroom is reserved by rte_pktmbuf_alloc(). */ assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); /* Buffer is supposed to be empty. */ assert(rte_pktmbuf_data_len(buf) == 0); assert(rte_pktmbuf_pkt_len(buf) == 0); - /* sge->addr must be able to store a pointer. */ - assert(sizeof(sge->addr) >= sizeof(uintptr_t)); - /* SGE keeps its headroom. */ - sge->addr = (uintptr_t) - ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM); - sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM); - sge->lkey = rxq->mr->lkey; - /* Redundant check for tailroom. */ - assert(sge->length == rte_pktmbuf_tailroom(buf)); + assert(!buf->next); + /* Only the first segment keeps headroom. */ + if (i % sges_n) + SET_DATA_OFF(buf, 0); + PORT(buf) = rxq_ctrl->rxq.port_id; + DATA_LEN(buf) = rte_pktmbuf_tailroom(buf); + PKT_LEN(buf) = DATA_LEN(buf); + NB_SEGS(buf) = 1; + /* scat->addr must be able to store a pointer. */ + assert(sizeof(scat->addr) >= sizeof(uintptr_t)); + *scat = (struct mlx5_wqe_data_seg){ + .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)), + .byte_count = htonl(DATA_LEN(buf)), + .lkey = htonl(rxq_ctrl->mr->lkey), + }; + (*rxq_ctrl->rxq.elts)[i] = buf; } - DEBUG("%p: allocated and configured %u single-segment WRs", - (void *)rxq, elts_n); - rxq->elts_n = elts_n; - rxq->elts_head = 0; - rxq->elts.no_sp = elts; + DEBUG("%p: allocated and configured %u segments (max %u packets)", + (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n)); assert(ret == 0); return 0; error: - if (elts != NULL) { - assert(pool == NULL); - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf = elt->buf; - - if (buf != NULL) - rte_pktmbuf_free_seg(buf); - } - rte_free(elts); + assert(pool == NULL); + elts_n = i; + for (i = 0; (i != elts_n); ++i) { + if ((*rxq_ctrl->rxq.elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); + (*rxq_ctrl->rxq.elts)[i] = NULL; } - DEBUG("%p: failed, freed everything", (void *)rxq); + DEBUG("%p: failed, freed everything", (void *)rxq_ctrl); assert(ret > 0); return ret; } @@ -861,29 +711,23 @@ error: /** * Free RX queue elements. * - * @param rxq + * @param rxq_ctrl * Pointer to RX queue structure. */ static void -rxq_free_elts(struct rxq *rxq) +rxq_free_elts(struct rxq_ctrl *rxq_ctrl) { unsigned int i; - unsigned int elts_n = rxq->elts_n; - struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp; - DEBUG("%p: freeing WRs", (void *)rxq); - rxq->elts_n = 0; - rxq->elts.no_sp = NULL; - if (elts == NULL) + DEBUG("%p: freeing WRs", (void *)rxq_ctrl); + if (rxq_ctrl->rxq.elts == NULL) return; - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf = elt->buf; - if (buf != NULL) - rte_pktmbuf_free_seg(buf); + for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) { + if ((*rxq_ctrl->rxq.elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); + (*rxq_ctrl->rxq.elts)[i] = NULL; } - rte_free(elts); } /** @@ -891,65 +735,60 @@ rxq_free_elts(struct rxq *rxq) * * Destroy objects, free allocated memory and reset the structure for reuse. * - * @param rxq + * @param rxq_ctrl * Pointer to RX queue structure. */ void -rxq_cleanup(struct rxq *rxq) +rxq_cleanup(struct rxq_ctrl *rxq_ctrl) { struct ibv_exp_release_intf_params params; - DEBUG("cleaning up %p", (void *)rxq); - if (rxq->sp) - rxq_free_elts_sp(rxq); - else - rxq_free_elts(rxq); - rxq->poll = NULL; - rxq->recv = NULL; - if (rxq->if_wq != NULL) { - assert(rxq->priv != NULL); - assert(rxq->priv->ctx != NULL); - assert(rxq->wq != NULL); + DEBUG("cleaning up %p", (void *)rxq_ctrl); + rxq_free_elts(rxq_ctrl); + if (rxq_ctrl->if_wq != NULL) { + assert(rxq_ctrl->priv != NULL); + assert(rxq_ctrl->priv->ctx != NULL); + assert(rxq_ctrl->wq != NULL); params = (struct ibv_exp_release_intf_params){ .comp_mask = 0, }; - claim_zero(ibv_exp_release_intf(rxq->priv->ctx, - rxq->if_wq, + claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx, + rxq_ctrl->if_wq, ¶ms)); } - if (rxq->if_cq != NULL) { - assert(rxq->priv != NULL); - assert(rxq->priv->ctx != NULL); - assert(rxq->cq != NULL); + if (rxq_ctrl->if_cq != NULL) { + assert(rxq_ctrl->priv != NULL); + assert(rxq_ctrl->priv->ctx != NULL); + assert(rxq_ctrl->cq != NULL); params = (struct ibv_exp_release_intf_params){ .comp_mask = 0, }; - claim_zero(ibv_exp_release_intf(rxq->priv->ctx, - rxq->if_cq, + claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx, + rxq_ctrl->if_cq, ¶ms)); } - if (rxq->wq != NULL) - claim_zero(ibv_exp_destroy_wq(rxq->wq)); - if (rxq->cq != NULL) - claim_zero(ibv_destroy_cq(rxq->cq)); - if (rxq->rd != NULL) { + if (rxq_ctrl->wq != NULL) + claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq)); + if (rxq_ctrl->cq != NULL) + claim_zero(ibv_destroy_cq(rxq_ctrl->cq)); + if (rxq_ctrl->rd != NULL) { struct ibv_exp_destroy_res_domain_attr attr = { .comp_mask = 0, }; - assert(rxq->priv != NULL); - assert(rxq->priv->ctx != NULL); - claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx, - rxq->rd, + assert(rxq_ctrl->priv != NULL); + assert(rxq_ctrl->priv->ctx != NULL); + claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx, + rxq_ctrl->rd, &attr)); } - if (rxq->mr != NULL) - claim_zero(ibv_dereg_mr(rxq->mr)); - memset(rxq, 0, sizeof(*rxq)); + if (rxq_ctrl->mr != NULL) + claim_zero(ibv_dereg_mr(rxq_ctrl->mr)); + memset(rxq_ctrl, 0, sizeof(*rxq_ctrl)); } /** - * Reconfigure a RX queue with new parameters. + * Reconfigure RX queue buffers. * * rxq_rehash() does not allocate mbufs, which, if not done from the right * thread (such as a control thread), may corrupt the pool. @@ -957,173 +796,109 @@ rxq_cleanup(struct rxq *rxq) * * @param dev * Pointer to Ethernet device structure. - * @param rxq + * @param rxq_ctrl * RX queue pointer. * * @return * 0 on success, errno value on failure. */ int -rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq) +rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl) { - struct priv *priv = rxq->priv; - struct rxq tmpl = *rxq; - unsigned int mbuf_n; - unsigned int desc_n; - struct rte_mbuf **pool; - unsigned int i, k; + unsigned int elts_n = rxq_ctrl->rxq.elts_n; + unsigned int i; struct ibv_exp_wq_attr mod; int err; - DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq); - /* Number of descriptors and mbufs currently allocated. */ - desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1)); - mbuf_n = desc_n; - /* Toggle RX checksum offload if hardware supports it. */ - if (priv->hw_csum) { - tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum; - rxq->csum = tmpl.csum; - } - if (priv->hw_csum_l2tun) { - tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum; - rxq->csum_l2tun = tmpl.csum_l2tun; - } - /* Enable scattered packets support for this queue if necessary. */ - if ((dev->data->dev_conf.rxmode.jumbo_frame) && - (dev->data->dev_conf.rxmode.max_rx_pkt_len > - (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) { - tmpl.sp = 1; - desc_n /= MLX5_PMD_SGE_WR_N; - } else - tmpl.sp = 0; - DEBUG("%p: %s scattered packets support (%u WRs)", - (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n); - /* If scatter mode is the same as before, nothing to do. */ - if (tmpl.sp == rxq->sp) { - DEBUG("%p: nothing to do", (void *)dev); - return 0; - } + DEBUG("%p: rehashing queue %p with %u SGE(s) per packet", + (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n); + assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n))); /* From now on, any failure will render the queue unusable. * Reinitialize WQ. */ mod = (struct ibv_exp_wq_attr){ .attr_mask = IBV_EXP_WQ_ATTR_STATE, .wq_state = IBV_EXP_WQS_RESET, }; - err = ibv_exp_modify_wq(tmpl.wq, &mod); + err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod); if (err) { ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err)); assert(err > 0); return err; } - /* Allocate pool. */ - pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0); - if (pool == NULL) { - ERROR("%p: cannot allocate memory", (void *)dev); - return ENOBUFS; - } /* Snatch mbufs from original queue. */ - k = 0; - if (rxq->sp) { - struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; - - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt_sp *elt = &(*elts)[i]; - unsigned int j; - - for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) { - assert(elt->bufs[j] != NULL); - pool[k++] = elt->bufs[j]; - } - } - } else { - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp; - - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf = elt->buf; + claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts)); + for (i = 0; i != elts_n; ++i) { + struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i]; - pool[k++] = buf; - } + assert(rte_mbuf_refcnt_read(buf) == 2); + rte_pktmbuf_free_seg(buf); } - assert(k == mbuf_n); - tmpl.elts_n = 0; - tmpl.elts.sp = NULL; - assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp); - err = ((tmpl.sp) ? - rxq_alloc_elts_sp(&tmpl, desc_n, pool) : - rxq_alloc_elts(&tmpl, desc_n, pool)); - if (err) { - ERROR("%p: cannot reallocate WRs, aborting", (void *)dev); - rte_free(pool); - assert(err > 0); - return err; - } - assert(tmpl.elts_n == desc_n); - assert(tmpl.elts.sp != NULL); - rte_free(pool); - /* Clean up original data. */ - rxq->elts_n = 0; - rte_free(rxq->elts.sp); - rxq->elts.sp = NULL; /* Change queue state to ready. */ mod = (struct ibv_exp_wq_attr){ .attr_mask = IBV_EXP_WQ_ATTR_STATE, .wq_state = IBV_EXP_WQS_RDY, }; - err = ibv_exp_modify_wq(tmpl.wq, &mod); + err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod); if (err) { ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s", (void *)dev, strerror(err)); goto error; } - /* Post SGEs. */ - assert(tmpl.if_wq != NULL); - if (tmpl.sp) { - struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp; - - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - err = tmpl.if_wq->recv_sg_list - (tmpl.wq, - (*elts)[i].sges, - RTE_DIM((*elts)[i].sges)); - if (err) - break; - } - } else { - struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp; - - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - err = tmpl.if_wq->recv_burst( - tmpl.wq, - &(*elts)[i].sge, - 1); - if (err) - break; - } - } - if (err) { - ERROR("%p: failed to post SGEs with error %d", - (void *)dev, err); - /* Set err because it does not contain a valid errno value. */ - err = EIO; - goto error; - } - if (tmpl.sp) - tmpl.recv = tmpl.if_wq->recv_sg_list; - else - tmpl.recv = tmpl.if_wq->recv_burst; + /* Update doorbell counter. */ + rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n; + rte_wmb(); + *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci); error: - *rxq = tmpl; assert(err >= 0); return err; } /** + * Initialize RX queue. + * + * @param tmpl + * Pointer to RX queue control template. + * + * @return + * 0 on success, errno value on failure. + */ +static inline int +rxq_setup(struct rxq_ctrl *tmpl) +{ + struct ibv_cq *ibcq = tmpl->cq; + struct mlx5_cq *cq = to_mxxx(cq, cq); + struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq); + struct rte_mbuf *(*elts)[tmpl->rxq.elts_n] = + rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket); + + if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) { + ERROR("Wrong MLX5_CQE_SIZE environment variable value: " + "it should be set to %u", RTE_CACHE_LINE_SIZE); + return EINVAL; + } + if (elts == NULL) + return ENOMEM; + tmpl->rxq.rq_db = rwq->rq.db; + tmpl->rxq.cqe_n = ibcq->cqe + 1; + tmpl->rxq.cq_ci = 0; + tmpl->rxq.rq_ci = 0; + tmpl->rxq.cq_db = cq->dbrec; + tmpl->rxq.wqes = + (volatile struct mlx5_wqe_data_seg (*)[]) + (uintptr_t)rwq->rq.buff; + tmpl->rxq.cqes = + (volatile struct mlx5_cqe (*)[]) + (uintptr_t)cq->active_buf->buf; + tmpl->rxq.elts = elts; + return 0; +} + +/** * Configure a RX queue. * * @param dev * Pointer to Ethernet device structure. - * @param rxq + * @param rxq_ctrl * Pointer to RX queue structure. * @param desc * Number of descriptors to configure in queue. @@ -1138,15 +913,18 @@ error: * 0 on success, errno value on failure. */ int -rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, - unsigned int socket, const struct rte_eth_rxconf *conf, - struct rte_mempool *mp) +rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, + uint16_t desc, unsigned int socket, + const struct rte_eth_rxconf *conf, struct rte_mempool *mp) { struct priv *priv = dev->data->dev_private; - struct rxq tmpl = { + struct rxq_ctrl tmpl = { .priv = priv, - .mp = mp, - .socket = socket + .socket = socket, + .rxq = { + .elts_n = desc, + .mp = mp, + }, }; struct ibv_exp_wq_attr mod; union { @@ -1154,44 +932,59 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, struct ibv_exp_cq_init_attr cq; struct ibv_exp_res_domain_init_attr rd; struct ibv_exp_wq_init_attr wq; + struct ibv_exp_cq_attr cq_attr; } attr; enum ibv_exp_query_intf_status status; - struct rte_mbuf *buf; + unsigned int mb_len = rte_pktmbuf_data_room_size(mp); + unsigned int cqe_n = desc - 1; + struct rte_mbuf *(*elts)[desc] = NULL; int ret = 0; - unsigned int i; - unsigned int cq_size = desc; (void)conf; /* Thresholds configuration (ignored). */ - if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) { - ERROR("%p: invalid number of RX descriptors (must be a" - " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N); - return EINVAL; + /* Enable scattered packets support for this queue if necessary. */ + assert(mb_len >= RTE_PKTMBUF_HEADROOM); + if ((dev->data->dev_conf.rxmode.jumbo_frame) && + (dev->data->dev_conf.rxmode.max_rx_pkt_len > + (mb_len - RTE_PKTMBUF_HEADROOM))) { + unsigned int size = + RTE_PKTMBUF_HEADROOM + + dev->data->dev_conf.rxmode.max_rx_pkt_len; + unsigned int sges_n; + + /* + * Determine the number of SGEs needed for a full packet + * and round it to the next power of two. + */ + sges_n = log2above((size / mb_len) + !!(size % mb_len)); + tmpl.rxq.sges_n = sges_n; + /* Make sure rxq.sges_n did not overflow. */ + size = mb_len * (1 << tmpl.rxq.sges_n); + size -= RTE_PKTMBUF_HEADROOM; + if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) { + ERROR("%p: too many SGEs (%u) needed to handle" + " requested maximum packet size %u", + (void *)dev, + 1 << sges_n, + dev->data->dev_conf.rxmode.max_rx_pkt_len); + return EOVERFLOW; + } } - /* Get mbuf length. */ - buf = rte_pktmbuf_alloc(mp); - if (buf == NULL) { - ERROR("%p: unable to allocate mbuf", (void *)dev); - return ENOMEM; + DEBUG("%p: maximum number of segments per packet: %u", + (void *)dev, 1 << tmpl.rxq.sges_n); + if (desc % (1 << tmpl.rxq.sges_n)) { + ERROR("%p: number of RX queue descriptors (%u) is not a" + " multiple of SGEs per packet (%u)", + (void *)dev, + desc, + 1 << tmpl.rxq.sges_n); + return EINVAL; } - tmpl.mb_len = buf->buf_len; - assert((rte_pktmbuf_headroom(buf) + - rte_pktmbuf_tailroom(buf)) == tmpl.mb_len); - assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM); - rte_pktmbuf_free(buf); /* Toggle RX checksum offload if hardware supports it. */ if (priv->hw_csum) - tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum; + tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum; if (priv->hw_csum_l2tun) - tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum; - /* Enable scattered packets support for this queue if necessary. */ - if ((dev->data->dev_conf.rxmode.jumbo_frame) && - (dev->data->dev_conf.rxmode.max_rx_pkt_len > - (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) { - tmpl.sp = 1; - desc /= MLX5_PMD_SGE_WR_N; - } - DEBUG("%p: %s scattered packets support (%u WRs)", - (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc); + tmpl.rxq.csum_l2tun = + !!dev->data->dev_conf.rxmode.hw_ip_checksum; /* Use the entire RX mempool as the memory region. */ tmpl.mr = mlx5_mp2mr(priv->pd, mp); if (tmpl.mr == NULL) { @@ -1217,7 +1010,12 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, .res_domain = tmpl.rd, }; - tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0, + if (priv->cqe_comp) { + attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS; + attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE; + cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */ + } + tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0, &attr.cq); if (tmpl.cq == NULL) { ret = ENOMEM; @@ -1230,64 +1028,51 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, DEBUG("priv->device_attr.max_sge is %d", priv->device_attr.max_sge); /* Configure VLAN stripping. */ - tmpl.vlan_strip = dev->data->dev_conf.rxmode.hw_vlan_strip; + tmpl.rxq.vlan_strip = (priv->hw_vlan_strip && + !!dev->data->dev_conf.rxmode.hw_vlan_strip); attr.wq = (struct ibv_exp_wq_init_attr){ .wq_context = NULL, /* Could be useful in the future. */ .wq_type = IBV_EXP_WQT_RQ, /* Max number of outstanding WRs. */ - .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ? - priv->device_attr.max_qp_wr : - (int)cq_size), + .max_recv_wr = desc >> tmpl.rxq.sges_n, /* Max number of scatter/gather elements in a WR. */ - .max_recv_sge = ((priv->device_attr.max_sge < - MLX5_PMD_SGE_WR_N) ? - priv->device_attr.max_sge : - MLX5_PMD_SGE_WR_N), + .max_recv_sge = 1 << tmpl.rxq.sges_n, .pd = priv->pd, .cq = tmpl.cq, .comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN | -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS IBV_EXP_CREATE_WQ_VLAN_OFFLOADS | -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ 0, .res_domain = tmpl.rd, -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - .vlan_offloads = (tmpl.vlan_strip ? + .vlan_offloads = (tmpl.rxq.vlan_strip ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0), -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ }; - -#ifdef HAVE_VERBS_FCS /* By default, FCS (CRC) is stripped by hardware. */ if (dev->data->dev_conf.rxmode.hw_strip_crc) { - tmpl.crc_present = 0; + tmpl.rxq.crc_present = 0; } else if (priv->hw_fcs_strip) { /* Ask HW/Verbs to leave CRC in place when supported. */ attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS; attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS; - tmpl.crc_present = 1; + tmpl.rxq.crc_present = 1; } else { WARN("%p: CRC stripping has been disabled but will still" " be performed by hardware, make sure MLNX_OFED and" " firmware are up to date", (void *)dev); - tmpl.crc_present = 0; + tmpl.rxq.crc_present = 0; } DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from" " incoming frames to hide it", (void *)dev, - tmpl.crc_present ? "disabled" : "enabled", - tmpl.crc_present << 2); -#endif /* HAVE_VERBS_FCS */ - -#ifdef HAVE_VERBS_RX_END_PADDING + tmpl.rxq.crc_present ? "disabled" : "enabled", + tmpl.rxq.crc_present << 2); if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING")) ; /* Nothing else to do. */ else if (priv->hw_padding) { INFO("%p: enabling packet padding on queue %p", - (void *)dev, (void *)rxq); + (void *)dev, (void *)rxq_ctrl); attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING; attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS; } else @@ -1295,7 +1080,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, " supported, make sure MLNX_OFED and firmware are" " up to date", (void *)dev); -#endif /* HAVE_VERBS_RX_END_PADDING */ tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq); if (tmpl.wq == NULL) { @@ -1304,23 +1088,25 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, (void *)dev, strerror(ret)); goto error; } - if (tmpl.sp) - ret = rxq_alloc_elts_sp(&tmpl, desc, NULL); - else - ret = rxq_alloc_elts(&tmpl, desc, NULL); - if (ret) { - ERROR("%p: RXQ allocation failed: %s", - (void *)dev, strerror(ret)); + /* + * Make sure number of WRs*SGEs match expectations since a queue + * cannot allocate more than "desc" buffers. + */ + if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) || + ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) { + ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs", + (void *)dev, + (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n), + attr.wq.max_recv_wr, attr.wq.max_recv_sge); + ret = EINVAL; goto error; } /* Save port ID. */ - tmpl.port_id = dev->data->port_id; - DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id); + tmpl.rxq.port_id = dev->data->port_id; + DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id); attr.params = (struct ibv_exp_query_intf_params){ .intf_scope = IBV_EXP_INTF_GLOBAL, -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS .intf_version = 1, -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ .intf = IBV_EXP_INTF_CQ, .obj = tmpl.cq, }; @@ -1352,56 +1138,47 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, (void *)dev, strerror(ret)); goto error; } - /* Post SGEs. */ - if (tmpl.sp) { - struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp; - - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - ret = tmpl.if_wq->recv_sg_list - (tmpl.wq, - (*elts)[i].sges, - RTE_DIM((*elts)[i].sges)); - if (ret) - break; - } - } else { - struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp; - - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - ret = tmpl.if_wq->recv_burst( - tmpl.wq, - &(*elts)[i].sge, - 1); - if (ret) - break; - } + ret = rxq_setup(&tmpl); + if (ret) { + ERROR("%p: cannot initialize RX queue structure: %s", + (void *)dev, strerror(ret)); + goto error; } + /* Reuse buffers from original queue if possible. */ + if (rxq_ctrl->rxq.elts_n) { + assert(rxq_ctrl->rxq.elts_n == desc); + assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts); + ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts); + } else + ret = rxq_alloc_elts(&tmpl, desc, NULL); if (ret) { - ERROR("%p: failed to post SGEs with error %d", - (void *)dev, ret); - /* Set ret because it does not contain a valid errno value. */ - ret = EIO; + ERROR("%p: RXQ allocation failed: %s", + (void *)dev, strerror(ret)); goto error; } /* Clean up rxq in case we're reinitializing it. */ - DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq); - rxq_cleanup(rxq); - *rxq = tmpl; - DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl); + DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl); + rxq_cleanup(rxq_ctrl); + /* Move mbuf pointers to dedicated storage area in RX queue. */ + elts = (void *)(rxq_ctrl + 1); + rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts)); +#ifndef NDEBUG + memset(tmpl.rxq.elts, 0x55, sizeof(*elts)); +#endif + rte_free(tmpl.rxq.elts); + tmpl.rxq.elts = elts; + *rxq_ctrl = tmpl; + /* Update doorbell counter. */ + rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n; + rte_wmb(); + *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci); + DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl); assert(ret == 0); - /* Assign function in queue. */ -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - rxq->poll = rxq->if_cq->poll_length_flags_cvlan; -#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ - rxq->poll = rxq->if_cq->poll_length_flags; -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ - if (rxq->sp) - rxq->recv = rxq->if_wq->recv_sg_list; - else - rxq->recv = rxq->if_wq->recv_burst; return 0; error: + elts = tmpl.rxq.elts; rxq_cleanup(&tmpl); + rte_free(elts); assert(ret > 0); return ret; } @@ -1432,12 +1209,19 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, { struct priv *priv = dev->data->dev_private; struct rxq *rxq = (*priv->rxqs)[idx]; + struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); int ret; if (mlx5_is_secondary()) return -E_RTE_SECONDARY; priv_lock(priv); + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + WARN("%p: increased number of descriptors in RX queue %u" + " to the next power of two (%d)", + (void *)dev, idx, desc); + } DEBUG("%p: configuring queue %u for %u descriptors", (void *)dev, idx, desc); if (idx >= priv->rxqs_n) { @@ -1454,29 +1238,28 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, return -EEXIST; } (*priv->rxqs)[idx] = NULL; - rxq_cleanup(rxq); + rxq_cleanup(rxq_ctrl); } else { - rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket); - if (rxq == NULL) { + rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) + + desc * sizeof(struct rte_mbuf *), + 0, socket); + if (rxq_ctrl == NULL) { ERROR("%p: unable to allocate queue index %u", (void *)dev, idx); priv_unlock(priv); return -ENOMEM; } } - ret = rxq_setup(dev, rxq, desc, socket, conf, mp); + ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp); if (ret) - rte_free(rxq); + rte_free(rxq_ctrl); else { - rxq->stats.idx = idx; + rxq_ctrl->rxq.stats.idx = idx; DEBUG("%p: adding RX queue %p to list", - (void *)dev, (void *)rxq); - (*priv->rxqs)[idx] = rxq; + (void *)dev, (void *)rxq_ctrl); + (*priv->rxqs)[idx] = &rxq_ctrl->rxq; /* Update receive callback. */ - if (rxq->sp) - dev->rx_pkt_burst = mlx5_rx_burst_sp; - else - dev->rx_pkt_burst = mlx5_rx_burst; + dev->rx_pkt_burst = mlx5_rx_burst; } priv_unlock(priv); return -ret; @@ -1492,6 +1275,7 @@ void mlx5_rx_queue_release(void *dpdk_rxq) { struct rxq *rxq = (struct rxq *)dpdk_rxq; + struct rxq_ctrl *rxq_ctrl; struct priv *priv; unsigned int i; @@ -1500,17 +1284,18 @@ mlx5_rx_queue_release(void *dpdk_rxq) if (rxq == NULL) return; - priv = rxq->priv; + rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); + priv = rxq_ctrl->priv; priv_lock(priv); for (i = 0; (i != priv->rxqs_n); ++i) if ((*priv->rxqs)[i] == rxq) { DEBUG("%p: removing RX queue %p from list", - (void *)priv->dev, (void *)rxq); + (void *)priv->dev, (void *)rxq_ctrl); (*priv->rxqs)[i] = NULL; break; } - rxq_cleanup(rxq); - rte_free(rxq); + rxq_cleanup(rxq_ctrl); + rte_free(rxq_ctrl); priv_unlock(priv); } @@ -1535,7 +1320,8 @@ mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct rxq *rxq = dpdk_rxq; - struct priv *priv = mlx5_secondary_data_setup(rxq->priv); + struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); + struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv); struct priv *primary_priv; unsigned int index; diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 9d1380a0..0c352f3f 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -42,6 +42,8 @@ #pragma GCC diagnostic ignored "-pedantic" #endif #include <infiniband/verbs.h> +#include <infiniband/mlx5_hw.h> +#include <infiniband/arch.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -55,7 +57,7 @@ #include <rte_prefetch.h> #include <rte_common.h> #include <rte_branch_prediction.h> -#include <rte_memory.h> +#include <rte_ether.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -65,125 +67,161 @@ #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" +#include "mlx5_prm.h" + +#ifndef NDEBUG /** - * Manage TX completions. - * - * When sending a burst, mlx5_tx_burst() posts several WRs. - * To improve performance, a completion event is only required once every - * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information - * for other WRs, but this information would not be used anyway. + * Verify or set magic value in CQE. * - * @param txq - * Pointer to TX queue structure. + * @param cqe + * Pointer to CQE. * * @return - * 0 on success, -1 on failure. + * 0 the first time. */ -static int -txq_complete(struct txq *txq) +static inline int +check_cqe64_seen(volatile struct mlx5_cqe64 *cqe) { - unsigned int elts_comp = txq->elts_comp; - unsigned int elts_tail = txq->elts_tail; - unsigned int elts_free = txq->elts_tail; - const unsigned int elts_n = txq->elts_n; - int wcs_n; + static const uint8_t magic[] = "seen"; + volatile uint8_t (*buf)[sizeof(cqe->rsvd40)] = &cqe->rsvd40; + int ret = 1; + unsigned int i; - if (unlikely(elts_comp == 0)) - return 0; -#ifdef DEBUG_SEND - DEBUG("%p: processing %u work requests completions", - (void *)txq, elts_comp); -#endif - wcs_n = txq->poll_cnt(txq->cq, elts_comp); - if (unlikely(wcs_n == 0)) - return 0; - if (unlikely(wcs_n < 0)) { - DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)", - (void *)txq, wcs_n); - return -1; - } - elts_comp -= wcs_n; - assert(elts_comp <= txq->elts_comp); - /* - * Assume WC status is successful as nothing can be done about it - * anyway. - */ - elts_tail += wcs_n * txq->elts_comp_cd_init; - if (elts_tail >= elts_n) - elts_tail -= elts_n; + for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i) + if (!ret || (*buf)[i] != magic[i]) { + ret = 0; + (*buf)[i] = magic[i]; + } + return ret; +} - while (elts_free != elts_tail) { - struct txq_elt *elt = &(*txq->elts)[elts_free]; - unsigned int elts_free_next = - (((elts_free + 1) == elts_n) ? 0 : elts_free + 1); - struct rte_mbuf *tmp = elt->buf; - struct txq_elt *elt_next = &(*txq->elts)[elts_free_next]; +#endif /* NDEBUG */ + +static inline int +check_cqe64(volatile struct mlx5_cqe64 *cqe, + unsigned int cqes_n, const uint16_t ci) + __attribute__((always_inline)); + +/** + * Check whether CQE is valid. + * + * @param cqe + * Pointer to CQE. + * @param cqes_n + * Size of completion queue. + * @param ci + * Consumer index. + * + * @return + * 0 on success, 1 on failure. + */ +static inline int +check_cqe64(volatile struct mlx5_cqe64 *cqe, + unsigned int cqes_n, const uint16_t ci) +{ + uint16_t idx = ci & cqes_n; + uint8_t op_own = cqe->op_own; + uint8_t op_owner = MLX5_CQE_OWNER(op_own); + uint8_t op_code = MLX5_CQE_OPCODE(op_own); + if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID))) + return 1; /* No CQE. */ #ifndef NDEBUG - /* Poisoning. */ - memset(elt, 0x66, sizeof(*elt)); -#endif - RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); - /* Faster than rte_pktmbuf_free(). */ - do { - struct rte_mbuf *next = NEXT(tmp); + if ((op_code == MLX5_CQE_RESP_ERR) || + (op_code == MLX5_CQE_REQ_ERR)) { + volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe; + uint8_t syndrome = err_cqe->syndrome; - rte_pktmbuf_free_seg(tmp); - tmp = next; - } while (tmp != NULL); - elts_free = elts_free_next; + if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) || + (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) + return 0; + if (!check_cqe64_seen(cqe)) + ERROR("unexpected CQE error %u (0x%02x)" + " syndrome 0x%02x", + op_code, op_code, syndrome); + return 1; + } else if ((op_code != MLX5_CQE_RESP_SEND) && + (op_code != MLX5_CQE_REQ)) { + if (!check_cqe64_seen(cqe)) + ERROR("unexpected CQE opcode %u (0x%02x)", + op_code, op_code); + return 1; } - - txq->elts_tail = elts_tail; - txq->elts_comp = elts_comp; +#endif /* NDEBUG */ return 0; } -/* For best performance, this function should not be inlined. */ -struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *) - __attribute__((noinline)); - /** - * Register mempool as a memory region. + * Manage TX completions. * - * @param pd - * Pointer to protection domain. - * @param mp - * Pointer to memory pool. + * When sending a burst, mlx5_tx_burst() posts several WRs. * - * @return - * Memory region pointer, NULL in case of error. + * @param txq + * Pointer to TX queue structure. */ -struct ibv_mr * -mlx5_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp) +static void +txq_complete(struct txq *txq) { - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - uintptr_t start = mp->elt_va_start; - uintptr_t end = mp->elt_va_end; - unsigned int i; + const unsigned int elts_n = txq->elts_n; + const unsigned int cqe_n = txq->cqe_n; + const unsigned int cqe_cnt = cqe_n - 1; + uint16_t elts_free = txq->elts_tail; + uint16_t elts_tail; + uint16_t cq_ci = txq->cq_ci; + volatile struct mlx5_cqe64 *cqe = NULL; + volatile union mlx5_wqe *wqe; + + do { + volatile struct mlx5_cqe64 *tmp; - DEBUG("mempool %p area start=%p end=%p size=%zu", - (const void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - /* Round start and end to page boundary if found in memory segments. */ - for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) { - uintptr_t addr = (uintptr_t)ms[i].addr; - size_t len = ms[i].len; - unsigned int align = ms[i].hugepage_sz; - - if ((start > addr) && (start < addr + len)) - start = RTE_ALIGN_FLOOR(start, align); - if ((end > addr) && (end < addr + len)) - end = RTE_ALIGN_CEIL(end, align); + tmp = &(*txq->cqes)[cq_ci & cqe_cnt].cqe64; + if (check_cqe64(tmp, cqe_n, cq_ci)) + break; + cqe = tmp; +#ifndef NDEBUG + if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) { + if (!check_cqe64_seen(cqe)) + ERROR("unexpected compressed CQE, TX stopped"); + return; + } + if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || + (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { + if (!check_cqe64_seen(cqe)) + ERROR("unexpected error CQE, TX stopped"); + return; + } +#endif /* NDEBUG */ + ++cq_ci; + } while (1); + if (unlikely(cqe == NULL)) + return; + wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)]; + elts_tail = wqe->wqe.ctrl.data[3]; + assert(elts_tail < txq->wqe_n); + /* Free buffers. */ + while (elts_free != elts_tail) { + struct rte_mbuf *elt = (*txq->elts)[elts_free]; + unsigned int elts_free_next = + (elts_free + 1) & (elts_n - 1); + struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next]; + +#ifndef NDEBUG + /* Poisoning. */ + memset(&(*txq->elts)[elts_free], + 0x66, + sizeof((*txq->elts)[elts_free])); +#endif + RTE_MBUF_PREFETCH_TO_FREE(elt_next); + /* Only one segment needs to be freed. */ + rte_pktmbuf_free_seg(elt); + elts_free = elts_free_next; } - DEBUG("mempool %p using start=%p end=%p size=%zu for MR", - (const void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - return ibv_reg_mr(pd, - (void *)start, - end - start, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + txq->cq_ci = cq_ci; + txq->elts_tail = elts_tail; + /* Update the consumer index. */ + rte_wmb(); + *txq->cq_db = htonl(cq_ci); } /** @@ -204,6 +242,10 @@ txq_mb2mp(struct rte_mbuf *buf) return buf->pool; } +static inline uint32_t +txq_mp2mr(struct txq *txq, struct rte_mempool *mp) + __attribute__((always_inline)); + /** * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, @@ -217,11 +259,11 @@ txq_mb2mp(struct rte_mbuf *buf) * @return * mr->lkey on success, (uint32_t)-1 on failure. */ -static uint32_t -txq_mp2mr(struct txq *txq, const struct rte_mempool *mp) +static inline uint32_t +txq_mp2mr(struct txq *txq, struct rte_mempool *mp) { unsigned int i; - struct ibv_mr *mr; + uint32_t lkey = (uint32_t)-1; for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { if (unlikely(txq->mp2mr[i].mp == NULL)) { @@ -230,295 +272,681 @@ txq_mp2mr(struct txq *txq, const struct rte_mempool *mp) } if (txq->mp2mr[i].mp == mp) { assert(txq->mp2mr[i].lkey != (uint32_t)-1); - assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey); - return txq->mp2mr[i].lkey; + assert(htonl(txq->mp2mr[i].mr->lkey) == + txq->mp2mr[i].lkey); + lkey = txq->mp2mr[i].lkey; + break; } } - /* Add a new entry, register MR first. */ - DEBUG("%p: discovered new memory pool \"%s\" (%p)", - (void *)txq, mp->name, (const void *)mp); - mr = mlx5_mp2mr(txq->priv->pd, mp); - if (unlikely(mr == NULL)) { - DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.", - (void *)txq); - return (uint32_t)-1; - } - if (unlikely(i == RTE_DIM(txq->mp2mr))) { - /* Table is full, remove oldest entry. */ - DEBUG("%p: MR <-> MP table full, dropping oldest entry.", - (void *)txq); - --i; - claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr)); - memmove(&txq->mp2mr[0], &txq->mp2mr[1], - (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0]))); - } - /* Store the new entry. */ - txq->mp2mr[i].mp = mp; - txq->mp2mr[i].mr = mr; - txq->mp2mr[i].lkey = mr->lkey; - DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, - (void *)txq, mp->name, (const void *)mp, txq->mp2mr[i].lkey); - return txq->mp2mr[i].lkey; + if (unlikely(lkey == (uint32_t)-1)) + lkey = txq_mp2mr_reg(txq, mp, i); + return lkey; } -struct txq_mp2mr_mbuf_check_data { - const struct rte_mempool *mp; - int ret; -}; - /** - * Callback function for rte_mempool_obj_iter() to check whether a given - * mempool object looks like a mbuf. - * - * @param[in, out] arg - * Context data (struct txq_mp2mr_mbuf_check_data). Contains mempool pointer - * and return value. - * @param[in] start - * Object start address. - * @param[in] end - * Object end address. - * @param index - * Unused. + * Write a regular WQE. * - * @return - * Nonzero value when object is not a mbuf. + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the WQE to fill. + * @param addr + * Buffer data address. + * @param length + * Packet length. + * @param lkey + * Memory region lkey. */ -static void -txq_mp2mr_mbuf_check(void *arg, void *start, void *end, - uint32_t index __rte_unused) +static inline void +mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe, + uintptr_t addr, uint32_t length, uint32_t lkey) { - struct txq_mp2mr_mbuf_check_data *data = arg; - struct rte_mbuf *buf = - (void *)((uintptr_t)start + data->mp->header_size); - - (void)index; - /* Check whether mbuf structure fits element size and whether mempool - * pointer is valid. */ - if (((uintptr_t)end >= (uintptr_t)(buf + 1)) && - (buf->pool == data->mp)) - data->ret = 0; - else - data->ret = -1; + wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND); + wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4); + wqe->wqe.ctrl.data[2] = 0; + wqe->wqe.ctrl.data[3] = 0; + wqe->inl.eseg.rsvd0 = 0; + wqe->inl.eseg.rsvd1 = 0; + wqe->inl.eseg.mss = 0; + wqe->inl.eseg.rsvd2 = 0; + wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + /* Copy the first 16 bytes into inline header. */ + rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start, + (uint8_t *)(uintptr_t)addr, + MLX5_ETH_INLINE_HEADER_SIZE); + addr += MLX5_ETH_INLINE_HEADER_SIZE; + length -= MLX5_ETH_INLINE_HEADER_SIZE; + /* Store remaining data in data segment. */ + wqe->wqe.dseg.byte_count = htonl(length); + wqe->wqe.dseg.lkey = lkey; + wqe->wqe.dseg.addr = htonll(addr); + /* Increment consumer index. */ + ++txq->wqe_ci; } /** - * Iterator function for rte_mempool_walk() to register existing mempools and - * fill the MP to MR cache of a TX queue. + * Write a regular WQE with VLAN. * - * @param[in] mp - * Memory Pool to register. - * @param *arg + * @param txq * Pointer to TX queue structure. + * @param wqe + * Pointer to the WQE to fill. + * @param addr + * Buffer data address. + * @param length + * Packet length. + * @param lkey + * Memory region lkey. + * @param vlan_tci + * VLAN field to insert in packet. */ -void -txq_mp2mr_iter(const struct rte_mempool *mp, void *arg) +static inline void +mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe, + uintptr_t addr, uint32_t length, uint32_t lkey, + uint16_t vlan_tci) { - struct txq *txq = arg; - struct txq_mp2mr_mbuf_check_data data = { - .mp = mp, - .ret = -1, - }; + uint32_t vlan = htonl(0x81000000 | vlan_tci); - /* Discard empty mempools. */ - if (mp->size == 0) - return; - /* Register mempool only if the first element looks like a mbuf. */ - rte_mempool_obj_iter((void *)mp->elt_va_start, - 1, - mp->header_size + mp->elt_size + mp->trailer_size, - 1, - mp->elt_pa, - mp->pg_num, - mp->pg_shift, - txq_mp2mr_mbuf_check, - &data); - if (data.ret) - return; - txq_mp2mr(txq, mp); + wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND); + wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4); + wqe->wqe.ctrl.data[2] = 0; + wqe->wqe.ctrl.data[3] = 0; + wqe->inl.eseg.rsvd0 = 0; + wqe->inl.eseg.rsvd1 = 0; + wqe->inl.eseg.mss = 0; + wqe->inl.eseg.rsvd2 = 0; + wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE); + /* + * Copy 12 bytes of source & destination MAC address. + * Copy 4 bytes of VLAN. + * Copy 2 bytes of Ether type. + */ + rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start, + (uint8_t *)(uintptr_t)addr, 12); + rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12), + &vlan, sizeof(vlan)); + rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16), + (uint8_t *)((uintptr_t)addr + 12), 2); + addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan); + length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan); + /* Store remaining data in data segment. */ + wqe->wqe.dseg.byte_count = htonl(length); + wqe->wqe.dseg.lkey = lkey; + wqe->wqe.dseg.addr = htonll(addr); + /* Increment consumer index. */ + ++txq->wqe_ci; } /** - * Insert VLAN using mbuf headroom space. + * Write a inline WQE. * - * @param buf - * Buffer for VLAN insertion. + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the WQE to fill. + * @param addr + * Buffer data address. + * @param length + * Packet length. + * @param lkey + * Memory region lkey. + */ +static inline void +mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe, + uintptr_t addr, uint32_t length) +{ + uint32_t size; + uint16_t wqe_cnt = txq->wqe_n - 1; + uint16_t wqe_ci = txq->wqe_ci + 1; + + /* Copy the first 16 bytes into inline header. */ + rte_memcpy((void *)(uintptr_t)wqe->inl.eseg.inline_hdr_start, + (void *)(uintptr_t)addr, + MLX5_ETH_INLINE_HEADER_SIZE); + addr += MLX5_ETH_INLINE_HEADER_SIZE; + length -= MLX5_ETH_INLINE_HEADER_SIZE; + size = 3 + ((4 + length + 15) / 16); + wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG); + rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0], + (void *)addr, MLX5_WQE64_INL_DATA); + addr += MLX5_WQE64_INL_DATA; + length -= MLX5_WQE64_INL_DATA; + while (length) { + volatile union mlx5_wqe *wqe_next = + &(*txq->wqes)[wqe_ci & wqe_cnt]; + uint32_t copy_bytes = (length > sizeof(*wqe)) ? + sizeof(*wqe) : + length; + + rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0], + (uint8_t *)addr); + addr += copy_bytes; + length -= copy_bytes; + ++wqe_ci; + } + assert(size < 64); + wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND); + wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size); + wqe->inl.ctrl.data[2] = 0; + wqe->inl.ctrl.data[3] = 0; + wqe->inl.eseg.rsvd0 = 0; + wqe->inl.eseg.rsvd1 = 0; + wqe->inl.eseg.mss = 0; + wqe->inl.eseg.rsvd2 = 0; + wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + /* Increment consumer index. */ + txq->wqe_ci = wqe_ci; +} + +/** + * Write a inline WQE with VLAN. * - * @return - * 0 on success, errno value on failure. + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the WQE to fill. + * @param addr + * Buffer data address. + * @param length + * Packet length. + * @param lkey + * Memory region lkey. + * @param vlan_tci + * VLAN field to insert in packet. */ -static inline int -insert_vlan_sw(struct rte_mbuf *buf) +static inline void +mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe, + uintptr_t addr, uint32_t length, uint16_t vlan_tci) { - uintptr_t addr; - uint32_t vlan; - uint16_t head_room_len = rte_pktmbuf_headroom(buf); + uint32_t size; + uint32_t wqe_cnt = txq->wqe_n - 1; + uint16_t wqe_ci = txq->wqe_ci + 1; + uint32_t vlan = htonl(0x81000000 | vlan_tci); - if (head_room_len < 4) - return EINVAL; + /* + * Copy 12 bytes of source & destination MAC address. + * Copy 4 bytes of VLAN. + * Copy 2 bytes of Ether type. + */ + rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start, + (uint8_t *)addr, 12); + rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 12, + &vlan, sizeof(vlan)); + rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 16, + ((uint8_t *)addr + 12), 2); + addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan); + length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan); + size = (sizeof(wqe->inl.ctrl.ctrl) + + sizeof(wqe->inl.eseg) + + sizeof(wqe->inl.byte_cnt) + + length + 15) / 16; + wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG); + rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0], + (void *)addr, MLX5_WQE64_INL_DATA); + addr += MLX5_WQE64_INL_DATA; + length -= MLX5_WQE64_INL_DATA; + while (length) { + volatile union mlx5_wqe *wqe_next = + &(*txq->wqes)[wqe_ci & wqe_cnt]; + uint32_t copy_bytes = (length > sizeof(*wqe)) ? + sizeof(*wqe) : + length; - addr = rte_pktmbuf_mtod(buf, uintptr_t); - vlan = htonl(0x81000000 | buf->vlan_tci); - memmove((void *)(addr - 4), (void *)addr, 12); - memcpy((void *)(addr + 8), &vlan, sizeof(vlan)); + rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0], + (uint8_t *)addr); + addr += copy_bytes; + length -= copy_bytes; + ++wqe_ci; + } + assert(size < 64); + wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND); + wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size); + wqe->inl.ctrl.data[2] = 0; + wqe->inl.ctrl.data[3] = 0; + wqe->inl.eseg.rsvd0 = 0; + wqe->inl.eseg.rsvd1 = 0; + wqe->inl.eseg.mss = 0; + wqe->inl.eseg.rsvd2 = 0; + wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE); + /* Increment consumer index. */ + txq->wqe_ci = wqe_ci; +} + +/** + * Ring TX queue doorbell. + * + * @param txq + * Pointer to TX queue structure. + */ +static inline void +mlx5_tx_dbrec(struct txq *txq) +{ + uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset); + uint32_t data[4] = { + htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND), + htonl(txq->qp_num_8s), + 0, + 0, + }; + rte_wmb(); + *txq->qp_db = htonl(txq->wqe_ci); + /* Ensure ordering between DB record and BF copy. */ + rte_wmb(); + rte_mov16(dst, (uint8_t *)data); + txq->bf_offset ^= txq->bf_buf_size; +} - SET_DATA_OFF(buf, head_room_len - 4); - DATA_LEN(buf) += 4; +/** + * Prefetch a CQE. + * + * @param txq + * Pointer to TX queue structure. + * @param cqe_ci + * CQE consumer index. + */ +static inline void +tx_prefetch_cqe(struct txq *txq, uint16_t ci) +{ + volatile struct mlx5_cqe64 *cqe; - return 0; + cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64; + rte_prefetch0(cqe); } -#if MLX5_PMD_SGE_WR_N > 1 +/** + * Prefetch a WQE. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe_ci + * WQE consumer index. + */ +static inline void +tx_prefetch_wqe(struct txq *txq, uint16_t ci) +{ + volatile union mlx5_wqe *wqe; + + wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)]; + rte_prefetch0(wqe); +} /** - * Copy scattered mbuf contents to a single linear buffer. + * DPDK callback for TX. * - * @param[out] linear - * Linear output buffer. - * @param[in] buf - * Scattered input buffer. + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. * * @return - * Number of bytes copied to the output buffer or 0 if not large enough. + * Number of packets successfully transmitted (<= pkts_n). */ -static unsigned int -linearize_mbuf(linear_t *linear, struct rte_mbuf *buf) +uint16_t +mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { - unsigned int size = 0; - unsigned int offset; + struct txq *txq = (struct txq *)dpdk_txq; + uint16_t elts_head = txq->elts_head; + const unsigned int elts_n = txq->elts_n; + unsigned int i = 0; + unsigned int j = 0; + unsigned int max; + unsigned int comp; + volatile union mlx5_wqe *wqe = NULL; + if (unlikely(!pkts_n)) + return 0; + /* Prefetch first packet cacheline. */ + tx_prefetch_cqe(txq, txq->cq_ci); + tx_prefetch_cqe(txq, txq->cq_ci + 1); + rte_prefetch0(*pkts); + /* Start processing. */ + txq_complete(txq); + max = (elts_n - (elts_head - txq->elts_tail)); + if (max > elts_n) + max -= elts_n; do { - unsigned int len = DATA_LEN(buf); + struct rte_mbuf *buf = *(pkts++); + unsigned int elts_head_next; + uintptr_t addr; + uint32_t length; + uint32_t lkey; + unsigned int segs_n = buf->nb_segs; + volatile struct mlx5_wqe_data_seg *dseg; + unsigned int ds = sizeof(*wqe) / 16; - offset = size; - size += len; - if (unlikely(size > sizeof(*linear))) - return 0; - memcpy(&(*linear)[offset], - rte_pktmbuf_mtod(buf, uint8_t *), - len); - buf = NEXT(buf); - } while (buf != NULL); - return size; + /* + * Make sure there is enough room to store this packet and + * that one ring entry remains unused. + */ + assert(segs_n); + if (max < segs_n + 1) + break; + max -= segs_n; + --pkts_n; + elts_head_next = (elts_head + 1) & (elts_n - 1); + wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)]; + dseg = &wqe->wqe.dseg; + rte_prefetch0(wqe); + if (pkts_n) + rte_prefetch0(*pkts); + /* Retrieve buffer information. */ + addr = rte_pktmbuf_mtod(buf, uintptr_t); + length = DATA_LEN(buf); + /* Update element. */ + (*txq->elts)[elts_head] = buf; + /* Prefetch next buffer data. */ + if (pkts_n) + rte_prefetch0(rte_pktmbuf_mtod(*pkts, + volatile void *)); + /* Retrieve Memory Region key for this memory pool. */ + lkey = txq_mp2mr(txq, txq_mb2mp(buf)); + if (buf->ol_flags & PKT_TX_VLAN_PKT) + mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey, + buf->vlan_tci); + else + mlx5_wqe_write(txq, wqe, addr, length, lkey); + /* Should we enable HW CKSUM offload */ + if (buf->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { + wqe->wqe.eseg.cs_flags = + MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + } else { + wqe->wqe.eseg.cs_flags = 0; + } + while (--segs_n) { + /* + * Spill on next WQE when the current one does not have + * enough room left. Size of WQE must a be a multiple + * of data segment size. + */ + assert(!(sizeof(*wqe) % sizeof(*dseg))); + if (!(ds % (sizeof(*wqe) / 16))) + dseg = (volatile void *) + &(*txq->wqes)[txq->wqe_ci++ & + (txq->wqe_n - 1)]; + else + ++dseg; + ++ds; + buf = buf->next; + assert(buf); + /* Store segment information. */ + dseg->byte_count = htonl(DATA_LEN(buf)); + dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf)); + dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); + (*txq->elts)[elts_head_next] = buf; + elts_head_next = (elts_head_next + 1) & (elts_n - 1); +#ifdef MLX5_PMD_SOFT_COUNTERS + length += DATA_LEN(buf); +#endif + ++j; + } + /* Update DS field in WQE. */ + wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0); + wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f); + elts_head = elts_head_next; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent bytes counter. */ + txq->stats.obytes += length; +#endif + elts_head = elts_head_next; + ++i; + } while (pkts_n); + /* Take a shortcut if nothing must be sent. */ + if (unlikely(i == 0)) + return 0; + /* Check whether completion threshold has been reached. */ + comp = txq->elts_comp + i + j; + if (comp >= MLX5_TX_COMP_THRESH) { + /* Request completion on last WQE. */ + wqe->wqe.ctrl.data[2] = htonl(8); + /* Save elts_head in unused "immediate" field of WQE. */ + wqe->wqe.ctrl.data[3] = elts_head; + txq->elts_comp = 0; + } else { + txq->elts_comp = comp; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent packets counter. */ + txq->stats.opackets += i; +#endif + /* Ring QP doorbell. */ + mlx5_tx_dbrec(txq); + txq->elts_head = elts_head; + return i; } /** - * Handle scattered buffers for mlx5_tx_burst(). + * DPDK callback for TX with inline support. * - * @param txq - * TX queue structure. - * @param segs - * Number of segments in buf. - * @param elt - * TX queue element to fill. - * @param[in] buf - * Buffer to process. - * @param elts_head - * Index of the linear buffer to use if necessary (normally txq->elts_head). - * @param[out] sges - * Array filled with SGEs on success. + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. * * @return - * A structure containing the processed packet size in bytes and the - * number of SGEs. Both fields are set to (unsigned int)-1 in case of - * failure. + * Number of packets successfully transmitted (<= pkts_n). */ -static struct tx_burst_sg_ret { - unsigned int length; - unsigned int num; -} -tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt, - struct rte_mbuf *buf, unsigned int elts_head, - struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N]) +uint16_t +mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { - unsigned int sent_size = 0; - unsigned int j; - int linearize = 0; - - /* When there are too many segments, extra segments are - * linearized in the last SGE. */ - if (unlikely(segs > RTE_DIM(*sges))) { - segs = (RTE_DIM(*sges) - 1); - linearize = 1; - } - /* Update element. */ - elt->buf = buf; - /* Register segments as SGEs. */ - for (j = 0; (j != segs); ++j) { - struct ibv_sge *sge = &(*sges)[j]; + struct txq *txq = (struct txq *)dpdk_txq; + uint16_t elts_head = txq->elts_head; + const unsigned int elts_n = txq->elts_n; + unsigned int i = 0; + unsigned int j = 0; + unsigned int max; + unsigned int comp; + volatile union mlx5_wqe *wqe = NULL; + unsigned int max_inline = txq->max_inline; + + if (unlikely(!pkts_n)) + return 0; + /* Prefetch first packet cacheline. */ + tx_prefetch_cqe(txq, txq->cq_ci); + tx_prefetch_cqe(txq, txq->cq_ci + 1); + rte_prefetch0(*pkts); + /* Start processing. */ + txq_complete(txq); + max = (elts_n - (elts_head - txq->elts_tail)); + if (max > elts_n) + max -= elts_n; + do { + struct rte_mbuf *buf = *(pkts++); + unsigned int elts_head_next; + uintptr_t addr; + uint32_t length; uint32_t lkey; + unsigned int segs_n = buf->nb_segs; + volatile struct mlx5_wqe_data_seg *dseg; + unsigned int ds = sizeof(*wqe) / 16; - /* Retrieve Memory Region key for this memory pool. */ - lkey = txq_mp2mr(txq, txq_mb2mp(buf)); - if (unlikely(lkey == (uint32_t)-1)) { - /* MR does not exist. */ - DEBUG("%p: unable to get MP <-> MR association", - (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; + /* + * Make sure there is enough room to store this packet and + * that one ring entry remains unused. + */ + assert(segs_n); + if (max < segs_n + 1) + break; + max -= segs_n; + --pkts_n; + elts_head_next = (elts_head + 1) & (elts_n - 1); + wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)]; + dseg = &wqe->wqe.dseg; + tx_prefetch_wqe(txq, txq->wqe_ci); + tx_prefetch_wqe(txq, txq->wqe_ci + 1); + if (pkts_n) + rte_prefetch0(*pkts); + /* Should we enable HW CKSUM offload */ + if (buf->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { + wqe->inl.eseg.cs_flags = + MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + } else { + wqe->inl.eseg.cs_flags = 0; } - /* Update SGE. */ - sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); - if (txq->priv->vf) - rte_prefetch0((volatile void *) - (uintptr_t)sge->addr); - sge->length = DATA_LEN(buf); - sge->lkey = lkey; - sent_size += sge->length; - buf = NEXT(buf); - } - /* If buf is not NULL here and is not going to be linearized, - * nb_segs is not valid. */ - assert(j == segs); - assert((buf == NULL) || (linearize)); - /* Linearize extra segments. */ - if (linearize) { - struct ibv_sge *sge = &(*sges)[segs]; - linear_t *linear = &(*txq->elts_linear)[elts_head]; - unsigned int size = linearize_mbuf(linear, buf); - - assert(segs == (RTE_DIM(*sges) - 1)); - if (size == 0) { - /* Invalid packet. */ - DEBUG("%p: packet too large to be linearized.", - (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; + /* Retrieve buffer information. */ + addr = rte_pktmbuf_mtod(buf, uintptr_t); + length = DATA_LEN(buf); + /* Update element. */ + (*txq->elts)[elts_head] = buf; + /* Prefetch next buffer data. */ + if (pkts_n) + rte_prefetch0(rte_pktmbuf_mtod(*pkts, + volatile void *)); + if ((length <= max_inline) && (segs_n == 1)) { + if (buf->ol_flags & PKT_TX_VLAN_PKT) + mlx5_wqe_write_inline_vlan(txq, wqe, + addr, length, + buf->vlan_tci); + else + mlx5_wqe_write_inline(txq, wqe, addr, length); + goto skip_segs; + } else { + /* Retrieve Memory Region key for this memory pool. */ + lkey = txq_mp2mr(txq, txq_mb2mp(buf)); + if (buf->ol_flags & PKT_TX_VLAN_PKT) + mlx5_wqe_write_vlan(txq, wqe, addr, length, + lkey, buf->vlan_tci); + else + mlx5_wqe_write(txq, wqe, addr, length, lkey); } - /* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */ - if (RTE_DIM(*sges) == 1) { - do { - struct rte_mbuf *next = NEXT(buf); - - rte_pktmbuf_free_seg(buf); - buf = next; - } while (buf != NULL); - elt->buf = NULL; + while (--segs_n) { + /* + * Spill on next WQE when the current one does not have + * enough room left. Size of WQE must a be a multiple + * of data segment size. + */ + assert(!(sizeof(*wqe) % sizeof(*dseg))); + if (!(ds % (sizeof(*wqe) / 16))) + dseg = (volatile void *) + &(*txq->wqes)[txq->wqe_ci++ & + (txq->wqe_n - 1)]; + else + ++dseg; + ++ds; + buf = buf->next; + assert(buf); + /* Store segment information. */ + dseg->byte_count = htonl(DATA_LEN(buf)); + dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf)); + dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); + (*txq->elts)[elts_head_next] = buf; + elts_head_next = (elts_head_next + 1) & (elts_n - 1); +#ifdef MLX5_PMD_SOFT_COUNTERS + length += DATA_LEN(buf); +#endif + ++j; } - /* Update SGE. */ - sge->addr = (uintptr_t)&(*linear)[0]; - sge->length = size; - sge->lkey = txq->mr_linear->lkey; - sent_size += size; - /* Include last segment. */ - segs++; + /* Update DS field in WQE. */ + wqe->inl.ctrl.data[1] &= htonl(0xffffffc0); + wqe->inl.ctrl.data[1] |= htonl(ds & 0x3f); +skip_segs: + elts_head = elts_head_next; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent bytes counter. */ + txq->stats.obytes += length; +#endif + ++i; + } while (pkts_n); + /* Take a shortcut if nothing must be sent. */ + if (unlikely(i == 0)) + return 0; + /* Check whether completion threshold has been reached. */ + comp = txq->elts_comp + i + j; + if (comp >= MLX5_TX_COMP_THRESH) { + /* Request completion on last WQE. */ + wqe->inl.ctrl.data[2] = htonl(8); + /* Save elts_head in unused "immediate" field of WQE. */ + wqe->inl.ctrl.data[3] = elts_head; + txq->elts_comp = 0; + } else { + txq->elts_comp = comp; } - return (struct tx_burst_sg_ret){ - .length = sent_size, - .num = segs, - }; -stop: - return (struct tx_burst_sg_ret){ - .length = -1, - .num = -1, - }; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent packets counter. */ + txq->stats.opackets += i; +#endif + /* Ring QP doorbell. */ + mlx5_tx_dbrec(txq); + txq->elts_head = elts_head; + return i; } -#endif /* MLX5_PMD_SGE_WR_N > 1 */ +/** + * Open a MPW session. + * + * @param txq + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + * @param length + * Packet length. + */ +static inline void +mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) +{ + uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1); + volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] = + (volatile struct mlx5_wqe_data_seg (*)[]) + (uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)]; + + mpw->state = MLX5_MPW_STATE_OPENED; + mpw->pkts_n = 0; + mpw->len = length; + mpw->total_len = 0; + mpw->wqe = &(*txq->wqes)[idx]; + mpw->wqe->mpw.eseg.mss = htons(length); + mpw->wqe->mpw.eseg.inline_hdr_sz = 0; + mpw->wqe->mpw.eseg.rsvd0 = 0; + mpw->wqe->mpw.eseg.rsvd1 = 0; + mpw->wqe->mpw.eseg.rsvd2 = 0; + mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) | + (txq->wqe_ci << 8) | + MLX5_OPCODE_LSO_MPW); + mpw->wqe->mpw.ctrl.data[2] = 0; + mpw->wqe->mpw.ctrl.data[3] = 0; + mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0]; + mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1]; + mpw->data.dseg[2] = &(*dseg)[0]; + mpw->data.dseg[3] = &(*dseg)[1]; + mpw->data.dseg[4] = &(*dseg)[2]; +} /** - * DPDK callback for TX. + * Close a MPW session. + * + * @param txq + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + */ +static inline void +mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw) +{ + unsigned int num = mpw->pkts_n; + + /* + * Store size in multiple of 16 bytes. Control and Ethernet segments + * count as 2. + */ + mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num)); + mpw->state = MLX5_MPW_STATE_CLOSED; + if (num < 3) + ++txq->wqe_ci; + else + txq->wqe_ci += 2; + tx_prefetch_wqe(txq, txq->wqe_ci); + tx_prefetch_wqe(txq, txq->wqe_ci + 1); +} + +/** + * DPDK callback for TX with MPW support. * * @param dpdk_txq * Generic pointer to TX queue structure. @@ -531,224 +959,399 @@ stop: * Number of packets successfully transmitted (<= pkts_n). */ uint16_t -mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = (struct txq *)dpdk_txq; - unsigned int elts_head = txq->elts_head; + uint16_t elts_head = txq->elts_head; const unsigned int elts_n = txq->elts_n; - unsigned int elts_comp_cd = txq->elts_comp_cd; - unsigned int elts_comp = 0; - unsigned int i; + unsigned int i = 0; + unsigned int j = 0; unsigned int max; - int err; - struct rte_mbuf *buf = pkts[0]; + unsigned int comp; + struct mlx5_mpw mpw = { + .state = MLX5_MPW_STATE_CLOSED, + }; - assert(elts_comp_cd != 0); + if (unlikely(!pkts_n)) + return 0; /* Prefetch first packet cacheline. */ - rte_prefetch0(buf); + tx_prefetch_cqe(txq, txq->cq_ci); + tx_prefetch_wqe(txq, txq->wqe_ci); + tx_prefetch_wqe(txq, txq->wqe_ci + 1); + /* Start processing. */ txq_complete(txq); max = (elts_n - (elts_head - txq->elts_tail)); if (max > elts_n) max -= elts_n; - assert(max >= 1); - assert(max <= elts_n); - /* Always leave one free entry in the ring. */ - --max; - if (max == 0) + do { + struct rte_mbuf *buf = *(pkts++); + unsigned int elts_head_next; + uint32_t length; + unsigned int segs_n = buf->nb_segs; + uint32_t cs_flags = 0; + + /* + * Make sure there is enough room to store this packet and + * that one ring entry remains unused. + */ + assert(segs_n); + if (max < segs_n + 1) + break; + /* Do not bother with large packets MPW cannot handle. */ + if (segs_n > MLX5_MPW_DSEG_MAX) + break; + max -= segs_n; + --pkts_n; + /* Should we enable HW CKSUM offload */ + if (buf->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) + cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; + /* Retrieve packet information. */ + length = PKT_LEN(buf); + assert(length); + /* Start new session if packet differs. */ + if ((mpw.state == MLX5_MPW_STATE_OPENED) && + ((mpw.len != length) || + (segs_n != 1) || + (mpw.wqe->mpw.eseg.cs_flags != cs_flags))) + mlx5_mpw_close(txq, &mpw); + if (mpw.state == MLX5_MPW_STATE_CLOSED) { + mlx5_mpw_new(txq, &mpw, length); + mpw.wqe->mpw.eseg.cs_flags = cs_flags; + } + /* Multi-segment packets must be alone in their MPW. */ + assert((segs_n == 1) || (mpw.pkts_n == 0)); +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length = 0; +#endif + do { + volatile struct mlx5_wqe_data_seg *dseg; + uintptr_t addr; + + elts_head_next = (elts_head + 1) & (elts_n - 1); + assert(buf); + (*txq->elts)[elts_head] = buf; + dseg = mpw.data.dseg[mpw.pkts_n]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + *dseg = (struct mlx5_wqe_data_seg){ + .byte_count = htonl(DATA_LEN(buf)), + .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), + .addr = htonll(addr), + }; + elts_head = elts_head_next; +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length += DATA_LEN(buf); +#endif + buf = buf->next; + ++mpw.pkts_n; + ++j; + } while (--segs_n); + assert(length == mpw.len); + if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) + mlx5_mpw_close(txq, &mpw); + elts_head = elts_head_next; +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Increment sent bytes counter. */ + txq->stats.obytes += length; +#endif + ++i; + } while (pkts_n); + /* Take a shortcut if nothing must be sent. */ + if (unlikely(i == 0)) return 0; - if (max > pkts_n) - max = pkts_n; - for (i = 0; (i != max); ++i) { - struct rte_mbuf *buf_next = pkts[i + 1]; - unsigned int elts_head_next = - (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); - struct txq_elt *elt = &(*txq->elts)[elts_head]; - unsigned int segs = NB_SEGS(buf); + /* Check whether completion threshold has been reached. */ + /* "j" includes both packets and segments. */ + comp = txq->elts_comp + j; + if (comp >= MLX5_TX_COMP_THRESH) { + volatile union mlx5_wqe *wqe = mpw.wqe; + + /* Request completion on last WQE. */ + wqe->mpw.ctrl.data[2] = htonl(8); + /* Save elts_head in unused "immediate" field of WQE. */ + wqe->mpw.ctrl.data[3] = elts_head; + txq->elts_comp = 0; + } else { + txq->elts_comp = comp; + } #ifdef MLX5_PMD_SOFT_COUNTERS - unsigned int sent_size = 0; + /* Increment sent packets counter. */ + txq->stats.opackets += i; #endif - uint32_t send_flags = 0; -#ifdef HAVE_VERBS_VLAN_INSERTION - int insert_vlan = 0; -#endif /* HAVE_VERBS_VLAN_INSERTION */ - - if (i + 1 < max) - rte_prefetch0(buf_next); - /* Request TX completion. */ - if (unlikely(--elts_comp_cd == 0)) { - elts_comp_cd = txq->elts_comp_cd_init; - ++elts_comp; - send_flags |= IBV_EXP_QP_BURST_SIGNALED; - } + /* Ring QP doorbell. */ + if (mpw.state == MLX5_MPW_STATE_OPENED) + mlx5_mpw_close(txq, &mpw); + mlx5_tx_dbrec(txq); + txq->elts_head = elts_head; + return i; +} + +/** + * Open a MPW inline session. + * + * @param txq + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + * @param length + * Packet length. + */ +static inline void +mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) +{ + uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1); + + mpw->state = MLX5_MPW_INL_STATE_OPENED; + mpw->pkts_n = 0; + mpw->len = length; + mpw->total_len = 0; + mpw->wqe = &(*txq->wqes)[idx]; + mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) | + (txq->wqe_ci << 8) | + MLX5_OPCODE_LSO_MPW); + mpw->wqe->mpw_inl.ctrl.data[2] = 0; + mpw->wqe->mpw_inl.ctrl.data[3] = 0; + mpw->wqe->mpw_inl.eseg.mss = htons(length); + mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0; + mpw->wqe->mpw_inl.eseg.cs_flags = 0; + mpw->wqe->mpw_inl.eseg.rsvd0 = 0; + mpw->wqe->mpw_inl.eseg.rsvd1 = 0; + mpw->wqe->mpw_inl.eseg.rsvd2 = 0; + mpw->data.raw = &mpw->wqe->mpw_inl.data[0]; +} + +/** + * Close a MPW inline session. + * + * @param txq + * Pointer to TX queue structure. + * @param mpw + * Pointer to MPW session structure. + */ +static inline void +mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw) +{ + unsigned int size; + + size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len; + /* + * Store size in multiple of 16 bytes. Control and Ethernet segments + * count as 2. + */ + mpw->wqe->mpw_inl.ctrl.data[1] = + htonl(txq->qp_num_8s | ((size + 15) / 16)); + mpw->state = MLX5_MPW_STATE_CLOSED; + mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG); + txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe); +} + +/** + * DPDK callback for TX with MPW inline support. + * + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n) +{ + struct txq *txq = (struct txq *)dpdk_txq; + uint16_t elts_head = txq->elts_head; + const unsigned int elts_n = txq->elts_n; + unsigned int i = 0; + unsigned int j = 0; + unsigned int max; + unsigned int comp; + unsigned int inline_room = txq->max_inline; + struct mlx5_mpw mpw = { + .state = MLX5_MPW_STATE_CLOSED, + }; + + if (unlikely(!pkts_n)) + return 0; + /* Prefetch first packet cacheline. */ + tx_prefetch_cqe(txq, txq->cq_ci); + tx_prefetch_wqe(txq, txq->wqe_ci); + tx_prefetch_wqe(txq, txq->wqe_ci + 1); + /* Start processing. */ + txq_complete(txq); + max = (elts_n - (elts_head - txq->elts_tail)); + if (max > elts_n) + max -= elts_n; + do { + struct rte_mbuf *buf = *(pkts++); + unsigned int elts_head_next; + uintptr_t addr; + uint32_t length; + unsigned int segs_n = buf->nb_segs; + uint32_t cs_flags = 0; + + /* + * Make sure there is enough room to store this packet and + * that one ring entry remains unused. + */ + assert(segs_n); + if (max < segs_n + 1) + break; + /* Do not bother with large packets MPW cannot handle. */ + if (segs_n > MLX5_MPW_DSEG_MAX) + break; + max -= segs_n; + --pkts_n; /* Should we enable HW CKSUM offload */ if (buf->ol_flags & - (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { - send_flags |= IBV_EXP_QP_BURST_IP_CSUM; - /* HW does not support checksum offloads at arbitrary - * offsets but automatically recognizes the packet - * type. For inner L3/L4 checksums, only VXLAN (UDP) - * tunnels are currently supported. */ - if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type)) - send_flags |= IBV_EXP_QP_BURST_TUNNEL; + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) + cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; + /* Retrieve packet information. */ + length = PKT_LEN(buf); + /* Start new session if packet differs. */ + if (mpw.state == MLX5_MPW_STATE_OPENED) { + if ((mpw.len != length) || + (segs_n != 1) || + (mpw.wqe->mpw.eseg.cs_flags != cs_flags)) + mlx5_mpw_close(txq, &mpw); + } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) { + if ((mpw.len != length) || + (segs_n != 1) || + (length > inline_room) || + (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) { + mlx5_mpw_inline_close(txq, &mpw); + inline_room = txq->max_inline; + } } - if (buf->ol_flags & PKT_TX_VLAN_PKT) { -#ifdef HAVE_VERBS_VLAN_INSERTION - if (!txq->priv->mps) - insert_vlan = 1; - else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - { - err = insert_vlan_sw(buf); - if (unlikely(err)) - goto stop; + if (mpw.state == MLX5_MPW_STATE_CLOSED) { + if ((segs_n != 1) || + (length > inline_room)) { + mlx5_mpw_new(txq, &mpw, length); + mpw.wqe->mpw.eseg.cs_flags = cs_flags; + } else { + mlx5_mpw_inline_new(txq, &mpw, length); + mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags; } } - if (likely(segs == 1)) { - uintptr_t addr; - uint32_t length; - uint32_t lkey; - uintptr_t buf_next_addr; + /* Multi-segment packets must be alone in their MPW. */ + assert((segs_n == 1) || (mpw.pkts_n == 0)); + if (mpw.state == MLX5_MPW_STATE_OPENED) { + assert(inline_room == txq->max_inline); +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length = 0; +#endif + do { + volatile struct mlx5_wqe_data_seg *dseg; + + elts_head_next = + (elts_head + 1) & (elts_n - 1); + assert(buf); + (*txq->elts)[elts_head] = buf; + dseg = mpw.data.dseg[mpw.pkts_n]; + addr = rte_pktmbuf_mtod(buf, uintptr_t); + *dseg = (struct mlx5_wqe_data_seg){ + .byte_count = htonl(DATA_LEN(buf)), + .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), + .addr = htonll(addr), + }; + elts_head = elts_head_next; +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) + length += DATA_LEN(buf); +#endif + buf = buf->next; + ++mpw.pkts_n; + ++j; + } while (--segs_n); + assert(length == mpw.len); + if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) + mlx5_mpw_close(txq, &mpw); + } else { + unsigned int max; - /* Retrieve buffer information. */ + assert(mpw.state == MLX5_MPW_INL_STATE_OPENED); + assert(length <= inline_room); + assert(length == DATA_LEN(buf)); + elts_head_next = (elts_head + 1) & (elts_n - 1); addr = rte_pktmbuf_mtod(buf, uintptr_t); - length = DATA_LEN(buf); - /* Update element. */ - elt->buf = buf; - if (txq->priv->vf) - rte_prefetch0((volatile void *) - (uintptr_t)addr); - /* Prefetch next buffer data. */ - if (i + 1 < max) { - buf_next_addr = - rte_pktmbuf_mtod(buf_next, uintptr_t); - rte_prefetch0((volatile void *) - (uintptr_t)buf_next_addr); + (*txq->elts)[elts_head] = buf; + /* Maximum number of bytes before wrapping. */ + max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] - + (uintptr_t)mpw.data.raw); + if (length > max) { + rte_memcpy((void *)(uintptr_t)mpw.data.raw, + (void *)addr, + max); + mpw.data.raw = + (volatile void *)&(*txq->wqes)[0]; + rte_memcpy((void *)(uintptr_t)mpw.data.raw, + (void *)(addr + max), + length - max); + mpw.data.raw += length - max; + } else { + rte_memcpy((void *)(uintptr_t)mpw.data.raw, + (void *)addr, + length); + mpw.data.raw += length; } - /* Put packet into send queue. */ -#if MLX5_PMD_MAX_INLINE > 0 - if (length <= txq->max_inline) { -#ifdef HAVE_VERBS_VLAN_INSERTION - if (insert_vlan) - err = txq->send_pending_inline_vlan - (txq->qp, - (void *)addr, - length, - send_flags, - &buf->vlan_tci); - else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - err = txq->send_pending_inline - (txq->qp, - (void *)addr, - length, - send_flags); - } else -#endif - { - /* Retrieve Memory Region key for this - * memory pool. */ - lkey = txq_mp2mr(txq, txq_mb2mp(buf)); - if (unlikely(lkey == (uint32_t)-1)) { - /* MR does not exist. */ - DEBUG("%p: unable to get MP <-> MR" - " association", (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; - } -#ifdef HAVE_VERBS_VLAN_INSERTION - if (insert_vlan) - err = txq->send_pending_vlan - (txq->qp, - addr, - length, - lkey, - send_flags, - &buf->vlan_tci); - else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - err = txq->send_pending - (txq->qp, - addr, - length, - lkey, - send_flags); + if ((uintptr_t)mpw.data.raw == + (uintptr_t)&(*txq->wqes)[txq->wqe_n]) + mpw.data.raw = + (volatile void *)&(*txq->wqes)[0]; + ++mpw.pkts_n; + ++j; + if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) { + mlx5_mpw_inline_close(txq, &mpw); + inline_room = txq->max_inline; + } else { + inline_room -= length; } - if (unlikely(err)) - goto stop; -#ifdef MLX5_PMD_SOFT_COUNTERS - sent_size += length; -#endif - } else { -#if MLX5_PMD_SGE_WR_N > 1 - struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; - struct tx_burst_sg_ret ret; - - ret = tx_burst_sg(txq, segs, elt, buf, elts_head, - &sges); - if (ret.length == (unsigned int)-1) - goto stop; - /* Put SG list into send queue. */ -#ifdef HAVE_VERBS_VLAN_INSERTION - if (insert_vlan) - err = txq->send_pending_sg_list_vlan - (txq->qp, - sges, - ret.num, - send_flags, - &buf->vlan_tci); - else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - err = txq->send_pending_sg_list - (txq->qp, - sges, - ret.num, - send_flags); - if (unlikely(err)) - goto stop; -#ifdef MLX5_PMD_SOFT_COUNTERS - sent_size += ret.length; -#endif -#else /* MLX5_PMD_SGE_WR_N > 1 */ - DEBUG("%p: TX scattered buffers support not" - " compiled in", (void *)txq); - goto stop; -#endif /* MLX5_PMD_SGE_WR_N > 1 */ } + mpw.total_len += length; elts_head = elts_head_next; - buf = buf_next; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent bytes counter. */ - txq->stats.obytes += sent_size; + txq->stats.obytes += length; #endif - } -stop: + ++i; + } while (pkts_n); /* Take a shortcut if nothing must be sent. */ if (unlikely(i == 0)) return 0; + /* Check whether completion threshold has been reached. */ + /* "j" includes both packets and segments. */ + comp = txq->elts_comp + j; + if (comp >= MLX5_TX_COMP_THRESH) { + volatile union mlx5_wqe *wqe = mpw.wqe; + + /* Request completion on last WQE. */ + wqe->mpw_inl.ctrl.data[2] = htonl(8); + /* Save elts_head in unused "immediate" field of WQE. */ + wqe->mpw_inl.ctrl.data[3] = elts_head; + txq->elts_comp = 0; + } else { + txq->elts_comp = comp; + } #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent packets counter. */ txq->stats.opackets += i; #endif /* Ring QP doorbell. */ - err = txq->send_flush(txq->qp); - if (unlikely(err)) { - /* A nonzero value is not supposed to be returned. - * Nothing can be done about it. */ - DEBUG("%p: send_flush() failed with error %d", - (void *)txq, err); - } + if (mpw.state == MLX5_MPW_INL_STATE_OPENED) + mlx5_mpw_inline_close(txq, &mpw); + else if (mpw.state == MLX5_MPW_STATE_OPENED) + mlx5_mpw_close(txq, &mpw); + mlx5_tx_dbrec(txq); txq->elts_head = elts_head; - txq->elts_comp += elts_comp; - txq->elts_comp_cd = elts_comp_cd; return i; } /** * Translate RX completion flags to packet type. * - * @param flags - * RX completion flags returned by poll_length_flags(). + * @param[in] cqe + * Pointer to CQE. * * @note: fix mlx5_dev_supported_ptypes_get() if any change here. * @@ -756,11 +1359,13 @@ stop: * Packet type for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_pkt_type(uint32_t flags) +rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe) { uint32_t pkt_type; + uint8_t flags = cqe->l4_hdr_type_etc; + uint8_t info = cqe->rsvd0[0]; - if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) + if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET) pkt_type = TRANSPOSE(flags, IBV_EXP_CQ_RX_OUTER_IPV4_PACKET, @@ -777,66 +1382,157 @@ rxq_cq_to_pkt_type(uint32_t flags) else pkt_type = TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV4_PACKET, - RTE_PTYPE_L3_IPV4) | + MLX5_CQE_L3_HDR_TYPE_IPV6, + RTE_PTYPE_L3_IPV6) | TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV6_PACKET, - RTE_PTYPE_L3_IPV6); + MLX5_CQE_L3_HDR_TYPE_IPV4, + RTE_PTYPE_L3_IPV4); return pkt_type; } /** + * Get size of the next packet for a given CQE. For compressed CQEs, the + * consumer index is updated only once all packets of the current one have + * been processed. + * + * @param rxq + * Pointer to RX queue. + * @param cqe + * CQE to process. + * + * @return + * Packet size in bytes (0 if there is none), -1 in case of completion + * with error. + */ +static inline int +mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe, + uint16_t cqe_cnt) +{ + struct rxq_zip *zip = &rxq->zip; + uint16_t cqe_n = cqe_cnt + 1; + int len = 0; + + /* Process compressed data in the CQE and mini arrays. */ + if (zip->ai) { + volatile struct mlx5_mini_cqe8 (*mc)[8] = + (volatile struct mlx5_mini_cqe8 (*)[8]) + (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].cqe64); + + len = ntohl((*mc)[zip->ai & 7].byte_cnt); + if ((++zip->ai & 7) == 0) { + /* + * Increment consumer index to skip the number of + * CQEs consumed. Hardware leaves holes in the CQ + * ring for software use. + */ + zip->ca = zip->na; + zip->na += 8; + } + if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) { + uint16_t idx = rxq->cq_ci; + uint16_t end = zip->cq_ci; + + while (idx != end) { + (*rxq->cqes)[idx & cqe_cnt].cqe64.op_own = + MLX5_CQE_INVALIDATE; + ++idx; + } + rxq->cq_ci = zip->cq_ci; + zip->ai = 0; + } + /* No compressed data, get next CQE and verify if it is compressed. */ + } else { + int ret; + int8_t op_own; + + ret = check_cqe64(cqe, cqe_n, rxq->cq_ci); + if (unlikely(ret == 1)) + return 0; + ++rxq->cq_ci; + op_own = cqe->op_own; + if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { + volatile struct mlx5_mini_cqe8 (*mc)[8] = + (volatile struct mlx5_mini_cqe8 (*)[8]) + (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci & + cqe_cnt].cqe64); + + /* Fix endianness. */ + zip->cqe_cnt = ntohl(cqe->byte_cnt); + /* + * Current mini array position is the one returned by + * check_cqe64(). + * + * If completion comprises several mini arrays, as a + * special case the second one is located 7 CQEs after + * the initial CQE instead of 8 for subsequent ones. + */ + zip->ca = rxq->cq_ci & cqe_cnt; + zip->na = zip->ca + 7; + /* Compute the next non compressed CQE. */ + --rxq->cq_ci; + zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; + /* Get packet size to return. */ + len = ntohl((*mc)[0].byte_cnt); + zip->ai = 1; + } else { + len = ntohl(cqe->byte_cnt); + } + /* Error while receiving packet. */ + if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR)) + return -1; + } + return len; +} + +/** * Translate RX completion flags to offload flags. * * @param[in] rxq * Pointer to RX queue structure. - * @param flags - * RX completion flags returned by poll_length_flags(). + * @param[in] cqe + * Pointer to CQE. * * @return * Offload flags (ol_flags) for struct rte_mbuf. */ static inline uint32_t -rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) +rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe) { uint32_t ol_flags = 0; + uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK; + uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK; + uint8_t info = cqe->rsvd0[0]; - if (rxq->csum) { - /* Set IP checksum flag only for IPv4/IPv6 packets. */ - if (flags & - (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET)) - ol_flags |= - TRANSPOSE(~flags, - IBV_EXP_CQ_RX_IP_CSUM_OK, - PKT_RX_IP_CKSUM_BAD); -#ifdef HAVE_EXP_CQ_RX_TCP_PACKET - /* Set L4 checksum flag only for TCP/UDP packets. */ - if (flags & - (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET)) -#endif /* HAVE_EXP_CQ_RX_TCP_PACKET */ - ol_flags |= - TRANSPOSE(~flags, - IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK, - PKT_RX_L4_CKSUM_BAD); - } + if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) || + (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6)) + ol_flags |= + (!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) * + PKT_RX_IP_CKSUM_BAD); + if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) || + (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) || + (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) || + (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP)) + ol_flags |= + (!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) * + PKT_RX_L4_CKSUM_BAD); /* * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional * (its value is 0). */ - if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) + if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) ol_flags |= - TRANSPOSE(~flags, + TRANSPOSE(~cqe->l4_hdr_type_etc, IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK, PKT_RX_IP_CKSUM_BAD) | - TRANSPOSE(~flags, + TRANSPOSE(~cqe->l4_hdr_type_etc, IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK, PKT_RX_L4_CKSUM_BAD); return ol_flags; } /** - * DPDK callback for RX with scattered packets support. + * DPDK callback for RX. * * @param dpdk_rxq * Generic pointer to RX queue structure. @@ -849,353 +1545,127 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) * Number of packets successfully received (<= pkts_n). */ uint16_t -mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - unsigned int i; - unsigned int pkts_ret = 0; - int ret; + struct rxq *rxq = dpdk_rxq; + const unsigned int wqe_cnt = rxq->elts_n - 1; + const unsigned int cqe_cnt = rxq->cqe_n - 1; + const unsigned int sges_n = rxq->sges_n; + struct rte_mbuf *pkt = NULL; + struct rte_mbuf *seg = NULL; + volatile struct mlx5_cqe64 *cqe = + &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64; + unsigned int i = 0; + unsigned int rq_ci = rxq->rq_ci << sges_n; + int len; - if (unlikely(!rxq->sp)) - return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n); - if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */ - return 0; - for (i = 0; (i != pkts_n); ++i) { - struct rxq_elt_sp *elt = &(*elts)[elts_head]; - unsigned int len; - unsigned int pkt_buf_len; - struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */ - struct rte_mbuf **pkt_buf_next = &pkt_buf; - unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM; - unsigned int j = 0; - uint32_t flags; - uint16_t vlan_tci; - - /* Sanity checks. */ - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); - ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci); - if (unlikely(ret < 0)) { - struct ibv_wc wc; - int wcs_n; - - DEBUG("rxq=%p, poll_length() failed (ret=%d)", - (void *)rxq, ret); - /* ibv_poll_cq() must be used in case of failure. */ - wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); - if (unlikely(wcs_n == 0)) - break; - if (unlikely(wcs_n < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", - (void *)rxq, wcs_n); + while (pkts_n) { + unsigned int idx = rq_ci & wqe_cnt; + volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx]; + struct rte_mbuf *rep = (*rxq->elts)[idx]; + + if (pkt) + NEXT(seg) = rep; + seg = rep; + rte_prefetch0(seg); + rte_prefetch0(cqe); + rte_prefetch0(wqe); + rep = rte_mbuf_raw_alloc(rxq->mp); + if (unlikely(rep == NULL)) { + while (pkt) { + seg = NEXT(pkt); + rte_mbuf_refcnt_set(pkt, 0); + __rte_mbuf_raw_free(pkt); + pkt = seg; + } + ++rxq->stats.rx_nombuf; + break; + } + if (!pkt) { + cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64; + len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt); + if (len == 0) { + rte_mbuf_refcnt_set(rep, 0); + __rte_mbuf_raw_free(rep); break; } - assert(wcs_n == 1); - if (unlikely(wc.status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" - " completion status (%d): %s", - (void *)rxq, wc.wr_id, wc.status, - ibv_wc_status_str(wc.status)); -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment dropped packets counter. */ + if (unlikely(len == -1)) { + /* RX error, packet is likely too large. */ + rte_mbuf_refcnt_set(rep, 0); + __rte_mbuf_raw_free(rep); ++rxq->stats.idropped; -#endif - goto repost; + goto skip; } - ret = wc.byte_len; - } - if (ret == 0) - break; - assert(ret >= (rxq->crc_present << 2)); - len = ret - (rxq->crc_present << 2); - pkt_buf_len = len; - /* - * Replace spent segments with new ones, concatenate and - * return them as pkt_buf. - */ - while (1) { - struct ibv_sge *sge = &elt->sges[j]; - struct rte_mbuf *seg = elt->bufs[j]; - struct rte_mbuf *rep; - unsigned int seg_tailroom; - - assert(seg != NULL); - /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. - */ - rte_prefetch0(seg); - rep = __rte_mbuf_raw_alloc(rxq->mp); - if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p: can't allocate a new mbuf", - (void *)rxq); - if (pkt_buf != NULL) { - *pkt_buf_next = NULL; - rte_pktmbuf_free(pkt_buf); + pkt = seg; + assert(len >= (rxq->crc_present << 2)); + /* Update packet information. */ + if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip | + rxq->crc_present) { + if (rxq->csum) { + pkt->packet_type = + rxq_cq_to_pkt_type(cqe); + pkt->ol_flags = + rxq_cq_to_ol_flags(rxq, cqe); } - /* Increment out of memory counters. */ - ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - goto repost; - } -#ifndef NDEBUG - /* Poison user-modifiable fields in rep. */ - NEXT(rep) = (void *)((uintptr_t)-1); - SET_DATA_OFF(rep, 0xdead); - DATA_LEN(rep) = 0xd00d; - PKT_LEN(rep) = 0xdeadd00d; - NB_SEGS(rep) = 0x2a; - PORT(rep) = 0x2a; - rep->ol_flags = -1; -#endif - assert(rep->buf_len == seg->buf_len); - assert(rep->buf_len == rxq->mb_len); - /* Reconfigure sge to use rep instead of seg. */ - assert(sge->lkey == rxq->mr->lkey); - sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom); - elt->bufs[j] = rep; - ++j; - /* Update pkt_buf if it's the first segment, or link - * seg to the previous one and update pkt_buf_next. */ - *pkt_buf_next = seg; - pkt_buf_next = &NEXT(seg); - /* Update seg information. */ - seg_tailroom = (seg->buf_len - seg_headroom); - assert(sge->length == seg_tailroom); - SET_DATA_OFF(seg, seg_headroom); - if (likely(len <= seg_tailroom)) { - /* Last segment. */ - DATA_LEN(seg) = len; - PKT_LEN(seg) = len; - /* Sanity check. */ - assert(rte_pktmbuf_headroom(seg) == - seg_headroom); - assert(rte_pktmbuf_tailroom(seg) == - (seg_tailroom - len)); - break; - } - DATA_LEN(seg) = seg_tailroom; - PKT_LEN(seg) = seg_tailroom; - /* Sanity check. */ - assert(rte_pktmbuf_headroom(seg) == seg_headroom); - assert(rte_pktmbuf_tailroom(seg) == 0); - /* Fix len and clear headroom for next segments. */ - len -= seg_tailroom; - seg_headroom = 0; - } - /* Update head and tail segments. */ - *pkt_buf_next = NULL; - assert(pkt_buf != NULL); - assert(j != 0); - NB_SEGS(pkt_buf) = j; - PORT(pkt_buf) = rxq->port_id; - PKT_LEN(pkt_buf) = pkt_buf_len; - if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { - pkt_buf->packet_type = rxq_cq_to_pkt_type(flags); - pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags); -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { - pkt_buf->ol_flags |= PKT_RX_VLAN_PKT; - pkt_buf->vlan_tci = vlan_tci; + if (cqe->l4_hdr_type_etc & + MLX5_CQE_VLAN_STRIPPED) { + pkt->ol_flags |= PKT_RX_VLAN_PKT | + PKT_RX_VLAN_STRIPPED; + pkt->vlan_tci = ntohs(cqe->vlan_info); + } + if (rxq->crc_present) + len -= ETHER_CRC_LEN; } -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + PKT_LEN(pkt) = len; } - - /* Return packet. */ - *(pkts++) = pkt_buf; - ++pkts_ret; -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment bytes counter. */ - rxq->stats.ibytes += pkt_buf_len; -#endif -repost: - ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges)); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. */ - DEBUG("%p: recv_sg_list(): failed (ret=%d)", - (void *)rxq->priv, - ret); - abort(); - } - if (++elts_head >= elts_n) - elts_head = 0; - continue; - } - if (unlikely(i == 0)) - return 0; - rxq->elts_head = elts_head; -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment packets counter. */ - rxq->stats.ipackets += pkts_ret; -#endif - return pkts_ret; -} - -/** - * DPDK callback for RX. - * - * The following function is the same as mlx5_rx_burst_sp(), except it doesn't - * manage scattered packets. Improves performance when MRU is lower than the - * size of the first segment. - * - * @param dpdk_rxq - * Generic pointer to RX queue structure. - * @param[out] pkts - * Array to store received packets. - * @param pkts_n - * Maximum number of packets in array. - * - * @return - * Number of packets successfully received (<= pkts_n). - */ -uint16_t -mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) -{ - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - struct ibv_sge sges[pkts_n]; - unsigned int i; - unsigned int pkts_ret = 0; - int ret; - - if (unlikely(rxq->sp)) - return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n); - for (i = 0; (i != pkts_n); ++i) { - struct rxq_elt *elt = &(*elts)[elts_head]; - unsigned int len; - struct rte_mbuf *seg = elt->buf; - struct rte_mbuf *rep; - uint32_t flags; - uint16_t vlan_tci; - - /* Sanity checks. */ - assert(seg != NULL); - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); + DATA_LEN(rep) = DATA_LEN(seg); + PKT_LEN(rep) = PKT_LEN(seg); + SET_DATA_OFF(rep, DATA_OFF(seg)); + NB_SEGS(rep) = NB_SEGS(seg); + PORT(rep) = PORT(seg); + NEXT(rep) = NULL; + (*rxq->elts)[idx] = rep; /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. + * Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. */ - rte_prefetch0(seg); - rte_prefetch0(&seg->cacheline1); - ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci); - if (unlikely(ret < 0)) { - struct ibv_wc wc; - int wcs_n; - - DEBUG("rxq=%p, poll_length() failed (ret=%d)", - (void *)rxq, ret); - /* ibv_poll_cq() must be used in case of failure. */ - wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); - if (unlikely(wcs_n == 0)) - break; - if (unlikely(wcs_n < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", - (void *)rxq, wcs_n); - break; - } - assert(wcs_n == 1); - if (unlikely(wc.status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" - " completion status (%d): %s", - (void *)rxq, wc.wr_id, wc.status, - ibv_wc_status_str(wc.status)); -#ifdef MLX5_PMD_SOFT_COUNTERS - /* Increment dropped packets counter. */ - ++rxq->stats.idropped; -#endif - /* Add SGE to array for repost. */ - sges[i] = elt->sge; - goto repost; - } - ret = wc.byte_len; - } - if (ret == 0) - break; - assert(ret >= (rxq->crc_present << 2)); - len = ret - (rxq->crc_present << 2); - rep = __rte_mbuf_raw_alloc(rxq->mp); - if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p: can't allocate a new mbuf", - (void *)rxq); - /* Increment out of memory counters. */ - ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - goto repost; + wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t)); + if (len > DATA_LEN(seg)) { + len -= DATA_LEN(seg); + ++NB_SEGS(pkt); + ++rq_ci; + continue; } - - /* Reconfigure sge to use rep instead of seg. */ - elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM; - assert(elt->sge.lkey == rxq->mr->lkey); - elt->buf = rep; - - /* Add SGE to array for repost. */ - sges[i] = elt->sge; - - /* Update seg information. */ - SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM); - NB_SEGS(seg) = 1; - PORT(seg) = rxq->port_id; - NEXT(seg) = NULL; - PKT_LEN(seg) = len; DATA_LEN(seg) = len; - if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { - seg->packet_type = rxq_cq_to_pkt_type(flags); - seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags); -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { - seg->ol_flags |= PKT_RX_VLAN_PKT; - seg->vlan_tci = vlan_tci; - } -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ - } - /* Return packet. */ - *(pkts++) = seg; - ++pkts_ret; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment bytes counter. */ - rxq->stats.ibytes += len; + rxq->stats.ibytes += PKT_LEN(pkt); #endif -repost: - if (++elts_head >= elts_n) - elts_head = 0; - continue; + /* Return packet. */ + *(pkts++) = pkt; + pkt = NULL; + --pkts_n; + ++i; +skip: + /* Align consumer index to the next stride. */ + rq_ci >>= sges_n; + ++rq_ci; + rq_ci <<= sges_n; } - if (unlikely(i == 0)) + if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci))) return 0; - /* Repost WRs. */ -#ifdef DEBUG_RECV - DEBUG("%p: reposting %u WRs", (void *)rxq, i); -#endif - ret = rxq->recv(rxq->wq, sges, i); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. */ - DEBUG("%p: recv_burst(): failed (ret=%d)", - (void *)rxq->priv, - ret); - abort(); - } - rxq->elts_head = elts_head; + /* Update the consumer index. */ + rxq->rq_ci = rq_ci >> sges_n; + rte_wmb(); + *rxq->cq_db = htonl(rxq->cq_ci); + rte_wmb(); + *rxq->rq_db = htonl(rxq->rq_ci); #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment packets counter. */ - rxq->stats.ipackets += pkts_ret; + rxq->stats.ipackets += i; #endif - return pkts_ret; + return i; } /** diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index 0e2b607d..f6e2cbac 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -43,6 +43,7 @@ #pragma GCC diagnostic ignored "-pedantic" #endif #include <infiniband/verbs.h> +#include <infiniband/mlx5_hw.h> #ifdef PEDANTIC #pragma GCC diagnostic error "-pedantic" #endif @@ -61,6 +62,7 @@ #include "mlx5.h" #include "mlx5_autoconf.h" #include "mlx5_defs.h" +#include "mlx5_prm.h" struct mlx5_rxq_stats { unsigned int idx; /**< Mapping index. */ @@ -81,18 +83,6 @@ struct mlx5_txq_stats { uint64_t odropped; /**< Total of packets not sent when TX ring full. */ }; -/* RX element (scattered packets). */ -struct rxq_elt_sp { - struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */ - struct rte_mbuf *bufs[MLX5_PMD_SGE_WR_N]; /* SGEs buffers. */ -}; - -/* RX element. */ -struct rxq_elt { - struct ibv_sge sge; /* Scatter/Gather Element. */ - struct rte_mbuf *buf; /* SGE buffer. */ -}; - /* Flow director queue structure. */ struct fdir_queue { struct ibv_qp *qp; /* Associated RX QP. */ @@ -101,38 +91,49 @@ struct fdir_queue { struct priv; +/* Compressed CQE context. */ +struct rxq_zip { + uint16_t ai; /* Array index. */ + uint16_t ca; /* Current array index. */ + uint16_t na; /* Next array index. */ + uint16_t cq_ci; /* The next CQE. */ + uint32_t cqe_cnt; /* Number of CQEs. */ +}; + /* RX queue descriptor. */ struct rxq { - struct priv *priv; /* Back pointer to private data. */ - struct rte_mempool *mp; /* Memory Pool for allocations. */ - struct ibv_cq *cq; /* Completion Queue. */ - struct ibv_exp_wq *wq; /* Work Queue. */ - int32_t (*poll)(); /* Verbs poll function. */ - int32_t (*recv)(); /* Verbs receive function. */ - unsigned int port_id; /* Port ID for incoming packets. */ - unsigned int elts_n; /* (*elts)[] length. */ - unsigned int elts_head; /* Current index in (*elts)[]. */ - unsigned int sp:1; /* Use scattered RX elements. */ unsigned int csum:1; /* Enable checksum offloading. */ unsigned int csum_l2tun:1; /* Same for L2 tunnels. */ unsigned int vlan_strip:1; /* Enable VLAN stripping. */ unsigned int crc_present:1; /* CRC must be subtracted. */ - union { - struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */ - struct rxq_elt (*no_sp)[]; /* RX elements. */ - } elts; - uint32_t mb_len; /* Length of a mp-issued mbuf. */ - unsigned int socket; /* CPU socket ID for allocations. */ - struct mlx5_rxq_stats stats; /* RX queue counters. */ + unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */ + uint16_t rq_ci; + uint16_t cq_ci; + uint16_t elts_n; + uint16_t cqe_n; /* Number of CQ elements. */ + uint16_t port_id; + volatile struct mlx5_wqe_data_seg(*wqes)[]; + volatile struct mlx5_cqe(*cqes)[]; + struct rxq_zip zip; /* Compressed context. */ + volatile uint32_t *rq_db; + volatile uint32_t *cq_db; + struct rte_mbuf *(*elts)[]; + struct rte_mempool *mp; + struct mlx5_rxq_stats stats; +} __rte_cache_aligned; + +/* RX queue control descriptor. */ +struct rxq_ctrl { + struct priv *priv; /* Back pointer to private data. */ + struct ibv_cq *cq; /* Completion Queue. */ + struct ibv_exp_wq *wq; /* Work Queue. */ struct ibv_exp_res_domain *rd; /* Resource Domain. */ struct fdir_queue fdir_queue; /* Flow director queue. */ struct ibv_mr *mr; /* Memory Region (for mp). */ struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */ -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */ -#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ - struct ibv_exp_cq_family *if_cq; /* CQ interface. */ -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + unsigned int socket; /* CPU socket ID for allocations. */ + struct rxq rxq; /* Data path structure. */ }; /* Hash RX queue types. */ @@ -140,11 +141,9 @@ enum hash_rxq_type { HASH_RXQ_TCPV4, HASH_RXQ_UDPV4, HASH_RXQ_IPV4, -#ifdef HAVE_FLOW_SPEC_IPV6 HASH_RXQ_TCPV6, HASH_RXQ_UDPV6, HASH_RXQ_IPV6, -#endif /* HAVE_FLOW_SPEC_IPV6 */ HASH_RXQ_ETH, }; @@ -175,9 +174,7 @@ struct hash_rxq_init { } hdr; struct ibv_exp_flow_spec_tcp_udp tcp_udp; struct ibv_exp_flow_spec_ipv4 ipv4; -#ifdef HAVE_FLOW_SPEC_IPV6 struct ibv_exp_flow_spec_ipv6 ipv6; -#endif /* HAVE_FLOW_SPEC_IPV6 */ struct ibv_exp_flow_spec_eth eth; } flow_spec; /* Flow specification template. */ const struct hash_rxq_init *underlayer; /* Pointer to underlayer. */ @@ -232,74 +229,50 @@ struct hash_rxq { struct ibv_qp *qp; /* Hash RX QP. */ enum hash_rxq_type type; /* Hash RX queue type. */ /* MAC flow steering rules, one per VLAN ID. */ - struct ibv_exp_flow *mac_flow[MLX5_MAX_MAC_ADDRESSES][MLX5_MAX_VLAN_IDS]; + struct ibv_exp_flow *mac_flow + [MLX5_MAX_MAC_ADDRESSES][MLX5_MAX_VLAN_IDS]; struct ibv_exp_flow *special_flow [MLX5_MAX_SPECIAL_FLOWS][MLX5_MAX_VLAN_IDS]; }; -/* TX element. */ -struct txq_elt { - struct rte_mbuf *buf; -}; - -/* Linear buffer type. It is used when transmitting buffers with too many - * segments that do not fit the hardware queue (see max_send_sge). - * Extra segments are copied (linearized) in such buffers, replacing the - * last SGE during TX. - * The size is arbitrary but large enough to hold a jumbo frame with - * 8 segments considering mbuf.buf_len is about 2048 bytes. */ -typedef uint8_t linear_t[16384]; - /* TX queue descriptor. */ struct txq { - struct priv *priv; /* Back pointer to private data. */ - int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max); - int (*send_pending)(); -#ifdef HAVE_VERBS_VLAN_INSERTION - int (*send_pending_vlan)(); -#endif -#if MLX5_PMD_MAX_INLINE > 0 - int (*send_pending_inline)(); -#ifdef HAVE_VERBS_VLAN_INSERTION - int (*send_pending_inline_vlan)(); -#endif -#endif -#if MLX5_PMD_SGE_WR_N > 1 - int (*send_pending_sg_list)(); -#ifdef HAVE_VERBS_VLAN_INSERTION - int (*send_pending_sg_list_vlan)(); -#endif -#endif - int (*send_flush)(struct ibv_qp *qp); - struct ibv_cq *cq; /* Completion Queue. */ - struct ibv_qp *qp; /* Queue Pair. */ - struct txq_elt (*elts)[]; /* TX elements. */ -#if MLX5_PMD_MAX_INLINE > 0 - uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */ -#endif - unsigned int elts_n; /* (*elts)[] length. */ - unsigned int elts_head; /* Current index in (*elts)[]. */ - unsigned int elts_tail; /* First element awaiting completion. */ - unsigned int elts_comp; /* Number of completion requests. */ - unsigned int elts_comp_cd; /* Countdown for next completion request. */ - unsigned int elts_comp_cd_init; /* Initial value for countdown. */ + uint16_t elts_head; /* Current index in (*elts)[]. */ + uint16_t elts_tail; /* First element awaiting completion. */ + uint16_t elts_comp; /* Counter since last completion request. */ + uint16_t elts_n; /* (*elts)[] length. */ + uint16_t cq_ci; /* Consumer index for completion queue. */ + uint16_t cqe_n; /* Number of CQ elements. */ + uint16_t wqe_ci; /* Consumer index for work queue. */ + uint16_t wqe_n; /* Number of WQ elements. */ + uint16_t bf_offset; /* Blueflame offset. */ + uint16_t bf_buf_size; /* Blueflame size. */ + uint16_t max_inline; /* Maximum size to inline in a WQE. */ + uint32_t qp_num_8s; /* QP number shifted by 8. */ + volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ + volatile union mlx5_wqe (*wqes)[]; /* Work queue. */ + volatile uint32_t *qp_db; /* Work queue doorbell. */ + volatile uint32_t *cq_db; /* Completion queue doorbell. */ + volatile void *bf_reg; /* Blueflame register. */ struct { const struct rte_mempool *mp; /* Cached Memory Pool. */ struct ibv_mr *mr; /* Memory Region (for mp). */ - uint32_t lkey; /* mr->lkey */ + uint32_t lkey; /* htonl(mr->lkey) */ } mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ + struct rte_mbuf *(*elts)[]; /* TX elements. */ struct mlx5_txq_stats stats; /* TX queue counters. */ - /* Elements used only for init part are here. */ - linear_t (*elts_linear)[]; /* Linearized buffers. */ - struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */ -#ifdef HAVE_VERBS_VLAN_INSERTION - struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */ -#else +} __rte_cache_aligned; + +/* TX queue control descriptor. */ +struct txq_ctrl { + struct priv *priv; /* Back pointer to private data. */ + struct ibv_cq *cq; /* Completion Queue. */ + struct ibv_qp *qp; /* Queue Pair. */ struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ -#endif struct ibv_exp_cq_family *if_cq; /* CQ interface. */ struct ibv_exp_res_domain *rd; /* Resource Domain. */ unsigned int socket; /* CPU socket ID for allocations. */ + struct txq txq; /* Data path structure. */ }; /* mlx5_rxq.c */ @@ -316,37 +289,40 @@ int priv_create_hash_rxqs(struct priv *); void priv_destroy_hash_rxqs(struct priv *); int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type); int priv_rehash_flows(struct priv *); -void rxq_cleanup(struct rxq *); -int rxq_rehash(struct rte_eth_dev *, struct rxq *); -int rxq_setup(struct rte_eth_dev *, struct rxq *, uint16_t, unsigned int, - const struct rte_eth_rxconf *, struct rte_mempool *); +void rxq_cleanup(struct rxq_ctrl *); +int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *); +int rxq_ctrl_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, + unsigned int, const struct rte_eth_rxconf *, + struct rte_mempool *); int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, const struct rte_eth_rxconf *, struct rte_mempool *); void mlx5_rx_queue_release(void *); -uint16_t mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts, - uint16_t pkts_n); - +uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t); /* mlx5_txq.c */ -void txq_cleanup(struct txq *); -int txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, - unsigned int socket, const struct rte_eth_txconf *conf); - +void txq_cleanup(struct txq_ctrl *); +int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, + unsigned int, const struct rte_eth_txconf *); int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, const struct rte_eth_txconf *); void mlx5_tx_queue_release(void *); -uint16_t mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts, - uint16_t pkts_n); +uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t); /* mlx5_rxtx.c */ -struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *); -void txq_mp2mr_iter(const struct rte_mempool *, void *); uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t); -uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t); +uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t); +uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t); +uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t); uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t); uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t); uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); +/* mlx5_mr.c */ + +struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *); +void txq_mp2mr_iter(struct rte_mempool *, void *); +uint32_t txq_mp2mr_reg(struct txq *, struct rte_mempool *, unsigned int); + #endif /* RTE_PMD_MLX5_RXTX_H_ */ diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index 31ce53ad..6fe61c4a 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -60,6 +60,7 @@ #endif #include "mlx5_utils.h" +#include "mlx5_defs.h" #include "mlx5.h" #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" @@ -68,118 +69,62 @@ /** * Allocate TX queue elements. * - * @param txq + * @param txq_ctrl * Pointer to TX queue structure. * @param elts_n * Number of elements to allocate. - * - * @return - * 0 on success, errno value on failure. */ -static int -txq_alloc_elts(struct txq *txq, unsigned int elts_n) +static void +txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n) { unsigned int i; - struct txq_elt (*elts)[elts_n] = - rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket); - linear_t (*elts_linear)[elts_n] = - rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0, - txq->socket); - struct ibv_mr *mr_linear = NULL; - int ret = 0; - if ((elts == NULL) || (elts_linear == NULL)) { - ERROR("%p: can't allocate packets array", (void *)txq); - ret = ENOMEM; - goto error; - } - mr_linear = - ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear), - (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE)); - if (mr_linear == NULL) { - ERROR("%p: unable to configure MR, ibv_reg_mr() failed", - (void *)txq); - ret = EINVAL; - goto error; - } - for (i = 0; (i != elts_n); ++i) { - struct txq_elt *elt = &(*elts)[i]; + for (i = 0; (i != elts_n); ++i) + (*txq_ctrl->txq.elts)[i] = NULL; + for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) { + volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i]; - elt->buf = NULL; + memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe)); } - DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n); - txq->elts_n = elts_n; - txq->elts = elts; - txq->elts_head = 0; - txq->elts_tail = 0; - txq->elts_comp = 0; - /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or - * at least 4 times per ring. */ - txq->elts_comp_cd_init = - ((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ? - MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4)); - txq->elts_comp_cd = txq->elts_comp_cd_init; - txq->elts_linear = elts_linear; - txq->mr_linear = mr_linear; - assert(ret == 0); - return 0; -error: - if (mr_linear != NULL) - claim_zero(ibv_dereg_mr(mr_linear)); - - rte_free(elts_linear); - rte_free(elts); - - DEBUG("%p: failed, freed everything", (void *)txq); - assert(ret > 0); - return ret; + DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n); + txq_ctrl->txq.elts_head = 0; + txq_ctrl->txq.elts_tail = 0; + txq_ctrl->txq.elts_comp = 0; } /** * Free TX queue elements. * - * @param txq + * @param txq_ctrl * Pointer to TX queue structure. */ static void -txq_free_elts(struct txq *txq) +txq_free_elts(struct txq_ctrl *txq_ctrl) { - unsigned int elts_n = txq->elts_n; - unsigned int elts_head = txq->elts_head; - unsigned int elts_tail = txq->elts_tail; - struct txq_elt (*elts)[elts_n] = txq->elts; - linear_t (*elts_linear)[elts_n] = txq->elts_linear; - struct ibv_mr *mr_linear = txq->mr_linear; + unsigned int elts_n = txq_ctrl->txq.elts_n; + unsigned int elts_head = txq_ctrl->txq.elts_head; + unsigned int elts_tail = txq_ctrl->txq.elts_tail; + struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts; - DEBUG("%p: freeing WRs", (void *)txq); - txq->elts_n = 0; - txq->elts_head = 0; - txq->elts_tail = 0; - txq->elts_comp = 0; - txq->elts_comp_cd = 0; - txq->elts_comp_cd_init = 0; - txq->elts = NULL; - txq->elts_linear = NULL; - txq->mr_linear = NULL; - if (mr_linear != NULL) - claim_zero(ibv_dereg_mr(mr_linear)); + DEBUG("%p: freeing WRs", (void *)txq_ctrl); + txq_ctrl->txq.elts_head = 0; + txq_ctrl->txq.elts_tail = 0; + txq_ctrl->txq.elts_comp = 0; - rte_free(elts_linear); - if (elts == NULL) - return; while (elts_tail != elts_head) { - struct txq_elt *elt = &(*elts)[elts_tail]; + struct rte_mbuf *elt = (*elts)[elts_tail]; - assert(elt->buf != NULL); - rte_pktmbuf_free(elt->buf); + assert(elt != NULL); + rte_pktmbuf_free(elt); #ifndef NDEBUG /* Poisoning. */ - memset(elt, 0x77, sizeof(*elt)); + memset(&(*elts)[elts_tail], + 0x77, + sizeof((*elts)[elts_tail])); #endif if (++elts_tail == elts_n) elts_tail = 0; } - rte_free(elts); } /** @@ -187,66 +132,104 @@ txq_free_elts(struct txq *txq) * * Destroy objects, free allocated memory and reset the structure for reuse. * - * @param txq + * @param txq_ctrl * Pointer to TX queue structure. */ void -txq_cleanup(struct txq *txq) +txq_cleanup(struct txq_ctrl *txq_ctrl) { struct ibv_exp_release_intf_params params; size_t i; - DEBUG("cleaning up %p", (void *)txq); - txq_free_elts(txq); - txq->poll_cnt = NULL; -#if MLX5_PMD_MAX_INLINE > 0 - txq->send_pending_inline = NULL; -#endif - txq->send_flush = NULL; - if (txq->if_qp != NULL) { - assert(txq->priv != NULL); - assert(txq->priv->ctx != NULL); - assert(txq->qp != NULL); + DEBUG("cleaning up %p", (void *)txq_ctrl); + txq_free_elts(txq_ctrl); + if (txq_ctrl->if_qp != NULL) { + assert(txq_ctrl->priv != NULL); + assert(txq_ctrl->priv->ctx != NULL); + assert(txq_ctrl->qp != NULL); params = (struct ibv_exp_release_intf_params){ .comp_mask = 0, }; - claim_zero(ibv_exp_release_intf(txq->priv->ctx, - txq->if_qp, + claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx, + txq_ctrl->if_qp, ¶ms)); } - if (txq->if_cq != NULL) { - assert(txq->priv != NULL); - assert(txq->priv->ctx != NULL); - assert(txq->cq != NULL); + if (txq_ctrl->if_cq != NULL) { + assert(txq_ctrl->priv != NULL); + assert(txq_ctrl->priv->ctx != NULL); + assert(txq_ctrl->cq != NULL); params = (struct ibv_exp_release_intf_params){ .comp_mask = 0, }; - claim_zero(ibv_exp_release_intf(txq->priv->ctx, - txq->if_cq, + claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx, + txq_ctrl->if_cq, ¶ms)); } - if (txq->qp != NULL) - claim_zero(ibv_destroy_qp(txq->qp)); - if (txq->cq != NULL) - claim_zero(ibv_destroy_cq(txq->cq)); - if (txq->rd != NULL) { + if (txq_ctrl->qp != NULL) + claim_zero(ibv_destroy_qp(txq_ctrl->qp)); + if (txq_ctrl->cq != NULL) + claim_zero(ibv_destroy_cq(txq_ctrl->cq)); + if (txq_ctrl->rd != NULL) { struct ibv_exp_destroy_res_domain_attr attr = { .comp_mask = 0, }; - assert(txq->priv != NULL); - assert(txq->priv->ctx != NULL); - claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx, - txq->rd, + assert(txq_ctrl->priv != NULL); + assert(txq_ctrl->priv->ctx != NULL); + claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->priv->ctx, + txq_ctrl->rd, &attr)); } - for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { - if (txq->mp2mr[i].mp == NULL) + for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) { + if (txq_ctrl->txq.mp2mr[i].mp == NULL) break; - assert(txq->mp2mr[i].mr != NULL); - claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr)); + assert(txq_ctrl->txq.mp2mr[i].mr != NULL); + claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[i].mr)); + } + memset(txq_ctrl, 0, sizeof(*txq_ctrl)); +} + +/** + * Initialize TX queue. + * + * @param tmpl + * Pointer to TX queue control template. + * @param txq_ctrl + * Pointer to TX queue control. + * + * @return + * 0 on success, errno value on failure. + */ +static inline int +txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl) +{ + struct mlx5_qp *qp = to_mqp(tmpl->qp); + struct ibv_cq *ibcq = tmpl->cq; + struct mlx5_cq *cq = to_mxxx(cq, cq); + + if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) { + ERROR("Wrong MLX5_CQE_SIZE environment variable value: " + "it should be set to %u", RTE_CACHE_LINE_SIZE); + return EINVAL; } - memset(txq, 0, sizeof(*txq)); + tmpl->txq.cqe_n = ibcq->cqe + 1; + tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8; + tmpl->txq.wqes = + (volatile union mlx5_wqe (*)[]) + (uintptr_t)qp->gen_data.sqstart; + tmpl->txq.wqe_n = qp->sq.wqe_cnt; + tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR]; + tmpl->txq.bf_reg = qp->gen_data.bf->reg; + tmpl->txq.bf_offset = qp->gen_data.bf->offset; + tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size; + tmpl->txq.cq_db = cq->dbrec; + tmpl->txq.cqes = + (volatile struct mlx5_cqe (*)[]) + (uintptr_t)cq->active_buf->buf; + tmpl->txq.elts = + (struct rte_mbuf *(*)[tmpl->txq.elts_n]) + ((uintptr_t)txq_ctrl + sizeof(*txq_ctrl)); + return 0; } /** @@ -254,7 +237,7 @@ txq_cleanup(struct txq *txq) * * @param dev * Pointer to Ethernet device structure. - * @param txq + * @param txq_ctrl * Pointer to TX queue structure. * @param desc * Number of descriptors to configure in queue. @@ -267,13 +250,14 @@ txq_cleanup(struct txq *txq) * 0 on success, errno value on failure. */ int -txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, - unsigned int socket, const struct rte_eth_txconf *conf) +txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, + uint16_t desc, unsigned int socket, + const struct rte_eth_txconf *conf) { struct priv *priv = mlx5_get_priv(dev); - struct txq tmpl = { + struct txq_ctrl tmpl = { .priv = priv, - .socket = socket + .socket = socket, }; union { struct ibv_exp_query_intf_params params; @@ -281,17 +265,19 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, struct ibv_exp_res_domain_init_attr rd; struct ibv_exp_cq_init_attr cq; struct ibv_exp_qp_attr mod; + struct ibv_exp_cq_attr cq_attr; } attr; enum ibv_exp_query_intf_status status; int ret = 0; - (void)conf; /* Thresholds configuration (ignored). */ - if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) { - ERROR("%p: invalid number of TX descriptors (must be a" - " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N); - return EINVAL; + if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) { + ret = ENOTSUP; + ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set"); + goto error; } - desc /= MLX5_PMD_SGE_WR_N; + (void)conf; /* Thresholds configuration (ignored). */ + assert(desc > MLX5_TX_COMP_THRESH); + tmpl.txq.elts_n = desc; /* MRs will be registered in mp2mr[] later. */ attr.rd = (struct ibv_exp_res_domain_init_attr){ .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | @@ -310,7 +296,10 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, .res_domain = tmpl.rd, }; - tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq); + tmpl.cq = ibv_exp_create_cq(priv->ctx, + (((desc / MLX5_TX_COMP_THRESH) - 1) ? + ((desc / MLX5_TX_COMP_THRESH) - 1) : 1), + NULL, NULL, 0, &attr.cq); if (tmpl.cq == NULL) { ret = ENOMEM; ERROR("%p: CQ creation failure: %s", @@ -331,14 +320,14 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, .max_send_wr = ((priv->device_attr.max_qp_wr < desc) ? priv->device_attr.max_qp_wr : desc), - /* Max number of scatter/gather elements in a WR. */ - .max_send_sge = ((priv->device_attr.max_sge < - MLX5_PMD_SGE_WR_N) ? - priv->device_attr.max_sge : - MLX5_PMD_SGE_WR_N), -#if MLX5_PMD_MAX_INLINE > 0 - .max_inline_data = MLX5_PMD_MAX_INLINE, -#endif + /* + * Max number of scatter/gather elements in a WR, + * must be 1 to prevent libmlx5 from trying to affect + * too much memory. TX gather is not impacted by the + * priv->device_attr.max_sge limit and will still work + * properly. + */ + .max_send_sge = 1, }, .qp_type = IBV_QPT_RAW_PACKET, /* Do *NOT* enable this, completions events are managed per @@ -349,6 +338,10 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | IBV_EXP_QP_INIT_ATTR_RES_DOMAIN), }; + if (priv->txq_inline && priv->txqs_n >= priv->txqs_inline) { + tmpl.txq.max_inline = priv->txq_inline; + attr.init.cap.max_inline_data = tmpl.txq.max_inline; + } tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init); if (tmpl.qp == NULL) { ret = (errno ? errno : EINVAL); @@ -356,10 +349,11 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, (void *)dev, strerror(ret)); goto error; } -#if MLX5_PMD_MAX_INLINE > 0 - /* ibv_create_qp() updates this value. */ - tmpl.max_inline = attr.init.cap.max_inline_data; -#endif + DEBUG("TX queue capabilities: max_send_wr=%u, max_send_sge=%u," + " max_inline_data=%u", + attr.init.cap.max_send_wr, + attr.init.cap.max_send_sge, + attr.init.cap.max_inline_data); attr.mod = (struct ibv_exp_qp_attr){ /* Move the QP to this state. */ .qp_state = IBV_QPS_INIT, @@ -373,12 +367,13 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, (void *)dev, strerror(ret)); goto error; } - ret = txq_alloc_elts(&tmpl, desc); + ret = txq_setup(&tmpl, txq_ctrl); if (ret) { - ERROR("%p: TXQ allocation failed: %s", + ERROR("%p: cannot initialize TX queue structure: %s", (void *)dev, strerror(ret)); goto error; } + txq_alloc_elts(&tmpl, desc); attr.mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RTR }; @@ -410,17 +405,13 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, attr.params = (struct ibv_exp_query_intf_params){ .intf_scope = IBV_EXP_INTF_GLOBAL, .intf = IBV_EXP_INTF_QP_BURST, - .obj = tmpl.qp, -#ifdef HAVE_VERBS_VLAN_INSERTION .intf_version = 1, -#endif -#ifdef HAVE_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR + .obj = tmpl.qp, /* Enable multi-packet send if supported. */ .family_flags = - (priv->mps ? + ((priv->mps && !priv->sriov) ? IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR : 0), -#endif }; tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status); if (tmpl.if_qp == NULL) { @@ -430,30 +421,12 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, goto error; } /* Clean up txq in case we're reinitializing it. */ - DEBUG("%p: cleaning-up old txq just in case", (void *)txq); - txq_cleanup(txq); - *txq = tmpl; - txq->poll_cnt = txq->if_cq->poll_cnt; -#if MLX5_PMD_MAX_INLINE > 0 - txq->send_pending_inline = txq->if_qp->send_pending_inline; -#ifdef HAVE_VERBS_VLAN_INSERTION - txq->send_pending_inline_vlan = txq->if_qp->send_pending_inline_vlan; -#endif -#endif -#if MLX5_PMD_SGE_WR_N > 1 - txq->send_pending_sg_list = txq->if_qp->send_pending_sg_list; -#ifdef HAVE_VERBS_VLAN_INSERTION - txq->send_pending_sg_list_vlan = txq->if_qp->send_pending_sg_list_vlan; -#endif -#endif - txq->send_pending = txq->if_qp->send_pending; -#ifdef HAVE_VERBS_VLAN_INSERTION - txq->send_pending_vlan = txq->if_qp->send_pending_vlan; -#endif - txq->send_flush = txq->if_qp->send_flush; - DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl); + DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl); + txq_cleanup(txq_ctrl); + *txq_ctrl = tmpl; + DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl); /* Pre-register known mempools. */ - rte_mempool_walk(txq_mp2mr_iter, txq); + rte_mempool_walk(txq_mp2mr_iter, txq_ctrl); assert(ret == 0); return 0; error: @@ -485,12 +458,26 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, { struct priv *priv = dev->data->dev_private; struct txq *txq = (*priv->txqs)[idx]; + struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq); int ret; if (mlx5_is_secondary()) return -E_RTE_SECONDARY; priv_lock(priv); + if (desc <= MLX5_TX_COMP_THRESH) { + WARN("%p: number of descriptors requested for TX queue %u" + " must be higher than MLX5_TX_COMP_THRESH, using" + " %u instead of %u", + (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc); + desc = MLX5_TX_COMP_THRESH + 1; + } + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + WARN("%p: increased number of descriptors in TX queue %u" + " to the next power of two (%d)", + (void *)dev, idx, desc); + } DEBUG("%p: configuring queue %u for %u descriptors", (void *)dev, idx, desc); if (idx >= priv->txqs_n) { @@ -507,26 +494,30 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, return -EEXIST; } (*priv->txqs)[idx] = NULL; - txq_cleanup(txq); + txq_cleanup(txq_ctrl); } else { - txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket); - if (txq == NULL) { + txq_ctrl = + rte_calloc_socket("TXQ", 1, + sizeof(*txq_ctrl) + + desc * sizeof(struct rte_mbuf *), + 0, socket); + if (txq_ctrl == NULL) { ERROR("%p: unable to allocate queue index %u", (void *)dev, idx); priv_unlock(priv); return -ENOMEM; } } - ret = txq_setup(dev, txq, desc, socket, conf); + ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf); if (ret) - rte_free(txq); + rte_free(txq_ctrl); else { - txq->stats.idx = idx; + txq_ctrl->txq.stats.idx = idx; DEBUG("%p: adding TX queue %p to list", - (void *)dev, (void *)txq); - (*priv->txqs)[idx] = txq; + (void *)dev, (void *)txq_ctrl); + (*priv->txqs)[idx] = &txq_ctrl->txq; /* Update send callback. */ - dev->tx_pkt_burst = mlx5_tx_burst; + priv_select_tx_function(priv); } priv_unlock(priv); return -ret; @@ -542,6 +533,7 @@ void mlx5_tx_queue_release(void *dpdk_txq) { struct txq *txq = (struct txq *)dpdk_txq; + struct txq_ctrl *txq_ctrl; struct priv *priv; unsigned int i; @@ -550,17 +542,18 @@ mlx5_tx_queue_release(void *dpdk_txq) if (txq == NULL) return; - priv = txq->priv; + txq_ctrl = container_of(txq, struct txq_ctrl, txq); + priv = txq_ctrl->priv; priv_lock(priv); for (i = 0; (i != priv->txqs_n); ++i) if ((*priv->txqs)[i] == txq) { DEBUG("%p: removing TX queue %p from list", - (void *)priv->dev, (void *)txq); + (void *)priv->dev, (void *)txq_ctrl); (*priv->txqs)[i] = NULL; break; } - txq_cleanup(txq); - rte_free(txq); + txq_cleanup(txq_ctrl); + rte_free(txq_ctrl); priv_unlock(priv); } @@ -585,7 +578,8 @@ mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = dpdk_txq; - struct priv *priv = mlx5_secondary_data_setup(txq->priv); + struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq); + struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv); struct priv *primary_priv; unsigned int index; diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c index ea7af1e4..4719e697 100644 --- a/drivers/net/mlx5/mlx5_vlan.c +++ b/drivers/net/mlx5/mlx5_vlan.c @@ -144,7 +144,7 @@ static void priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on) { struct rxq *rxq = (*priv->rxqs)[idx]; -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS + struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); struct ibv_exp_wq_attr mod; uint16_t vlan_offloads = (on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) | @@ -158,15 +158,13 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on) .vlan_offloads = vlan_offloads, }; - err = ibv_exp_modify_wq(rxq->wq, &mod); + err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod); if (err) { ERROR("%p: failed to modified stripping mode: %s", (void *)priv, strerror(err)); return; } -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ - /* Update related bits in RX queue. */ rxq->vlan_strip = !!on; } @@ -218,7 +216,7 @@ mlx5_vlan_offload_set(struct rte_eth_dev *dev, int mask) unsigned int i; if (mask & ETH_VLAN_STRIP_MASK) { - int hw_vlan_strip = dev->data->dev_conf.rxmode.hw_vlan_strip; + int hw_vlan_strip = !!dev->data->dev_conf.rxmode.hw_vlan_strip; if (!priv->hw_vlan_strip) { ERROR("VLAN stripping is not supported"); |