From f239aed5e674965691846e8ce3f187dd47523689 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Wed, 16 Aug 2017 18:42:05 +0100 Subject: New upstream version 17.08 Change-Id: I288b50990f52646089d6b1f3aaa6ba2f091a51d7 Signed-off-by: Luca Boccassi --- drivers/net/mlx5/Makefile | 10 +- drivers/net/mlx5/mlx5.c | 28 + drivers/net/mlx5/mlx5.h | 5 + drivers/net/mlx5/mlx5_defs.h | 18 + drivers/net/mlx5/mlx5_ethdev.c | 200 ++--- drivers/net/mlx5/mlx5_fdir.c | 1 + drivers/net/mlx5/mlx5_flow.c | 50 +- drivers/net/mlx5/mlx5_mac.c | 2 + drivers/net/mlx5/mlx5_mr.c | 17 +- drivers/net/mlx5/mlx5_rxmode.c | 2 + drivers/net/mlx5/mlx5_rxq.c | 263 ++++--- drivers/net/mlx5/mlx5_rxtx.c | 667 ++++++---------- drivers/net/mlx5/mlx5_rxtx.h | 297 ++++++- drivers/net/mlx5/mlx5_rxtx_vec_sse.c | 1417 ++++++++++++++++++++++++++++++++++ drivers/net/mlx5/mlx5_trigger.c | 19 +- drivers/net/mlx5/mlx5_txq.c | 24 +- 16 files changed, 2296 insertions(+), 724 deletions(-) create mode 100644 drivers/net/mlx5/mlx5_rxtx_vec_sse.c (limited to 'drivers/net/mlx5') diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index c0799591..8736de5d 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -39,6 +39,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxq.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txq.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx.c +ifeq ($(CONFIG_RTE_ARCH_X86_64),y) +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx_vec_sse.c +endif SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_trigger.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_ethdev.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mac.c @@ -52,7 +55,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c # Basic CFLAGS. CFLAGS += -O3 -CFLAGS += -std=gnu99 -Wall -Wextra +CFLAGS += -std=c11 -Wall -Wextra CFLAGS += -g CFLAGS += -I. CFLAGS += -D_BSD_SOURCE @@ -100,6 +103,11 @@ mlx5_autoconf.h.new: FORCE mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh $Q $(RM) -f -- '$@' + $Q sh -- '$<' '$@' \ + HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP \ + infiniband/verbs_exp.h \ + enum IBV_EXP_FLOW_SPEC_ACTION_DROP \ + $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \ infiniband/verbs_exp.h \ diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index bcb2c1b2..b7e50463 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -94,6 +94,12 @@ /* Device parameter to enable hardware TSO offload. */ #define MLX5_TSO "tso" +/* Device parameter to enable hardware Tx vector. */ +#define MLX5_TX_VEC_EN "tx_vec_en" + +/* Device parameter to enable hardware Rx vector. */ +#define MLX5_RX_VEC_EN "rx_vec_en" + /* Default PMD specific parameter value. */ #define MLX5_ARG_UNSET (-1) @@ -105,6 +111,8 @@ struct mlx5_args { int mpw_hdr_dseg; int inline_max_packet_sz; int tso; + int tx_vec_en; + int rx_vec_en; }; /** * Retrieve integer value from environment variable. 
@@ -246,8 +254,10 @@ static const struct eth_dev_ops mlx5_dev_ops = { .filter_ctrl = mlx5_dev_filter_ctrl, .rx_descriptor_status = mlx5_rx_descriptor_status, .tx_descriptor_status = mlx5_tx_descriptor_status, +#ifdef HAVE_UPDATE_CQ_CI .rx_queue_intr_enable = mlx5_rx_intr_enable, .rx_queue_intr_disable = mlx5_rx_intr_disable, +#endif }; static struct { @@ -322,6 +332,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque) args->inline_max_packet_sz = tmp; } else if (strcmp(MLX5_TSO, key) == 0) { args->tso = !!tmp; + } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) { + args->tx_vec_en = !!tmp; + } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) { + args->rx_vec_en = !!tmp; } else { WARN("%s: unknown parameter", key); return -EINVAL; @@ -351,6 +365,8 @@ mlx5_args(struct mlx5_args *args, struct rte_devargs *devargs) MLX5_TXQ_MPW_HDR_DSEG_EN, MLX5_TXQ_MAX_INLINE_LEN, MLX5_TSO, + MLX5_TX_VEC_EN, + MLX5_RX_VEC_EN, NULL, }; struct rte_kvargs *kvlist; @@ -406,6 +422,10 @@ mlx5_args_assign(struct priv *priv, struct mlx5_args *args) priv->inline_max_packet_sz = args->inline_max_packet_sz; if (args->tso != MLX5_ARG_UNSET) priv->tso = args->tso; + if (args->tx_vec_en != MLX5_ARG_UNSET) + priv->tx_vec_en = args->tx_vec_en; + if (args->rx_vec_en != MLX5_ARG_UNSET) + priv->rx_vec_en = args->rx_vec_en; } /** @@ -551,6 +571,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) .mpw_hdr_dseg = MLX5_ARG_UNSET, .inline_max_packet_sz = MLX5_ARG_UNSET, .tso = MLX5_ARG_UNSET, + .tx_vec_en = MLX5_ARG_UNSET, + .rx_vec_en = MLX5_ARG_UNSET, }; exp_device_attr.comp_mask = @@ -613,6 +635,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) priv->mps = mps; /* Enable MPW by default if supported. */ priv->cqe_comp = 1; /* Enable compression by default. */ priv->tunnel_en = tunnel_en; + /* Enable vector by default if supported. */ + priv->tx_vec_en = 1; + priv->rx_vec_en = 1; err = mlx5_args(&args, pci_dev->device.devargs); if (err) { ERROR("failed to process device arguments: %s", @@ -786,6 +811,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) eth_dev->device = &pci_dev->device; rte_eth_copy_pci_info(eth_dev, pci_dev); + eth_dev->data->dev_flags |= RTE_ETH_DEV_DETACHABLE; eth_dev->device->driver = &mlx5_driver.driver; priv->dev = eth_dev; eth_dev->dev_ops = &mlx5_dev_ops; @@ -885,6 +911,8 @@ RTE_INIT(rte_mlx5_pmd_init); static void rte_mlx5_pmd_init(void) { + /* Build the static table for ptype conversion. */ + mlx5_set_ptype_table(); /* * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use * huge pages. Calling ibv_fork_init() during init allows diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 1148dee3..43c53841 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -54,6 +54,7 @@ #ifdef PEDANTIC #pragma GCC diagnostic ignored "-Wpedantic" #endif +#include #include #include #include @@ -129,6 +130,9 @@ struct priv { unsigned int pending_alarm:1; /* An alarm is pending. */ unsigned int tso:1; /* Whether TSO is supported. */ unsigned int tunnel_en:1; + unsigned int isolated:1; /* Whether isolated mode is enabled. */ + unsigned int tx_vec_en:1; /* Whether Tx vector is enabled. */ + unsigned int rx_vec_en:1; /* Whether Rx vector is enabled. */ /* Whether Tx offloads for tunneled packets are supported. */ unsigned int max_tso_payload_sz; /* Maximum TCP payload for TSO. */ unsigned int txq_inline; /* Maximum packet size for inlining. 
*/ @@ -311,6 +315,7 @@ struct rte_flow *mlx5_flow_create(struct rte_eth_dev *, int mlx5_flow_destroy(struct rte_eth_dev *, struct rte_flow *, struct rte_flow_error *); int mlx5_flow_flush(struct rte_eth_dev *, struct rte_flow_error *); +int mlx5_flow_isolate(struct rte_eth_dev *, int, struct rte_flow_error *); int priv_flow_start(struct priv *); void priv_flow_stop(struct priv *); int priv_flow_rxq_in_use(struct priv *, struct rxq *); diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index 201bb336..a76bc6f6 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -89,4 +89,22 @@ /* Maximum Packet headers size (L2+L3+L4) for TSO. */ #define MLX5_MAX_TSO_HEADER 128 +/* Default minimum number of Tx queues for vectorized Tx. */ +#define MLX5_VPMD_MIN_TXQS 4 + +/* Threshold of buffer replenishment for vectorized Rx. */ +#define MLX5_VPMD_RXQ_RPLNSH_THRESH 64U + +/* Maximum size of burst for vectorized Rx. */ +#define MLX5_VPMD_RX_MAX_BURST MLX5_VPMD_RXQ_RPLNSH_THRESH + +/* + * Maximum size of burst for vectorized Tx. This is related to the maximum size + * of Enhaned MPW (eMPW) WQE as vectorized Tx is supported with eMPW. + */ +#define MLX5_VPMD_TX_MAX_BURST 32U + +/* Number of packets vectorized Rx can simultaneously process in a loop. */ +#define MLX5_VPMD_DESCS_PER_LOOP 4 + #endif /* RTE_PMD_MLX5_DEFS_H_ */ diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 3fd22cb8..b0eb3cdf 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -119,6 +119,7 @@ struct ethtool_link_settings { #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39 #endif +#define ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 (SCHAR_MAX) /** * Return private structure associated with an Ethernet device. @@ -661,7 +662,7 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) unsigned int max; char ifname[IF_NAMESIZE]; - info->pci_dev = RTE_DEV_TO_PCI(dev->device); + info->pci_dev = RTE_ETH_DEV_TO_PCI(dev); priv_lock(priv); /* FIXME: we should ask the device for these values. 
*/ @@ -715,15 +716,24 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) { static const uint32_t ptypes[] = { /* refers to rxq_cq_to_pkt_type() */ + RTE_PTYPE_L2_ETHER, RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_L4_NONFRAG, + RTE_PTYPE_L4_FRAG, + RTE_PTYPE_L4_TCP, + RTE_PTYPE_L4_UDP, RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_INNER_L4_NONFRAG, + RTE_PTYPE_INNER_L4_FRAG, + RTE_PTYPE_INNER_L4_TCP, + RTE_PTYPE_INNER_L4_UDP, RTE_PTYPE_UNKNOWN - }; - if (dev->rx_pkt_burst == mlx5_rx_burst) + if (dev->rx_pkt_burst == mlx5_rx_burst || + dev->rx_pkt_burst == mlx5_rx_burst_vec) return ptypes; return NULL; } @@ -806,9 +816,12 @@ static int mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) { struct priv *priv = mlx5_get_priv(dev); - struct ethtool_link_settings edata = { - .cmd = ETHTOOL_GLINKSETTINGS, - }; + __extension__ struct { + struct ethtool_link_settings edata; + uint32_t link_mode_data[3 * + ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]; + } ecmd; + struct ifreq ifr; struct rte_eth_link dev_link; uint64_t sc; @@ -821,15 +834,23 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) memset(&dev_link, 0, sizeof(dev_link)); dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING)); - ifr.ifr_data = (void *)&edata; + memset(&ecmd, 0, sizeof(ecmd)); + ecmd.edata.cmd = ETHTOOL_GLINKSETTINGS; + ifr.ifr_data = (void *)&ecmd; + if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { + DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", + strerror(errno)); + return -1; + } + ecmd.edata.link_mode_masks_nwords = -ecmd.edata.link_mode_masks_nwords; if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s", strerror(errno)); return -1; } - dev_link.link_speed = edata.speed; - sc = edata.link_mode_masks[0] | - ((uint64_t)edata.link_mode_masks[1] << 32); + dev_link.link_speed = ecmd.edata.speed; + sc = ecmd.edata.link_mode_masks[0] | + ((uint64_t)ecmd.edata.link_mode_masks[1] << 32); priv->link_speed_capa = 0; if (sc & ETHTOOL_LINK_MODE_Autoneg_BIT) priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG; @@ -865,7 +886,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete) ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT | ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)) priv->link_speed_capa |= ETH_LINK_SPEED_100G; - dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? + dev_link.link_duplex = ((ecmd.edata.duplex == DUPLEX_HALF) ? ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & ETH_LINK_SPEED_FIXED); @@ -903,12 +924,6 @@ mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete) /** * DPDK callback to change the MTU. * - * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be - * received). Use this as a hint to enable/disable scattered packets support - * and improve performance when not needed. - * Since failure is not an option, reconfiguring queues on the fly is not - * recommended. - * * @param dev * Pointer to Ethernet device structure. 
* @param in_mtu @@ -921,123 +936,33 @@ int mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) { struct priv *priv = dev->data->dev_private; + uint16_t kern_mtu; int ret = 0; - unsigned int i; - uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = - mlx5_rx_burst; - unsigned int max_frame_len; - int rehash; - int restart = priv->started; if (mlx5_is_secondary()) return -E_RTE_SECONDARY; priv_lock(priv); + ret = priv_get_mtu(priv, &kern_mtu); + if (ret) + goto out; /* Set kernel interface MTU first. */ - if (priv_set_mtu(priv, mtu)) { - ret = errno; - WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, - strerror(ret)); + ret = priv_set_mtu(priv, mtu); + if (ret) goto out; - } else + ret = priv_get_mtu(priv, &kern_mtu); + if (ret) + goto out; + if (kern_mtu == mtu) { + priv->mtu = mtu; DEBUG("adapter port %u MTU set to %u", priv->port, mtu); - /* Temporarily replace RX handler with a fake one, assuming it has not - * been copied elsewhere. */ - dev->rx_pkt_burst = removed_rx_burst; - /* Make sure everyone has left mlx5_rx_burst() and uses - * removed_rx_burst() instead. */ - rte_wmb(); - usleep(1000); - /* MTU does not include header and CRC. */ - max_frame_len = ETHER_HDR_LEN + mtu + ETHER_CRC_LEN; - /* Check if at least one queue is going to need a SGE update. */ - for (i = 0; i != priv->rxqs_n; ++i) { - struct rxq *rxq = (*priv->rxqs)[i]; - unsigned int mb_len; - unsigned int size = RTE_PKTMBUF_HEADROOM + max_frame_len; - unsigned int sges_n; - - if (rxq == NULL) - continue; - mb_len = rte_pktmbuf_data_room_size(rxq->mp); - assert(mb_len >= RTE_PKTMBUF_HEADROOM); - /* - * Determine the number of SGEs needed for a full packet - * and round it to the next power of two. - */ - sges_n = log2above((size / mb_len) + !!(size % mb_len)); - if (sges_n != rxq->sges_n) - break; - } - /* - * If all queues have the right number of SGEs, a simple rehash - * of their buffers is enough, otherwise SGE information can only - * be updated in a queue by recreating it. All resources that depend - * on queues (flows, indirection tables) must be recreated as well in - * that case. - */ - rehash = (i == priv->rxqs_n); - if (!rehash) { - /* Clean up everything as with mlx5_dev_stop(). */ - priv_special_flow_disable_all(priv); - priv_mac_addrs_disable(priv); - priv_destroy_hash_rxqs(priv); - priv_fdir_disable(priv); - priv_dev_interrupt_handler_uninstall(priv, dev); } -recover: - /* Reconfigure each RX queue. */ - for (i = 0; (i != priv->rxqs_n); ++i) { - struct rxq *rxq = (*priv->rxqs)[i]; - struct rxq_ctrl *rxq_ctrl = - container_of(rxq, struct rxq_ctrl, rxq); - unsigned int mb_len; - unsigned int tmp; - - if (rxq == NULL) - continue; - mb_len = rte_pktmbuf_data_room_size(rxq->mp); - assert(mb_len >= RTE_PKTMBUF_HEADROOM); - /* Provide new values to rxq_setup(). */ - dev->data->dev_conf.rxmode.jumbo_frame = - (max_frame_len > ETHER_MAX_LEN); - dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; - if (rehash) - ret = rxq_rehash(dev, rxq_ctrl); - else - ret = rxq_ctrl_setup(dev, rxq_ctrl, 1 << rxq->elts_n, - rxq_ctrl->socket, NULL, rxq->mp); - if (!ret) - continue; - /* Attempt to roll back in case of error. */ - tmp = (mb_len << rxq->sges_n) - RTE_PKTMBUF_HEADROOM; - if (max_frame_len != tmp) { - max_frame_len = tmp; - goto recover; - } - /* Double fault, disable RX. */ - break; - } - /* - * Use a safe RX burst function in case of error, otherwise mimic - * mlx5_dev_start(). 
- */ - if (ret) { - ERROR("unable to reconfigure RX queues, RX disabled"); - rx_func = removed_rx_burst; - } else if (restart && - !rehash && - !priv_create_hash_rxqs(priv) && - !priv_rehash_flows(priv)) { - if (dev->data->dev_conf.fdir_conf.mode == RTE_FDIR_MODE_NONE) - priv_fdir_enable(priv); - priv_dev_interrupt_handler_install(priv, dev); - } - priv->mtu = mtu; - /* Burst functions can now be called again. */ - rte_wmb(); - dev->rx_pkt_burst = rx_func; + priv_unlock(priv); + return 0; out: + ret = errno; + WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, + strerror(ret)); priv_unlock(priv); assert(ret >= 0); return -ret; @@ -1185,7 +1110,7 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device, /* Extract information. */ if (sscanf(line, "PCI_SLOT_NAME=" - "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", + "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", &pci_addr->domain, &pci_addr->bus, &pci_addr->devid, @@ -1262,7 +1187,8 @@ mlx5_dev_link_status_handler(void *arg) ret = priv_dev_link_status_handler(priv, dev); priv_unlock(priv); if (ret) - _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); + _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL, + NULL); } /** @@ -1284,7 +1210,8 @@ mlx5_dev_interrupt_handler(void *cb_arg) ret = priv_dev_link_status_handler(priv, dev); priv_unlock(priv); if (ret) - _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); + _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL, + NULL); } /** @@ -1584,9 +1511,16 @@ priv_select_tx_function(struct priv *priv) priv->dev->tx_pkt_burst = mlx5_tx_burst; /* Select appropriate TX function. */ if (priv->mps == MLX5_MPW_ENHANCED) { - priv->dev->tx_pkt_burst = - mlx5_tx_burst_empw; - DEBUG("selected Enhanced MPW TX function"); + if (priv_check_vec_tx_support(priv) > 0) { + if (priv_check_raw_vec_tx_support(priv) > 0) + priv->dev->tx_pkt_burst = mlx5_tx_burst_raw_vec; + else + priv->dev->tx_pkt_burst = mlx5_tx_burst_vec; + DEBUG("selected Enhanced MPW TX vectorized function"); + } else { + priv->dev->tx_pkt_burst = mlx5_tx_burst_empw; + DEBUG("selected Enhanced MPW TX function"); + } } else if (priv->mps && priv->txq_inline) { priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline; DEBUG("selected MPW inline TX function"); @@ -1605,5 +1539,11 @@ priv_select_tx_function(struct priv *priv) void priv_select_rx_function(struct priv *priv) { - priv->dev->rx_pkt_burst = mlx5_rx_burst; + if (priv_check_vec_rx_support(priv) > 0) { + priv_prep_vec_rx_function(priv); + priv->dev->rx_pkt_burst = mlx5_rx_burst_vec; + DEBUG("selected RX vectorized function"); + } else { + priv->dev->rx_pkt_burst = mlx5_rx_burst; + } } diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c index c8d47489..34a7e69f 100644 --- a/drivers/net/mlx5/mlx5_fdir.c +++ b/drivers/net/mlx5/mlx5_fdir.c @@ -1053,6 +1053,7 @@ static const struct rte_flow_ops mlx5_flow_ops = { .destroy = mlx5_flow_destroy, .flush = mlx5_flow_flush, .query = NULL, + .isolate = mlx5_flow_isolate, }; /** diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index 8b3957ba..86be9291 100644 --- a/drivers/net/mlx5/mlx5_flow.c +++ b/drivers/net/mlx5/mlx5_flow.c @@ -53,7 +53,11 @@ #include "mlx5_prm.h" /* Number of Work Queue necessary for the DROP queue. 
*/ +#ifndef HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP #define MLX5_DROP_WQ_N 4 +#else +#define MLX5_DROP_WQ_N 1 +#endif static int mlx5_flow_create_eth(const struct rte_flow_item *item, @@ -576,6 +580,10 @@ priv_flow_validate(struct priv *priv, } if (action->mark && !flow->ibv_attr && !action->drop) flow->offset += sizeof(struct ibv_exp_flow_spec_action_tag); +#ifdef HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP + if (!flow->ibv_attr && action->drop) + flow->offset += sizeof(struct ibv_exp_flow_spec_action_drop); +#endif if (!action->queue && !action->drop) { rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, "no valid action"); @@ -993,6 +1001,10 @@ priv_flow_create_action_queue_drop(struct priv *priv, struct rte_flow_error *error) { struct rte_flow *rte_flow; +#ifdef HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP + struct ibv_exp_flow_spec_action_drop *drop; + unsigned int size = sizeof(struct ibv_exp_flow_spec_action_drop); +#endif assert(priv->pd); assert(priv->ctx); @@ -1003,10 +1015,19 @@ priv_flow_create_action_queue_drop(struct priv *priv, return NULL; } rte_flow->drop = 1; +#ifdef HAVE_VERBS_IBV_EXP_FLOW_SPEC_ACTION_DROP + drop = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + *drop = (struct ibv_exp_flow_spec_action_drop){ + .type = IBV_EXP_FLOW_SPEC_ACTION_DROP, + .size = size, + }; + ++flow->ibv_attr->num_of_specs; + flow->offset += sizeof(struct ibv_exp_flow_spec_action_drop); +#endif rte_flow->ibv_attr = flow->ibv_attr; - rte_flow->qp = priv->flow_drop_queue->qp; if (!priv->started) return rte_flow; + rte_flow->qp = priv->flow_drop_queue->qp; rte_flow->ibv_flow = ibv_exp_create_flow(rte_flow->qp, rte_flow->ibv_attr); if (!rte_flow->ibv_flow) { @@ -1579,3 +1600,30 @@ priv_flow_rxq_in_use(struct priv *priv, struct rxq *rxq) } return 0; } + +/** + * Isolated mode. + * + * @see rte_flow_isolate() + * @see rte_flow_ops + */ +int +mlx5_flow_isolate(struct rte_eth_dev *dev, + int enable, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + + priv_lock(priv); + if (priv->started) { + rte_flow_error_set(error, EBUSY, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "port must be stopped first"); + priv_unlock(priv); + return -rte_errno; + } + priv->isolated = !!enable; + priv_unlock(priv); + return 0; +} diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index 79e0c410..8489ea67 100644 --- a/drivers/net/mlx5/mlx5_mac.c +++ b/drivers/net/mlx5/mlx5_mac.c @@ -443,6 +443,8 @@ priv_mac_addrs_enable(struct priv *priv) unsigned int i; int ret; + if (priv->isolated) + return 0; if (!priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC)) return 0; for (i = 0; (i != priv->hash_rxqs_n); ++i) { diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index 0a363846..28733517 100644 --- a/drivers/net/mlx5/mlx5_mr.c +++ b/drivers/net/mlx5/mlx5_mr.c @@ -207,7 +207,8 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx) sizeof(txq_ctrl->txq.mp2mr[0]))); } /* Store the new entry. 
*/ - txq_ctrl->txq.mp2mr[idx].mp = mp; + txq_ctrl->txq.mp2mr[idx].start = (uintptr_t)mr->addr; + txq_ctrl->txq.mp2mr[idx].end = (uintptr_t)mr->addr + mr->length; txq_ctrl->txq.mp2mr[idx].mr = mr; txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey); DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, @@ -265,18 +266,28 @@ txq_mp2mr_iter(struct rte_mempool *mp, void *arg) struct txq_mp2mr_mbuf_check_data data = { .ret = 0, }; + uintptr_t start; + uintptr_t end; unsigned int i; /* Register mempool only if the first element looks like a mbuf. */ if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 || data.ret == -1) return; + if (mlx5_check_mempool(mp, &start, &end) != 0) { + ERROR("mempool %p: not virtually contiguous", + (void *)mp); + return; + } for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) { - if (unlikely(txq_ctrl->txq.mp2mr[i].mp == NULL)) { + struct ibv_mr *mr = txq_ctrl->txq.mp2mr[i].mr; + + if (unlikely(mr == NULL)) { /* Unknown MP, add a new MR for it. */ break; } - if (txq_ctrl->txq.mp2mr[i].mp == mp) + if (start >= (uintptr_t)mr->addr && + end <= (uintptr_t)mr->addr + mr->length) return; } txq_mp2mr_reg(&txq_ctrl->txq, mp, i); diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c index 173e6e84..a67e5426 100644 --- a/drivers/net/mlx5/mlx5_rxmode.c +++ b/drivers/net/mlx5/mlx5_rxmode.c @@ -347,6 +347,8 @@ priv_special_flow_enable_all(struct priv *priv) { enum hash_rxq_flow_type flow_type; + if (priv->isolated) + return 0; for (flow_type = HASH_RXQ_FLOW_TYPE_PROMISC; flow_type != HASH_RXQ_FLOW_TYPE_MAC; ++flow_type) { diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index 2a268398..74387a79 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -363,6 +363,8 @@ priv_create_hash_rxqs(struct priv *priv) assert(priv->hash_rxqs_n == 0); assert(priv->pd != NULL); assert(priv->ctx != NULL); + if (priv->isolated) + return 0; if (priv->rxqs_n == 0) return EINVAL; assert(priv->rxqs != NULL); @@ -631,6 +633,32 @@ priv_rehash_flows(struct priv *priv) return 0; } +/** + * Unlike regular Rx function, vPMD Rx doesn't replace mbufs immediately when + * receiving packets. Instead it replaces later in bulk. In rxq->elts[], entries + * from rq_pi to rq_ci are owned by device but the rest is already delivered to + * application. In order not to reuse those mbufs by rxq_alloc_elts(), this + * function must be called to replace used mbufs. + * + * @param rxq + * Pointer to RX queue structure. + */ +static void +rxq_trim_elts(struct rxq *rxq) +{ + const uint16_t q_n = (1 << rxq->elts_n); + const uint16_t q_mask = q_n - 1; + uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi); + uint16_t i; + + if (!rxq->trim_elts) + return; + for (i = 0; i < used; ++i) + (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL; + rxq->trim_elts = 0; + return; +} + /** * Allocate RX queue elements. * @@ -659,15 +687,13 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n, volatile struct mlx5_wqe_data_seg *scat = &(*rxq_ctrl->rxq.wqes)[i]; - if (pool != NULL) { - buf = (*pool)[i]; - assert(buf != NULL); + buf = (pool != NULL) ? 
(*pool)[i] : NULL; + if (buf != NULL) { rte_pktmbuf_reset(buf); rte_pktmbuf_refcnt_update(buf, 1); } else buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp); if (buf == NULL) { - assert(pool == NULL); ERROR("%p: empty mbuf pool", (void *)rxq_ctrl); ret = ENOMEM; goto error; @@ -722,6 +748,7 @@ rxq_free_elts(struct rxq_ctrl *rxq_ctrl) { unsigned int i; + rxq_trim_elts(&rxq_ctrl->rxq); DEBUG("%p: freeing WRs", (void *)rxq_ctrl); if (rxq_ctrl->rxq.elts == NULL) return; @@ -759,72 +786,6 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl) memset(rxq_ctrl, 0, sizeof(*rxq_ctrl)); } -/** - * Reconfigure RX queue buffers. - * - * rxq_rehash() does not allocate mbufs, which, if not done from the right - * thread (such as a control thread), may corrupt the pool. - * In case of failure, the queue is left untouched. - * - * @param dev - * Pointer to Ethernet device structure. - * @param rxq_ctrl - * RX queue pointer. - * - * @return - * 0 on success, errno value on failure. - */ -int -rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl) -{ - unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n; - unsigned int i; - struct ibv_exp_wq_attr mod; - int err; - - DEBUG("%p: rehashing queue %p with %u SGE(s) per packet", - (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n); - assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n))); - /* From now on, any failure will render the queue unusable. - * Reinitialize WQ. */ - mod = (struct ibv_exp_wq_attr){ - .attr_mask = IBV_EXP_WQ_ATTR_STATE, - .wq_state = IBV_EXP_WQS_RESET, - }; - err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod); - if (err) { - ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err)); - assert(err > 0); - return err; - } - /* Snatch mbufs from original queue. */ - claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts)); - for (i = 0; i != elts_n; ++i) { - struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i]; - - assert(rte_mbuf_refcnt_read(buf) == 2); - rte_pktmbuf_free_seg(buf); - } - /* Change queue state to ready. */ - mod = (struct ibv_exp_wq_attr){ - .attr_mask = IBV_EXP_WQ_ATTR_STATE, - .wq_state = IBV_EXP_WQS_RDY, - }; - err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod); - if (err) { - ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s", - (void *)dev, strerror(err)); - goto error; - } - /* Update doorbell counter. */ - rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n; - rte_wmb(); - *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci); -error: - assert(err >= 0); - return err; -} - /** * Initialize RX queue. * @@ -858,6 +819,7 @@ rxq_setup(struct rxq_ctrl *tmpl) tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt); tmpl->rxq.cq_ci = 0; tmpl->rxq.rq_ci = 0; + tmpl->rxq.rq_pi = 0; tmpl->rxq.cq_db = cq_info.dbrec; tmpl->rxq.wqes = (volatile struct mlx5_wqe_data_seg (*)[]) @@ -888,7 +850,7 @@ rxq_setup(struct rxq_ctrl *tmpl) * @return * 0 on success, errno value on failure. 
*/ -int +static int rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc, unsigned int socket, const struct rte_eth_rxconf *conf, struct rte_mempool *mp) @@ -978,10 +940,10 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, if (dev->data->dev_conf.intr_conf.rxq) { tmpl.channel = ibv_create_comp_channel(priv->ctx); if (tmpl.channel == NULL) { - dev->data->dev_conf.intr_conf.rxq = 0; ret = ENOMEM; - ERROR("%p: Comp Channel creation failure: %s", - (void *)dev, strerror(ret)); + ERROR("%p: Rx interrupt completion channel creation" + " failure: %s", + (void *)dev, strerror(ret)); goto error; } } @@ -991,7 +953,12 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, if (priv->cqe_comp) { attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS; attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE; - cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */ + /* + * For vectorized Rx, it must not be doubled in order to + * make cq_ci and rq_ci aligned. + */ + if (rxq_check_vec_support(&tmpl.rxq) < 0) + cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */ } tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0, &attr.cq); @@ -1101,6 +1068,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, if (rxq_ctrl->rxq.elts_n) { assert(1 << rxq_ctrl->rxq.elts_n == desc); assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts); + rxq_trim_elts(&rxq_ctrl->rxq); ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts); } else ret = rxq_alloc_elts(&tmpl, desc, NULL); @@ -1163,6 +1131,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct priv *priv = dev->data->dev_private; struct rxq *rxq = (*priv->rxqs)[idx]; struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); + const uint16_t desc_pad = MLX5_VPMD_DESCS_PER_LOOP; /* For vPMD. */ int ret; if (mlx5_is_secondary()) @@ -1196,7 +1165,8 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, if (rxq_ctrl->rxq.elts_n != log2above(desc)) { rxq_ctrl = rte_realloc(rxq_ctrl, sizeof(*rxq_ctrl) + - desc * sizeof(struct rte_mbuf *), + (desc + desc_pad) * + sizeof(struct rte_mbuf *), RTE_CACHE_LINE_SIZE); if (!rxq_ctrl) { ERROR("%p: unable to reallocate queue index %u", @@ -1207,7 +1177,8 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, } } else { rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) + - desc * sizeof(struct rte_mbuf *), + (desc + desc_pad) * + sizeof(struct rte_mbuf *), 0, socket); if (rxq_ctrl == NULL) { ERROR("%p: unable to allocate queue index %u", @@ -1224,8 +1195,6 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, DEBUG("%p: adding RX queue %p to list", (void *)dev, (void *)rxq_ctrl); (*priv->rxqs)[idx] = &rxq_ctrl->rxq; - /* Update receive callback. */ - priv_select_rx_function(priv); } priv_unlock(priv); return -ret; @@ -1310,111 +1279,159 @@ mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts, } /** - * Fill epoll fd list for rxq interrupts. + * Allocate queue vector and fill epoll fd list for Rx interrupts. * * @param priv - * Private structure. + * Pointer to private structure. * * @return * 0 on success, negative on failure. 
*/ int -priv_intr_efd_enable(struct priv *priv) +priv_rx_intr_vec_enable(struct priv *priv) { unsigned int i; unsigned int rxqs_n = priv->rxqs_n; unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + unsigned int count = 0; struct rte_intr_handle *intr_handle = priv->dev->intr_handle; - if (n == 0) + if (!priv->dev->data->dev_conf.intr_conf.rxq) return 0; - if (n < rxqs_n) { - WARN("rxqs num is larger than EAL max interrupt vector " - "%u > %u unable to supprt rxq interrupts", - rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); - return -EINVAL; + priv_rx_intr_vec_disable(priv); + intr_handle->intr_vec = malloc(sizeof(intr_handle->intr_vec[rxqs_n])); + if (intr_handle->intr_vec == NULL) { + ERROR("failed to allocate memory for interrupt vector," + " Rx interrupts will not be supported"); + return -ENOMEM; } intr_handle->type = RTE_INTR_HANDLE_EXT; for (i = 0; i != n; ++i) { struct rxq *rxq = (*priv->rxqs)[i]; struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); - int fd = rxq_ctrl->channel->fd; + int fd; int flags; int rc; + /* Skip queues that cannot request interrupts. */ + if (!rxq || !rxq_ctrl->channel) { + /* Use invalid intr_vec[] index to disable entry. */ + intr_handle->intr_vec[i] = + RTE_INTR_VEC_RXTX_OFFSET + + RTE_MAX_RXTX_INTR_VEC_ID; + continue; + } + if (count >= RTE_MAX_RXTX_INTR_VEC_ID) { + ERROR("too many Rx queues for interrupt vector size" + " (%d), Rx interrupts cannot be enabled", + RTE_MAX_RXTX_INTR_VEC_ID); + priv_rx_intr_vec_disable(priv); + return -1; + } + fd = rxq_ctrl->channel->fd; flags = fcntl(fd, F_GETFL); rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK); if (rc < 0) { - WARN("failed to change rxq interrupt file " - "descriptor %d for queue index %d", fd, i); + ERROR("failed to make Rx interrupt file descriptor" + " %d non-blocking for queue index %d", fd, i); + priv_rx_intr_vec_disable(priv); return -1; } - intr_handle->efds[i] = fd; + intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count; + intr_handle->efds[count] = fd; + count++; } - intr_handle->nb_efd = n; + if (!count) + priv_rx_intr_vec_disable(priv); + else + intr_handle->nb_efd = count; return 0; } /** - * Clean epoll fd list for rxq interrupts. + * Clean up Rx interrupts handler. * * @param priv - * Private structure. + * Pointer to private structure. */ void -priv_intr_efd_disable(struct priv *priv) +priv_rx_intr_vec_disable(struct priv *priv) { struct rte_intr_handle *intr_handle = priv->dev->intr_handle; rte_intr_free_epoll_fd(intr_handle); + free(intr_handle->intr_vec); + intr_handle->nb_efd = 0; + intr_handle->intr_vec = NULL; } +#ifdef HAVE_UPDATE_CQ_CI + /** - * Create and init interrupt vector array. + * DPDK callback for Rx queue interrupt enable. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device structure. + * @param rx_queue_id + * Rx queue number. * * @return * 0 on success, negative on failure. 
*/ int -priv_create_intr_vec(struct priv *priv) +mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) { - unsigned int rxqs_n = priv->rxqs_n; - unsigned int i; - struct rte_intr_handle *intr_handle = priv->dev->intr_handle; + struct priv *priv = mlx5_get_priv(dev); + struct rxq *rxq = (*priv->rxqs)[rx_queue_id]; + struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); + int ret; - if (rxqs_n == 0) - return 0; - intr_handle->intr_vec = (int *) - rte_malloc("intr_vec", rxqs_n * sizeof(int), 0); - if (intr_handle->intr_vec == NULL) { - WARN("Failed to allocate memory for intr_vec " - "rxq interrupt will not be supported"); - return -ENOMEM; - } - for (i = 0; i != rxqs_n; ++i) { - /* 1:1 mapping between rxq and interrupt. */ - intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i; + if (!rxq || !rxq_ctrl->channel) { + ret = EINVAL; + } else { + ibv_mlx5_exp_update_cq_ci(rxq_ctrl->cq, rxq->cq_ci); + ret = ibv_req_notify_cq(rxq_ctrl->cq, 0); } - return 0; + if (ret) + WARN("unable to arm interrupt on rx queue %d", rx_queue_id); + return -ret; } /** - * Destroy init interrupt vector array. + * DPDK callback for Rx queue interrupt disable. * - * @param priv - * Private structure. + * @param dev + * Pointer to Ethernet device structure. + * @param rx_queue_id + * Rx queue number. * * @return * 0 on success, negative on failure. */ -void -priv_destroy_intr_vec(struct priv *priv) +int +mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) { - struct rte_intr_handle *intr_handle = priv->dev->intr_handle; + struct priv *priv = mlx5_get_priv(dev); + struct rxq *rxq = (*priv->rxqs)[rx_queue_id]; + struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); + struct ibv_cq *ev_cq; + void *ev_ctx; + int ret; - rte_free(intr_handle->intr_vec); + if (!rxq || !rxq_ctrl->channel) { + ret = EINVAL; + } else { + ret = ibv_get_cq_event(rxq_ctrl->cq->channel, &ev_cq, &ev_ctx); + if (ret || ev_cq != rxq_ctrl->cq) + ret = EINVAL; + } + if (ret) + WARN("unable to disable interrupt on rx queue %d", + rx_queue_id); + else + ibv_ack_cq_events(rxq_ctrl->cq, 1); + return -ret; } + +#endif /* HAVE_UPDATE_CQ_CI */ diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index de6e0fa4..b07bcd11 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -69,129 +69,131 @@ #include "mlx5_defs.h" #include "mlx5_prm.h" -static inline int -check_cqe(volatile struct mlx5_cqe *cqe, - unsigned int cqes_n, const uint16_t ci) - __attribute__((always_inline)); - -static inline void -txq_complete(struct txq *txq) __attribute__((always_inline)); +static __rte_always_inline uint32_t +rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe); -static inline uint32_t -txq_mp2mr(struct txq *txq, struct rte_mempool *mp) - __attribute__((always_inline)); - -static inline void -mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) - __attribute__((always_inline)); - -static inline uint32_t -rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) - __attribute__((always_inline)); - -static inline int +static __rte_always_inline int mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, - uint16_t cqe_cnt, uint32_t *rss_hash) - __attribute__((always_inline)); + uint16_t cqe_cnt, uint32_t *rss_hash); -static inline uint32_t -rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe) - __attribute__((always_inline)); +static __rte_always_inline uint32_t +rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe); -#ifndef NDEBUG +uint32_t 
mlx5_ptype_table[] __rte_cache_aligned = { + [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */ +}; /** - * Verify or set magic value in CQE. - * - * @param cqe - * Pointer to CQE. + * Build a table to translate Rx completion flags to packet type. * - * @return - * 0 the first time. + * @note: fix mlx5_dev_supported_ptypes_get() if any change here. */ -static inline int -check_cqe_seen(volatile struct mlx5_cqe *cqe) +void +mlx5_set_ptype_table(void) { - static const uint8_t magic[] = "seen"; - volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0; - int ret = 1; unsigned int i; + uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table; - for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i) - if (!ret || (*buf)[i] != magic[i]) { - ret = 0; - (*buf)[i] = magic[i]; - } - return ret; -} - -#endif /* NDEBUG */ - -/** - * Check whether CQE is valid. - * - * @param cqe - * Pointer to CQE. - * @param cqes_n - * Size of completion queue. - * @param ci - * Consumer index. - * - * @return - * 0 on success, 1 on failure. - */ -static inline int -check_cqe(volatile struct mlx5_cqe *cqe, - unsigned int cqes_n, const uint16_t ci) -{ - uint16_t idx = ci & cqes_n; - uint8_t op_own = cqe->op_own; - uint8_t op_owner = MLX5_CQE_OWNER(op_own); - uint8_t op_code = MLX5_CQE_OPCODE(op_own); - - if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID))) - return 1; /* No CQE. */ -#ifndef NDEBUG - if ((op_code == MLX5_CQE_RESP_ERR) || - (op_code == MLX5_CQE_REQ_ERR)) { - volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe; - uint8_t syndrome = err_cqe->syndrome; - - if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) || - (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) - return 0; - if (!check_cqe_seen(cqe)) - ERROR("unexpected CQE error %u (0x%02x)" - " syndrome 0x%02x", - op_code, op_code, syndrome); - return 1; - } else if ((op_code != MLX5_CQE_RESP_SEND) && - (op_code != MLX5_CQE_REQ)) { - if (!check_cqe_seen(cqe)) - ERROR("unexpected CQE opcode %u (0x%02x)", - op_code, op_code); - return 1; - } -#endif /* NDEBUG */ - return 0; -} - -/** - * Return the address of the WQE. - * - * @param txq - * Pointer to TX queue structure. - * @param wqe_ci - * WQE consumer index. - * - * @return - * WQE address. - */ -static inline uintptr_t * -tx_mlx5_wqe(struct txq *txq, uint16_t ci) -{ - ci &= ((1 << txq->wqe_n) - 1); - return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE); + /* Last entry must not be overwritten, reserved for errored packet. 
*/ + for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i) + (*p)[i] = RTE_PTYPE_UNKNOWN; + /* + * The index to the array should have: + * bit[1:0] = l3_hdr_type + * bit[4:2] = l4_hdr_type + * bit[5] = ip_frag + * bit[6] = tunneled + * bit[7] = outer_l3_type + */ + /* L3 */ + (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + /* Fragmented */ + (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + /* TCP */ + (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + /* UDP */ + (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + /* Repeat with outer_l3_type being set. Just in case. */ + (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_NONFRAG; + (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG; + (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + /* Tunneled - L3 */ + (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_NONFRAG; + /* Tunneled - Fragmented */ + (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG; + /* Tunneled - TCP */ + (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP; + /* Tunneled - UDP */ + (*p)[0x49] = RTE_PTYPE_L2_ETHER | 
RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; + (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP; } /** @@ -250,156 +252,6 @@ mlx5_copy_to_wq(void *dst, const void *src, size_t n, return ret; } -/** - * Manage TX completions. - * - * When sending a burst, mlx5_tx_burst() posts several WRs. - * - * @param txq - * Pointer to TX queue structure. - */ -static inline void -txq_complete(struct txq *txq) -{ - const unsigned int elts_n = 1 << txq->elts_n; - const unsigned int cqe_n = 1 << txq->cqe_n; - const unsigned int cqe_cnt = cqe_n - 1; - uint16_t elts_free = txq->elts_tail; - uint16_t elts_tail; - uint16_t cq_ci = txq->cq_ci; - volatile struct mlx5_cqe *cqe = NULL; - volatile struct mlx5_wqe_ctrl *ctrl; - - do { - volatile struct mlx5_cqe *tmp; - - tmp = &(*txq->cqes)[cq_ci & cqe_cnt]; - if (check_cqe(tmp, cqe_n, cq_ci)) - break; - cqe = tmp; -#ifndef NDEBUG - if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) { - if (!check_cqe_seen(cqe)) - ERROR("unexpected compressed CQE, TX stopped"); - return; - } - if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || - (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { - if (!check_cqe_seen(cqe)) - ERROR("unexpected error CQE, TX stopped"); - return; - } -#endif /* NDEBUG */ - ++cq_ci; - } while (1); - if (unlikely(cqe == NULL)) - return; - txq->wqe_pi = ntohs(cqe->wqe_counter); - ctrl = (volatile struct mlx5_wqe_ctrl *) - tx_mlx5_wqe(txq, txq->wqe_pi); - elts_tail = ctrl->ctrl3; - assert(elts_tail < (1 << txq->wqe_n)); - /* Free buffers. */ - while (elts_free != elts_tail) { - struct rte_mbuf *elt = (*txq->elts)[elts_free]; - unsigned int elts_free_next = - (elts_free + 1) & (elts_n - 1); - struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next]; - -#ifndef NDEBUG - /* Poisoning. */ - memset(&(*txq->elts)[elts_free], - 0x66, - sizeof((*txq->elts)[elts_free])); -#endif - RTE_MBUF_PREFETCH_TO_FREE(elt_next); - /* Only one segment needs to be freed. */ - rte_pktmbuf_free_seg(elt); - elts_free = elts_free_next; - } - txq->cq_ci = cq_ci; - txq->elts_tail = elts_tail; - /* Update the consumer index. */ - rte_wmb(); - *txq->cq_db = htonl(cq_ci); -} - -/** - * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which - * the cloned mbuf is allocated is returned instead. - * - * @param buf - * Pointer to mbuf. - * - * @return - * Memory pool where data is located for given mbuf. - */ -static struct rte_mempool * -txq_mb2mp(struct rte_mbuf *buf) -{ - if (unlikely(RTE_MBUF_INDIRECT(buf))) - return rte_mbuf_from_indirect(buf)->pool; - return buf->pool; -} - -/** - * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, - * remove an entry first. - * - * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. - * - * @return - * mr->lkey on success, (uint32_t)-1 on failure. 
- */ -static inline uint32_t -txq_mp2mr(struct txq *txq, struct rte_mempool *mp) -{ - unsigned int i; - uint32_t lkey = (uint32_t)-1; - - for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { - if (unlikely(txq->mp2mr[i].mp == NULL)) { - /* Unknown MP, add a new MR for it. */ - break; - } - if (txq->mp2mr[i].mp == mp) { - assert(txq->mp2mr[i].lkey != (uint32_t)-1); - assert(htonl(txq->mp2mr[i].mr->lkey) == - txq->mp2mr[i].lkey); - lkey = txq->mp2mr[i].lkey; - break; - } - } - if (unlikely(lkey == (uint32_t)-1)) - lkey = txq_mp2mr_reg(txq, mp, i); - return lkey; -} - -/** - * Ring TX queue doorbell. - * - * @param txq - * Pointer to TX queue structure. - * @param wqe - * Pointer to the last WQE posted in the NIC. - */ -static inline void -mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) -{ - uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg); - volatile uint64_t *src = ((volatile uint64_t *)wqe); - - rte_wmb(); - *txq->qp_db = htonl(txq->wqe_ci); - /* Ensure ordering between DB record and BF copy. */ - rte_wmb(); - *dst = *src; -} - /** * DPDK callback to check the status of a tx descriptor. * @@ -415,12 +267,10 @@ int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) { struct txq *txq = tx_queue; - const unsigned int elts_n = 1 << txq->elts_n; - const unsigned int elts_cnt = elts_n - 1; - unsigned int used; + uint16_t used; - txq_complete(txq); - used = (txq->elts_head - txq->elts_tail) & elts_cnt; + mlx5_tx_complete(txq); + used = txq->elts_head - txq->elts_tail; if (offset < used) return RTE_ETH_TX_DESC_FULL; return RTE_ETH_TX_DESC_DONE; @@ -494,11 +344,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = (struct txq *)dpdk_txq; uint16_t elts_head = txq->elts_head; - const unsigned int elts_n = 1 << txq->elts_n; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; unsigned int i = 0; unsigned int j = 0; unsigned int k = 0; - unsigned int max; + uint16_t max_elts; unsigned int max_inline = txq->max_inline; const unsigned int inline_en = !!max_inline && txq->inline_en; uint16_t max_wqe; @@ -514,10 +365,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /* Prefetch first packet cacheline. */ rte_prefetch0(*pkts); /* Start processing. */ - txq_complete(txq); - max = (elts_n - (elts_head - txq->elts_tail)); - if (max > elts_n) - max -= elts_n; + mlx5_tx_complete(txq); + max_elts = (elts_n - (elts_head - txq->elts_tail)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); if (unlikely(!max_wqe)) return 0; @@ -533,6 +382,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) uint16_t ehdr; uint8_t cs_flags = 0; uint64_t tso = 0; + uint16_t tso_segsz = 0; #ifdef MLX5_PMD_SOFT_COUNTERS uint32_t total_length = 0; #endif @@ -545,9 +395,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * that one ring entry remains unused. */ assert(segs_n); - if (max < segs_n + 1) + if (max_elts < segs_n) break; - max -= segs_n; + max_elts -= segs_n; --segs_n; if (unlikely(--max_wqe == 0)) break; @@ -566,7 +416,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (length < (MLX5_WQE_DWORD_SIZE + 2)) break; /* Update element. */ - (*txq->elts)[elts_head] = buf; + (*txq->elts)[elts_head & elts_m] = buf; /* Prefetch next buffer data. 
*/ if (pkts_n - i > 1) rte_prefetch0( @@ -628,6 +478,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len; + tso_segsz = buf->tso_segsz; if (is_tunneled && txq->tunnel_en) { tso_header_sz += buf->outer_l2_len + @@ -762,7 +613,7 @@ use_dseg: naddr = htonll(addr); *dseg = (rte_v128u32_t){ htonl(length), - txq_mp2mr(txq, txq_mb2mp(buf)), + mlx5_tx_mb2mr(txq, buf), naddr, naddr >> 32, }; @@ -801,12 +652,11 @@ next_seg: naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); *dseg = (rte_v128u32_t){ htonl(length), - txq_mp2mr(txq, txq_mb2mp(buf)), + mlx5_tx_mb2mr(txq, buf), naddr, naddr >> 32, }; - elts_head = (elts_head + 1) & (elts_n - 1); - (*txq->elts)[elts_head] = buf; + (*txq->elts)[++elts_head & elts_m] = buf; ++sg; /* Advance counter only if all segs are successfully posted. */ if (sg < segs_n) @@ -814,7 +664,7 @@ next_seg: else j += sg; next_pkt: - elts_head = (elts_head + 1) & (elts_n - 1); + ++elts_head; ++pkts; ++i; /* Initialize known and common part of the WQE structure. */ @@ -827,7 +677,7 @@ next_pkt: }; wqe->eseg = (rte_v128u32_t){ 0, - cs_flags | (htons(buf->tso_segsz) << 16), + cs_flags | (htons(tso_segsz) << 16), 0, (ehdr << 16) | htons(tso_header_sz), }; @@ -857,7 +707,7 @@ next_wqe: /* Take a shortcut if nothing must be sent. */ if (unlikely((i + k) == 0)) return 0; - txq->elts_head = (txq->elts_head + i + j) & (elts_n - 1); + txq->elts_head += (i + j); /* Check whether completion threshold has been reached. */ comp = txq->elts_comp + i + j + k; if (comp >= MLX5_TX_COMP_THRESH) { @@ -964,10 +814,11 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = (struct txq *)dpdk_txq; uint16_t elts_head = txq->elts_head; - const unsigned int elts_n = 1 << txq->elts_n; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; unsigned int i = 0; unsigned int j = 0; - unsigned int max; + uint16_t max_elts; uint16_t max_wqe; unsigned int comp; struct mlx5_mpw mpw = { @@ -980,16 +831,13 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); /* Start processing. */ - txq_complete(txq); - max = (elts_n - (elts_head - txq->elts_tail)); - if (max > elts_n) - max -= elts_n; + mlx5_tx_complete(txq); + max_elts = (elts_n - (elts_head - txq->elts_tail)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); if (unlikely(!max_wqe)) return 0; do { struct rte_mbuf *buf = *(pkts++); - unsigned int elts_head_next; uint32_t length; unsigned int segs_n = buf->nb_segs; uint32_t cs_flags = 0; @@ -999,12 +847,12 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * that one ring entry remains unused. */ assert(segs_n); - if (max < segs_n + 1) + if (max_elts < segs_n) break; /* Do not bother with large packets MPW cannot handle. 
*/ if (segs_n > MLX5_MPW_DSEG_MAX) break; - max -= segs_n; + max_elts -= segs_n; --pkts_n; /* Should we enable HW CKSUM offload */ if (buf->ol_flags & @@ -1040,17 +888,15 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) volatile struct mlx5_wqe_data_seg *dseg; uintptr_t addr; - elts_head_next = (elts_head + 1) & (elts_n - 1); assert(buf); - (*txq->elts)[elts_head] = buf; + (*txq->elts)[elts_head++ & elts_m] = buf; dseg = mpw.data.dseg[mpw.pkts_n]; addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), + .lkey = mlx5_tx_mb2mr(txq, buf), .addr = htonll(addr), }; - elts_head = elts_head_next; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) length += DATA_LEN(buf); #endif @@ -1061,7 +907,6 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) assert(length == mpw.len); if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) mlx5_mpw_close(txq, &mpw); - elts_head = elts_head_next; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent bytes counter. */ txq->stats.obytes += length; @@ -1179,10 +1024,11 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, { struct txq *txq = (struct txq *)dpdk_txq; uint16_t elts_head = txq->elts_head; - const unsigned int elts_n = 1 << txq->elts_n; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; unsigned int i = 0; unsigned int j = 0; - unsigned int max; + uint16_t max_elts; uint16_t max_wqe; unsigned int comp; unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE; @@ -1208,13 +1054,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); /* Start processing. */ - txq_complete(txq); - max = (elts_n - (elts_head - txq->elts_tail)); - if (max > elts_n) - max -= elts_n; + mlx5_tx_complete(txq); + max_elts = (elts_n - (elts_head - txq->elts_tail)); do { struct rte_mbuf *buf = *(pkts++); - unsigned int elts_head_next; uintptr_t addr; uint32_t length; unsigned int segs_n = buf->nb_segs; @@ -1225,12 +1068,12 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, * that one ring entry remains unused. */ assert(segs_n); - if (max < segs_n + 1) + if (max_elts < segs_n) break; /* Do not bother with large packets MPW cannot handle. 
*/ if (segs_n > MLX5_MPW_DSEG_MAX) break; - max -= segs_n; + max_elts -= segs_n; --pkts_n; /* * Compute max_wqe in case less WQE were consumed in previous @@ -1291,18 +1134,15 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, do { volatile struct mlx5_wqe_data_seg *dseg; - elts_head_next = - (elts_head + 1) & (elts_n - 1); assert(buf); - (*txq->elts)[elts_head] = buf; + (*txq->elts)[elts_head++ & elts_m] = buf; dseg = mpw.data.dseg[mpw.pkts_n]; addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), + .lkey = mlx5_tx_mb2mr(txq, buf), .addr = htonll(addr), }; - elts_head = elts_head_next; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) length += DATA_LEN(buf); #endif @@ -1319,9 +1159,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, assert(mpw.state == MLX5_MPW_INL_STATE_OPENED); assert(length <= inline_room); assert(length == DATA_LEN(buf)); - elts_head_next = (elts_head + 1) & (elts_n - 1); addr = rte_pktmbuf_mtod(buf, uintptr_t); - (*txq->elts)[elts_head] = buf; + (*txq->elts)[elts_head++ & elts_m] = buf; /* Maximum number of bytes before wrapping. */ max = ((((uintptr_t)(txq->wqes)) + (1 << txq->wqe_n) * @@ -1358,7 +1197,6 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, inline_room -= length; } } - elts_head = elts_head_next; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent bytes counter. */ txq->stats.obytes += length; @@ -1480,10 +1318,11 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = (struct txq *)dpdk_txq; uint16_t elts_head = txq->elts_head; - const unsigned int elts_n = 1 << txq->elts_n; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; unsigned int i = 0; unsigned int j = 0; - unsigned int max_elts; + uint16_t max_elts; uint16_t max_wqe; unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE; unsigned int mpw_room = 0; @@ -1496,10 +1335,8 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (unlikely(!pkts_n)) return 0; /* Start processing. */ - txq_complete(txq); + mlx5_tx_complete(txq); max_elts = (elts_n - (elts_head - txq->elts_tail)); - if (max_elts > elts_n) - max_elts -= elts_n; /* A CQE slot must always be available. */ assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci)); max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); @@ -1507,7 +1344,6 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) return 0; do { struct rte_mbuf *buf = *(pkts++); - unsigned int elts_head_next; uintptr_t addr; uint64_t naddr; unsigned int n; @@ -1521,7 +1357,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * that one ring entry remains unused. */ assert(segs_n); - if (max_elts - j < segs_n + 1) + if (max_elts - j < segs_n) break; /* Do not bother with large packets MPW cannot handle. 
*/ if (segs_n > MLX5_MPW_DSEG_MAX) @@ -1605,18 +1441,15 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) do { volatile struct mlx5_wqe_data_seg *dseg; - elts_head_next = - (elts_head + 1) & (elts_n - 1); assert(buf); - (*txq->elts)[elts_head] = buf; + (*txq->elts)[elts_head++ & elts_m] = buf; dseg = mpw.data.dseg[mpw.pkts_n]; addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), + .lkey = mlx5_tx_mb2mr(txq, buf), .addr = htonll(addr), }; - elts_head = elts_head_next; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) length += DATA_LEN(buf); #endif @@ -1667,7 +1500,6 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /* No need to get completion as the entire packet is * copied to WQ. Free the buf right away. */ - elts_head_next = elts_head; rte_pktmbuf_free_seg(buf); mpw_room -= (inl_pad + sizeof(inl_hdr) + length); /* Add pad in the next packet if any. */ @@ -1690,8 +1522,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) dseg = (volatile void *) ((uintptr_t)mpw.data.raw + inl_pad); - elts_head_next = (elts_head + 1) & (elts_n - 1); - (*txq->elts)[elts_head] = buf; + (*txq->elts)[elts_head++ & elts_m] = buf; addr = rte_pktmbuf_mtod(buf, uintptr_t); for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++) rte_prefetch2((void *)(addr + @@ -1699,7 +1530,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) naddr = htonll(addr); *dseg = (rte_v128u32_t) { htonl(length), - txq_mp2mr(txq, txq_mb2mp(buf)), + mlx5_tx_mb2mr(txq, buf), naddr, naddr >> 32, }; @@ -1710,7 +1541,6 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) mpw_room -= (inl_pad + sizeof(*dseg)); inl_pad = 0; } - elts_head = elts_head_next; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent bytes counter. */ txq->stats.obytes += length; @@ -1764,30 +1594,20 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) static inline uint32_t rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe) { - uint32_t pkt_type; - uint16_t flags = ntohs(cqe->hdr_type_etc); + uint8_t idx; + uint8_t pinfo = cqe->pkt_info; + uint16_t ptype = cqe->hdr_type_etc; - if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) { - pkt_type = - TRANSPOSE(flags, - MLX5_CQE_RX_IPV4_PACKET, - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) | - TRANSPOSE(flags, - MLX5_CQE_RX_IPV6_PACKET, - RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN); - pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ? 
- RTE_PTYPE_L3_IPV6_EXT_UNKNOWN : - RTE_PTYPE_L3_IPV4_EXT_UNKNOWN); - } else { - pkt_type = - TRANSPOSE(flags, - MLX5_CQE_L3_HDR_TYPE_IPV6, - RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) | - TRANSPOSE(flags, - MLX5_CQE_L3_HDR_TYPE_IPV4, - RTE_PTYPE_L3_IPV4_EXT_UNKNOWN); - } - return pkt_type; + /* + * The index to the array should have: + * bit[1:0] = l3_hdr_type + * bit[4:2] = l4_hdr_type + * bit[5] = ip_frag + * bit[6] = tunneled + * bit[7] = outer_l3_type + */ + idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); + return mlx5_ptype_table[idx]; } /** @@ -1819,7 +1639,7 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, if (zip->ai) { volatile struct mlx5_mini_cqe8 (*mc)[8] = (volatile struct mlx5_mini_cqe8 (*)[8]) - (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]); + (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info); len = ntohl((*mc)[zip->ai & 7].byte_cnt); *rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result); @@ -1867,7 +1687,7 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe, volatile struct mlx5_mini_cqe8 (*mc)[8] = (volatile struct mlx5_mini_cqe8 (*)[8]) (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci & - cqe_cnt]); + cqe_cnt].pkt_info); /* Fix endianness. */ zip->cqe_cnt = ntohl(cqe->byte_cnt); @@ -2018,7 +1838,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) pkt = seg; assert(len >= (rxq->crc_present << 2)); /* Update packet information. */ - pkt->packet_type = 0; + pkt->packet_type = rxq_cq_to_pkt_type(cqe); pkt->ol_flags = 0; if (rss_hash_res && rxq->rss_hash) { pkt->hash.rss = rss_hash_res; @@ -2036,10 +1856,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) mlx5_flow_mark_get(mark); } } - if (rxq->csum | rxq->csum_l2tun) { - pkt->packet_type = rxq_cq_to_pkt_type(cqe); + if (rxq->csum | rxq->csum_l2tun) pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe); - } if (rxq->vlan_strip && (cqe->hdr_type_etc & htons(MLX5_CQE_VLAN_STRIPPED))) { @@ -2054,9 +1872,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) DATA_LEN(rep) = DATA_LEN(seg); PKT_LEN(rep) = PKT_LEN(seg); SET_DATA_OFF(rep, DATA_OFF(seg)); - NB_SEGS(rep) = NB_SEGS(seg); PORT(rep) = PORT(seg); - NEXT(rep) = NULL; (*rxq->elts)[idx] = rep; /* * Fill NIC descriptor with the new buffer. The lkey and size @@ -2151,75 +1967,70 @@ removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) return 0; } -/** - * DPDK callback for rx queue interrupt enable. - * - * @param dev - * Pointer to Ethernet device structure. - * @param rx_queue_id - * RX queue number - * - * @return - * 0 on success, negative on failure. +/* + * Vectorized Rx/Tx routines are not compiled in when required vector + * instructions are not supported on a target architecture. The following null + * stubs are needed for linkage when those are not included outside of this file + * (e.g. mlx5_rxtx_vec_sse.c for x86). 
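For illustration, the table-driven conversion introduced in rxq_cq_to_pkt_type() above trades the per-flag TRANSPOSE() logic for one load from the 256-entry mlx5_ptype_table[], which mlx5_set_ptype_table() fills once at PMD init. The minimal sketch below is not part of the patch; the bit encoding and the two sample entries are invented placeholders, not the driver's real table contents.

#include <stdint.h>
#include <stdio.h>

/* 256-entry table: one packet-type word per possible 8-bit CQE summary. */
static uint32_t ptype_table[256];

#define PTYPE_UNKNOWN  0u
#define PTYPE_IPV4_TCP 1u /* placeholder values, not DPDK RTE_PTYPE_* */
#define PTYPE_IPV6_TCP 2u

static void
set_ptype_table(void)
{
        unsigned int i;

        for (i = 0; i < 256; ++i)
                ptype_table[i] = PTYPE_UNKNOWN;
        /* Hypothetical encodings: bit[0] = IPv6, bits[3:1] = L4 type. */
        ptype_table[0x02] = PTYPE_IPV4_TCP;
        ptype_table[0x03] = PTYPE_IPV6_TCP;
}

/* Convert a packed descriptor summary to a packet type in O(1). */
static uint32_t
cq_to_pkt_type(uint8_t summary)
{
        return ptype_table[summary];
}

int
main(void)
{
        set_ptype_table();
        printf("0x02 -> %u, 0x7f -> %u\n",
               cq_to_pkt_type(0x02), cq_to_pkt_type(0x7f));
        return 0;
}

The point of the pattern is that the per-packet cost collapses to a shift/mask plus one cache-friendly table load, whatever combination of L3/L4/tunnel bits the CQE reports.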
*/ -int -mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id) + +uint16_t __attribute__((weak)) +mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { -#ifdef HAVE_UPDATE_CQ_CI - struct priv *priv = mlx5_get_priv(dev); - struct rxq *rxq = (*priv->rxqs)[rx_queue_id]; - struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); - struct ibv_cq *cq = rxq_ctrl->cq; - uint16_t ci = rxq->cq_ci; - int ret = 0; - - ibv_mlx5_exp_update_cq_ci(cq, ci); - ret = ibv_req_notify_cq(cq, 0); -#else - int ret = -1; - (void)dev; - (void)rx_queue_id; -#endif - if (ret) - WARN("unable to arm interrupt on rx queue %d", rx_queue_id); - return ret; + (void)dpdk_txq; + (void)pkts; + (void)pkts_n; + return 0; } -/** - * DPDK callback for rx queue interrupt disable. - * - * @param dev - * Pointer to Ethernet device structure. - * @param rx_queue_id - * RX queue number - * - * @return - * 0 on success, negative on failure. - */ -int -mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id) +uint16_t __attribute__((weak)) +mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { -#ifdef HAVE_UPDATE_CQ_CI - struct priv *priv = mlx5_get_priv(dev); - struct rxq *rxq = (*priv->rxqs)[rx_queue_id]; - struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq); - struct ibv_cq *cq = rxq_ctrl->cq; - struct ibv_cq *ev_cq; - void *ev_ctx; - int ret = 0; - - ret = ibv_get_cq_event(cq->channel, &ev_cq, &ev_ctx); - if (ret || ev_cq != cq) - ret = -1; - else - ibv_ack_cq_events(cq, 1); -#else - int ret = -1; - (void)dev; - (void)rx_queue_id; -#endif - if (ret) - WARN("unable to disable interrupt on rx queue %d", - rx_queue_id); - return ret; + (void)dpdk_txq; + (void)pkts; + (void)pkts_n; + return 0; +} + +uint16_t __attribute__((weak)) +mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + (void)dpdk_rxq; + (void)pkts; + (void)pkts_n; + return 0; +} + +int __attribute__((weak)) +priv_check_raw_vec_tx_support(struct priv *priv) +{ + (void)priv; + return -ENOTSUP; +} + +int __attribute__((weak)) +priv_check_vec_tx_support(struct priv *priv) +{ + (void)priv; + return -ENOTSUP; +} + +int __attribute__((weak)) +rxq_check_vec_support(struct rxq *rxq) +{ + (void)rxq; + return -ENOTSUP; +} + +int __attribute__((weak)) +priv_check_vec_rx_support(struct priv *priv) +{ + (void)priv; + return -ENOTSUP; +} + +void __attribute__((weak)) +priv_prep_vec_rx_function(struct priv *priv) +{ + (void)priv; } diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index 8db8eb14..7de1d108 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -115,10 +115,13 @@ struct rxq { unsigned int port_id:8; unsigned int rss_hash:1; /* RSS hash result is enabled. */ unsigned int mark:1; /* Marked flow available on the queue. */ - unsigned int :8; /* Remaining bits. */ + unsigned int pending_err:1; /* CQE error needs to be handled. */ + unsigned int trim_elts:1; /* Whether elts needs clean-up. */ + unsigned int :6; /* Remaining bits. */ volatile uint32_t *rq_db; volatile uint32_t *cq_db; uint16_t rq_ci; + uint16_t rq_pi; uint16_t cq_ci; volatile struct mlx5_wqe_data_seg(*wqes)[]; volatile struct mlx5_cqe(*cqes)[]; @@ -126,6 +129,8 @@ struct rxq { struct rte_mbuf *(*elts)[]; struct rte_mempool *mp; struct mlx5_rxq_stats stats; + uint64_t mbuf_initializer; /* Default rearm_data for vectorized Rx. */ + struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. 
*/ } __rte_cache_aligned; /* RX queue control descriptor. */ @@ -240,10 +245,10 @@ struct hash_rxq { }; /* TX queue descriptor. */ -RTE_STD_C11 +__extension__ struct txq { - uint16_t elts_head; /* Current index in (*elts)[]. */ - uint16_t elts_tail; /* First element awaiting completion. */ + uint16_t elts_head; /* Current counter in (*elts)[]. */ + uint16_t elts_tail; /* Counter of first element awaiting completion. */ uint16_t elts_comp; /* Counter since last completion request. */ uint16_t mpw_comp; /* WQ index since last completion request. */ uint16_t cq_ci; /* Consumer index for completion queue. */ @@ -261,16 +266,19 @@ struct txq { uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ uint16_t inline_max_packet_sz; /* Max packet size for inlining. */ uint32_t qp_num_8s; /* QP number shifted by 8. */ + uint32_t flags; /* Flags for Tx Queue. */ volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ volatile void *wqes; /* Work queue (use volatile to write into). */ volatile uint32_t *qp_db; /* Work queue doorbell. */ volatile uint32_t *cq_db; /* Completion queue doorbell. */ volatile void *bf_reg; /* Blueflame register. */ struct { - const struct rte_mempool *mp; /* Cached Memory Pool. */ + uintptr_t start; /* Start address of MR */ + uintptr_t end; /* End address of MR */ struct ibv_mr *mr; /* Memory Region (for mp). */ uint32_t lkey; /* htonl(mr->lkey) */ } mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ + uint16_t mr_cache_idx; /* Index of last hit entry. */ struct rte_mbuf *(*elts)[]; /* TX elements. */ struct mlx5_txq_stats stats; /* TX queue counters. */ } __rte_cache_aligned; @@ -298,19 +306,17 @@ int priv_create_hash_rxqs(struct priv *); void priv_destroy_hash_rxqs(struct priv *); int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type); int priv_rehash_flows(struct priv *); -int priv_intr_efd_enable(struct priv *priv); -void priv_intr_efd_disable(struct priv *priv); -int priv_create_intr_vec(struct priv *priv); -void priv_destroy_intr_vec(struct priv *priv); void rxq_cleanup(struct rxq_ctrl *); -int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *); -int rxq_ctrl_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, - unsigned int, const struct rte_eth_rxconf *, - struct rte_mempool *); int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, const struct rte_eth_rxconf *, struct rte_mempool *); void mlx5_rx_queue_release(void *); uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t); +int priv_rx_intr_vec_enable(struct priv *priv); +void priv_rx_intr_vec_disable(struct priv *priv); +#ifdef HAVE_UPDATE_CQ_CI +int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id); +int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id); +#endif /* HAVE_UPDATE_CQ_CI */ /* mlx5_txq.c */ @@ -324,6 +330,9 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t); /* mlx5_rxtx.c */ +extern uint32_t mlx5_ptype_table[]; + +void mlx5_set_ptype_table(void); uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t); uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t); uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t); @@ -333,8 +342,16 @@ uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t); uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); int mlx5_rx_descriptor_status(void *, uint16_t); int mlx5_tx_descriptor_status(void *, uint16_t); -int mlx5_rx_intr_enable(struct rte_eth_dev *dev, 
uint16_t rx_queue_id); -int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id); + +/* Vectorized version of mlx5_rxtx.c */ +int priv_check_raw_vec_tx_support(struct priv *); +int priv_check_vec_tx_support(struct priv *); +int rxq_check_vec_support(struct rxq *); +int priv_check_vec_rx_support(struct priv *); +void priv_prep_vec_rx_function(struct priv *); +uint16_t mlx5_tx_burst_raw_vec(void *, struct rte_mbuf **, uint16_t); +uint16_t mlx5_tx_burst_vec(void *, struct rte_mbuf **, uint16_t); +uint16_t mlx5_rx_burst_vec(void *, struct rte_mbuf **, uint16_t); /* mlx5_mr.c */ @@ -342,4 +359,254 @@ struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *); void txq_mp2mr_iter(struct rte_mempool *, void *); uint32_t txq_mp2mr_reg(struct txq *, struct rte_mempool *, unsigned int); +#ifndef NDEBUG +/** + * Verify or set magic value in CQE. + * + * @param cqe + * Pointer to CQE. + * + * @return + * 0 the first time. + */ +static inline int +check_cqe_seen(volatile struct mlx5_cqe *cqe) +{ + static const uint8_t magic[] = "seen"; + volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0; + int ret = 1; + unsigned int i; + + for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i) + if (!ret || (*buf)[i] != magic[i]) { + ret = 0; + (*buf)[i] = magic[i]; + } + return ret; +} +#endif /* NDEBUG */ + +/** + * Check whether CQE is valid. + * + * @param cqe + * Pointer to CQE. + * @param cqes_n + * Size of completion queue. + * @param ci + * Consumer index. + * + * @return + * 0 on success, 1 on failure. + */ +static __rte_always_inline int +check_cqe(volatile struct mlx5_cqe *cqe, + unsigned int cqes_n, const uint16_t ci) +{ + uint16_t idx = ci & cqes_n; + uint8_t op_own = cqe->op_own; + uint8_t op_owner = MLX5_CQE_OWNER(op_own); + uint8_t op_code = MLX5_CQE_OPCODE(op_own); + + if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID))) + return 1; /* No CQE. */ +#ifndef NDEBUG + if ((op_code == MLX5_CQE_RESP_ERR) || + (op_code == MLX5_CQE_REQ_ERR)) { + volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe; + uint8_t syndrome = err_cqe->syndrome; + + if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) || + (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR)) + return 0; + if (!check_cqe_seen(cqe)) + ERROR("unexpected CQE error %u (0x%02x)" + " syndrome 0x%02x", + op_code, op_code, syndrome); + return 1; + } else if ((op_code != MLX5_CQE_RESP_SEND) && + (op_code != MLX5_CQE_REQ)) { + if (!check_cqe_seen(cqe)) + ERROR("unexpected CQE opcode %u (0x%02x)", + op_code, op_code); + return 1; + } +#endif /* NDEBUG */ + return 0; +} + +/** + * Return the address of the WQE. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe_ci + * WQE consumer index. + * + * @return + * WQE address. + */ +static inline uintptr_t * +tx_mlx5_wqe(struct txq *txq, uint16_t ci) +{ + ci &= ((1 << txq->wqe_n) - 1); + return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE); +} + +/** + * Manage TX completions. + * + * When sending a burst, mlx5_tx_burst() posts several WRs. + * + * @param txq + * Pointer to TX queue structure. 
+ */ +static __rte_always_inline void +mlx5_tx_complete(struct txq *txq) +{ + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; + const unsigned int cqe_n = 1 << txq->cqe_n; + const unsigned int cqe_cnt = cqe_n - 1; + uint16_t elts_free = txq->elts_tail; + uint16_t elts_tail; + uint16_t cq_ci = txq->cq_ci; + volatile struct mlx5_cqe *cqe = NULL; + volatile struct mlx5_wqe_ctrl *ctrl; + struct rte_mbuf *m, *free[elts_n]; + struct rte_mempool *pool = NULL; + unsigned int blk_n = 0; + + cqe = &(*txq->cqes)[cq_ci & cqe_cnt]; + if (unlikely(check_cqe(cqe, cqe_n, cq_ci))) + return; +#ifndef NDEBUG + if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) || + (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) { + if (!check_cqe_seen(cqe)) + ERROR("unexpected error CQE, TX stopped"); + return; + } +#endif /* NDEBUG */ + ++cq_ci; + txq->wqe_pi = ntohs(cqe->wqe_counter); + ctrl = (volatile struct mlx5_wqe_ctrl *) + tx_mlx5_wqe(txq, txq->wqe_pi); + elts_tail = ctrl->ctrl3; + assert((elts_tail & elts_m) < (1 << txq->wqe_n)); + /* Free buffers. */ + while (elts_free != elts_tail) { + m = rte_pktmbuf_prefree_seg((*txq->elts)[elts_free++ & elts_m]); + if (likely(m != NULL)) { + if (likely(m->pool == pool)) { + free[blk_n++] = m; + } else { + if (likely(pool != NULL)) + rte_mempool_put_bulk(pool, + (void *)free, + blk_n); + free[0] = m; + pool = m->pool; + blk_n = 1; + } + } + } + if (blk_n) + rte_mempool_put_bulk(pool, (void *)free, blk_n); +#ifndef NDEBUG + elts_free = txq->elts_tail; + /* Poisoning. */ + while (elts_free != elts_tail) { + memset(&(*txq->elts)[elts_free & elts_m], + 0x66, + sizeof((*txq->elts)[elts_free & elts_m])); + ++elts_free; + } +#endif + txq->cq_ci = cq_ci; + txq->elts_tail = elts_tail; + /* Update the consumer index. */ + rte_wmb(); + *txq->cq_db = htonl(cq_ci); +} + +/** + * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which + * the cloned mbuf is allocated is returned instead. + * + * @param buf + * Pointer to mbuf. + * + * @return + * Memory pool where data is located for given mbuf. + */ +static struct rte_mempool * +mlx5_tx_mb2mp(struct rte_mbuf *buf) +{ + if (unlikely(RTE_MBUF_INDIRECT(buf))) + return rte_mbuf_from_indirect(buf)->pool; + return buf->pool; +} + +/** + * Get Memory Region (MR) <-> rte_mbuf association from txq->mp2mr[]. + * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, + * remove an entry first. + * + * @param txq + * Pointer to TX queue structure. + * @param[in] mp + * Memory Pool for which a Memory Region lkey must be returned. + * + * @return + * mr->lkey on success, (uint32_t)-1 on failure. + */ +static __rte_always_inline uint32_t +mlx5_tx_mb2mr(struct txq *txq, struct rte_mbuf *mb) +{ + uint16_t i = txq->mr_cache_idx; + uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t); + + assert(i < RTE_DIM(txq->mp2mr)); + if (likely(txq->mp2mr[i].start <= addr && txq->mp2mr[i].end >= addr)) + return txq->mp2mr[i].lkey; + for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { + if (unlikely(txq->mp2mr[i].mr == NULL)) { + /* Unknown MP, add a new MR for it. */ + break; + } + if (txq->mp2mr[i].start <= addr && + txq->mp2mr[i].end >= addr) { + assert(txq->mp2mr[i].lkey != (uint32_t)-1); + assert(htonl(txq->mp2mr[i].mr->lkey) == + txq->mp2mr[i].lkey); + txq->mr_cache_idx = i; + return txq->mp2mr[i].lkey; + } + } + txq->mr_cache_idx = 0; + return txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i); +} + +/** + * Ring TX queue doorbell. + * + * @param txq + * Pointer to TX queue structure. 
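The mlx5_tx_mb2mr() helper above keeps mr_cache_idx as a one-entry hot cache: the common case is a single start/end range check against the last-hit slot, with a linear scan and re-registration only on a miss. Below is a self-contained sketch of that caching scheme; the types and the register_mr() slow path are simplified stand-ins invented for the example, not the driver's.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MR_CACHE_SIZE 8

struct mr_entry {
        uintptr_t start; /* first byte covered by the region */
        uintptr_t end;   /* one past the last covered byte */
        uint32_t lkey;   /* key handed to the hardware */
};

struct tx_ctx {
        struct mr_entry mr[MR_CACHE_SIZE];
        uint16_t mr_cache_idx; /* index of the last hit */
};

/* Stand-in slow path: pretend to register a 1 MiB region around addr. */
static uint32_t
register_mr(struct tx_ctx *ctx, uintptr_t addr, unsigned int slot)
{
        if (slot >= MR_CACHE_SIZE)
                slot = 0; /* cache full: recycle the first slot */
        ctx->mr[slot].start = addr & ~(uintptr_t)0xfffff;
        ctx->mr[slot].end = ctx->mr[slot].start + (1u << 20);
        ctx->mr[slot].lkey = (uint32_t)slot + 1;
        ctx->mr_cache_idx = slot;
        return ctx->mr[slot].lkey;
}

static uint32_t
addr_to_lkey(struct tx_ctx *ctx, uintptr_t addr)
{
        unsigned int i = ctx->mr_cache_idx;

        assert(i < MR_CACHE_SIZE);
        /* Fast path: the address usually falls in the last-hit region. */
        if (ctx->mr[i].start <= addr && addr < ctx->mr[i].end)
                return ctx->mr[i].lkey;
        /* Slow path: linear scan, stopping at the first empty slot. */
        for (i = 0; i < MR_CACHE_SIZE; ++i) {
                if (ctx->mr[i].end == 0)
                        break; /* unknown region, register it */
                if (ctx->mr[i].start <= addr && addr < ctx->mr[i].end) {
                        ctx->mr_cache_idx = i;
                        return ctx->mr[i].lkey;
                }
        }
        return register_mr(ctx, addr, i);
}

int
main(void)
{
        struct tx_ctx ctx = { .mr_cache_idx = 0 };

        printf("lkey=%u\n", addr_to_lkey(&ctx, 0x12345678));
        printf("lkey=%u (cached)\n", addr_to_lkey(&ctx, 0x12345000));
        return 0;
}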
+ * @param wqe + * Pointer to the last WQE posted in the NIC. + */ +static __rte_always_inline void +mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe) +{ + uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg); + volatile uint64_t *src = ((volatile uint64_t *)wqe); + + rte_wmb(); + *txq->qp_db = htonl(txq->wqe_ci); + /* Ensure ordering between DB record and BF copy. */ + rte_wmb(); + *dst = *src; +} + #endif /* RTE_PMD_MLX5_RXTX_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c new file mode 100644 index 00000000..8560f745 --- /dev/null +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c @@ -0,0 +1,1417 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include +#include +#include +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +/* DPDK headers don't like -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include +#include +#include +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include "mlx5.h" +#include "mlx5_utils.h" +#include "mlx5_rxtx.h" +#include "mlx5_autoconf.h" +#include "mlx5_defs.h" +#include "mlx5_prm.h" + +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + +/** + * Fill in buffer descriptors in a multi-packet send descriptor. + * + * @param txq + * Pointer to TX queue structure. + * @param dseg + * Pointer to buffer descriptor to be writen. + * @param pkts + * Pointer to array of packets to be sent. + * @param n + * Number of packets to be filled. 
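mlx5_tx_dbrec() above relies purely on store ordering: the WQE content must be visible before the doorbell record update, and the record before the BlueFlame/MMIO write, hence the two rte_wmb() calls. The generic sketch below mirrors that discipline with compiler/CPU fences on an invented queue layout; it is only an illustration, and real MMIO doorbells may need stronger barriers than a release fence.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical queue view: a descriptor ring, a doorbell record in memory
 * shared with the device, and a device doorbell register. */
struct queue {
        volatile uint64_t *ring;   /* descriptors the device will read */
        volatile uint32_t *db_rec; /* doorbell record (producer index) */
        volatile uint64_t *db_reg; /* MMIO doorbell register */
        uint32_t pi;               /* producer index */
};

static inline void
post_and_ring(struct queue *q, uint64_t desc)
{
        q->ring[q->pi & 0xff] = desc;
        ++q->pi;
        /* 1. Make the descriptor visible before publishing the index. */
        __atomic_thread_fence(__ATOMIC_RELEASE);
        *q->db_rec = q->pi;
        /* 2. Make the record visible before poking the device register. */
        __atomic_thread_fence(__ATOMIC_RELEASE);
        *q->db_reg = desc;
}

int
main(void)
{
        static uint64_t ring[256];
        static uint32_t rec;
        static uint64_t reg;
        struct queue q = { ring, &rec, &reg, 0 };

        post_and_ring(&q, 0xabcdULL);
        printf("db record=%u, db register=0x%llx\n",
               rec, (unsigned long long)reg);
        return 0;
}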
+ */ +static inline void +txq_wr_dseg_v(struct txq *txq, __m128i *dseg, + struct rte_mbuf **pkts, unsigned int n) +{ + unsigned int pos; + uintptr_t addr; + const __m128i shuf_mask_dseg = + _mm_set_epi8(8, 9, 10, 11, /* addr, bswap64 */ + 12, 13, 14, 15, + 7, 6, 5, 4, /* lkey */ + 0, 1, 2, 3 /* length, bswap32 */); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t tx_byte = 0; +#endif + + for (pos = 0; pos < n; ++pos, ++dseg) { + __m128i desc; + struct rte_mbuf *pkt = pkts[pos]; + + addr = rte_pktmbuf_mtod(pkt, uintptr_t); + desc = _mm_set_epi32(addr >> 32, + addr, + mlx5_tx_mb2mr(txq, pkt), + DATA_LEN(pkt)); + desc = _mm_shuffle_epi8(desc, shuf_mask_dseg); + _mm_store_si128(dseg, desc); +#ifdef MLX5_PMD_SOFT_COUNTERS + tx_byte += DATA_LEN(pkt); +#endif + } +#ifdef MLX5_PMD_SOFT_COUNTERS + txq->stats.obytes += tx_byte; +#endif +} + +/** + * Count the number of continuous single segment packets. The first packet must + * be a single segment packet. + * + * @param pkts + * Pointer to array of packets. + * @param pkts_n + * Number of packets. + * + * @return + * Number of continuous single segment packets. + */ +static inline unsigned int +txq_check_multiseg(struct rte_mbuf **pkts, uint16_t pkts_n) +{ + unsigned int pos; + + if (!pkts_n) + return 0; + assert(NB_SEGS(pkts[0]) == 1); + /* Count the number of continuous single segment packets. */ + for (pos = 1; pos < pkts_n; ++pos) + if (NB_SEGS(pkts[pos]) > 1) + break; + return pos; +} + +/** + * Count the number of packets having same ol_flags and calculate cs_flags. + * + * @param txq + * Pointer to TX queue structure. + * @param pkts + * Pointer to array of packets. + * @param pkts_n + * Number of packets. + * @param cs_flags + * Pointer of flags to be returned. + * + * @return + * Number of packets having same ol_flags. + */ +static inline unsigned int +txq_calc_offload(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n, + uint8_t *cs_flags) +{ + unsigned int pos; + const uint64_t ol_mask = + PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | + PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE | + PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM; + + if (!pkts_n) + return 0; + /* Count the number of packets having same ol_flags. */ + for (pos = 1; pos < pkts_n; ++pos) + if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask) + break; + /* Should open another MPW session for the rest. */ + if (pkts[0]->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { + const uint64_t is_tunneled = + pkts[0]->ol_flags & + (PKT_TX_TUNNEL_GRE | + PKT_TX_TUNNEL_VXLAN); + + if (is_tunneled && txq->tunnel_en) { + *cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM | + MLX5_ETH_WQE_L4_INNER_CSUM; + if (pkts[0]->ol_flags & PKT_TX_OUTER_IP_CKSUM) + *cs_flags |= MLX5_ETH_WQE_L3_CSUM; + } else { + *cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + } + } + return pos; +} + +/** + * Send multi-segmented packets until it encounters a single segment packet in + * the pkts list. + * + * @param txq + * Pointer to TX queue structure. + * @param pkts + * Pointer to array of packets to be sent. + * @param pkts_n + * Number of packets to be sent. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). 
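txq_wr_dseg_v() above assembles a data segment with _mm_set_epi32() and converts its length and address fields to big-endian with a single _mm_shuffle_epi8(). The standalone example below (SSSE3, compile with -mssse3) demonstrates how such a shuffle mask performs a per-lane bswap32; the mask is generic and not one of the driver's.

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

int
main(void)
{
        /* Dest byte i takes source byte mask[i]; reversing the bytes of
         * each 32-bit lane is a byte-wise bswap32 of all four lanes. */
        const __m128i bswap32_mask =
                _mm_set_epi8(12, 13, 14, 15,
                             8, 9, 10, 11,
                             4, 5, 6, 7,
                             0, 1, 2, 3);
        const __m128i in = _mm_set_epi32(0, 0, 0, 0x00112233);
        const __m128i out = _mm_shuffle_epi8(in, bswap32_mask);
        uint32_t lane0 = (uint32_t)_mm_cvtsi128_si32(out);

        printf("0x00112233 -> 0x%08x\n", lane0); /* prints 0x33221100 */
        return 0;
}

The same trick is what lets the driver byte-swap several descriptor fields in one instruction instead of calling htonl()/htonll() per field.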
+ */ +static uint16_t +txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + uint16_t elts_head = txq->elts_head; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; + const uint16_t wq_n = 1 << txq->wqe_n; + const uint16_t wq_mask = wq_n - 1; + const unsigned int nb_dword_per_wqebb = + MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE; + const unsigned int nb_dword_in_hdr = + sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE; + unsigned int n; + volatile struct mlx5_wqe *wqe = NULL; + + assert(elts_n > pkts_n); + mlx5_tx_complete(txq); + if (unlikely(!pkts_n)) + return 0; + for (n = 0; n < pkts_n; ++n) { + struct rte_mbuf *buf = pkts[n]; + unsigned int segs_n = buf->nb_segs; + unsigned int ds = nb_dword_in_hdr; + unsigned int len = PKT_LEN(buf); + uint16_t wqe_ci = txq->wqe_ci; + const __m128i shuf_mask_ctrl = + _mm_set_epi8(15, 14, 13, 12, + 8, 9, 10, 11, /* bswap32 */ + 4, 5, 6, 7, /* bswap32 */ + 0, 1, 2, 3 /* bswap32 */); + uint8_t cs_flags = 0; + uint16_t max_elts; + uint16_t max_wqe; + __m128i *t_wqe, *dseg; + __m128i ctrl; + + assert(segs_n); + max_elts = elts_n - (elts_head - txq->elts_tail); + max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi); + /* + * A MPW session consumes 2 WQEs at most to + * include MLX5_MPW_DSEG_MAX pointers. + */ + if (segs_n == 1 || + max_elts < segs_n || max_wqe < 2) + break; + wqe = &((volatile struct mlx5_wqe64 *) + txq->wqes)[wqe_ci & wq_mask].hdr; + if (buf->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { + const uint64_t is_tunneled = buf->ol_flags & + (PKT_TX_TUNNEL_GRE | + PKT_TX_TUNNEL_VXLAN); + + if (is_tunneled && txq->tunnel_en) { + cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM | + MLX5_ETH_WQE_L4_INNER_CSUM; + if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) + cs_flags |= MLX5_ETH_WQE_L3_CSUM; + } else { + cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + } + } + /* Title WQEBB pointer. */ + t_wqe = (__m128i *)wqe; + dseg = (__m128i *)(wqe + 1); + do { + if (!(ds++ % nb_dword_per_wqebb)) { + dseg = (__m128i *) + &((volatile struct mlx5_wqe64 *) + txq->wqes)[++wqe_ci & wq_mask]; + } + txq_wr_dseg_v(txq, dseg++, &buf, 1); + (*txq->elts)[elts_head++ & elts_m] = buf; + buf = buf->next; + } while (--segs_n); + ++wqe_ci; + /* Fill CTRL in the header. */ + ctrl = _mm_set_epi32(0, 0, txq->qp_num_8s | ds, + MLX5_OPC_MOD_MPW << 24 | + txq->wqe_ci << 8 | MLX5_OPCODE_TSO); + ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl); + _mm_store_si128(t_wqe, ctrl); + /* Fill ESEG in the header. */ + _mm_store_si128(t_wqe + 1, + _mm_set_epi16(0, 0, 0, 0, + htons(len), cs_flags, + 0, 0)); + txq->wqe_ci = wqe_ci; + } + if (!n) + return 0; + txq->elts_comp += (uint16_t)(elts_head - txq->elts_head); + txq->elts_head = elts_head; + if (txq->elts_comp >= MLX5_TX_COMP_THRESH) { + wqe->ctrl[2] = htonl(8); + wqe->ctrl[3] = txq->elts_head; + txq->elts_comp = 0; + ++txq->cq_pi; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + txq->stats.opackets += n; +#endif + mlx5_tx_dbrec(txq, wqe); + return n; +} + +/** + * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet, + * it returns to make it processed by txq_scatter_v(). All the packets in + * the pkts list should be single segment packets having same offload flags. + * This must be checked by txq_check_multiseg() and txq_calc_offload(). + * + * @param txq + * Pointer to TX queue structure. + * @param pkts + * Pointer to array of packets to be sent. + * @param pkts_n + * Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST). 
+ * @param cs_flags + * Checksum offload flags to be written in the descriptor. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +static inline uint16_t +txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n, + uint8_t cs_flags) +{ + struct rte_mbuf **elts; + uint16_t elts_head = txq->elts_head; + const uint16_t elts_n = 1 << txq->elts_n; + const uint16_t elts_m = elts_n - 1; + const unsigned int nb_dword_per_wqebb = + MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE; + const unsigned int nb_dword_in_hdr = + sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE; + unsigned int n = 0; + unsigned int pos; + uint16_t max_elts; + uint16_t max_wqe; + uint32_t comp_req = 0; + const uint16_t wq_n = 1 << txq->wqe_n; + const uint16_t wq_mask = wq_n - 1; + uint16_t wq_idx = txq->wqe_ci & wq_mask; + volatile struct mlx5_wqe64 *wq = + &((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx]; + volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq; + const __m128i shuf_mask_ctrl = + _mm_set_epi8(15, 14, 13, 12, + 8, 9, 10, 11, /* bswap32 */ + 4, 5, 6, 7, /* bswap32 */ + 0, 1, 2, 3 /* bswap32 */); + __m128i *t_wqe, *dseg; + __m128i ctrl; + + /* Make sure all packets can fit into a single WQE. */ + assert(elts_n > pkts_n); + mlx5_tx_complete(txq); + max_elts = (elts_n - (elts_head - txq->elts_tail)); + max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); + pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts); + if (unlikely(!pkts_n)) + return 0; + elts = &(*txq->elts)[elts_head & elts_m]; + /* Loop for available tailroom first. */ + n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n); + for (pos = 0; pos < (n & -2); pos += 2) + _mm_storeu_si128((__m128i *)&elts[pos], + _mm_loadu_si128((__m128i *)&pkts[pos])); + if (n & 1) + elts[pos] = pkts[pos]; + /* Check if it crosses the end of the queue. */ + if (unlikely(n < pkts_n)) { + elts = &(*txq->elts)[0]; + for (pos = 0; pos < pkts_n - n; ++pos) + elts[pos] = pkts[n + pos]; + } + txq->elts_head += pkts_n; + /* Save title WQEBB pointer. */ + t_wqe = (__m128i *)wqe; + dseg = (__m128i *)(wqe + 1); + /* Calculate the number of entries to the end. */ + n = RTE_MIN( + (wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr, + pkts_n); + /* Fill DSEGs. */ + txq_wr_dseg_v(txq, dseg, pkts, n); + /* Check if it crosses the end of the queue. */ + if (n < pkts_n) { + dseg = (__m128i *)txq->wqes; + txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n); + } + if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) { + txq->elts_comp += pkts_n; + } else { + /* Request a completion. */ + txq->elts_comp = 0; + ++txq->cq_pi; + comp_req = 8; + } + /* Fill CTRL in the header. */ + ctrl = _mm_set_epi32(txq->elts_head, comp_req, + txq->qp_num_8s | (pkts_n + 2), + MLX5_OPC_MOD_ENHANCED_MPSW << 24 | + txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW); + ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl); + _mm_store_si128(t_wqe, ctrl); + /* Fill ESEG in the header. */ + _mm_store_si128(t_wqe + 1, + _mm_set_epi8(0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, cs_flags, + 0, 0, 0, 0)); +#ifdef MLX5_PMD_SOFT_COUNTERS + txq->stats.opackets += pkts_n; +#endif + txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) / + nb_dword_per_wqebb; + /* Ring QP doorbell. */ + mlx5_tx_dbrec(txq, wqe); + return pkts_n; +} + +/** + * DPDK callback for vectorized TX. + * + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. 
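txq_burst_v() above stores the mbuf pointers into the elts ring in at most two runs, one up to the end of the array and one restarting at index zero, rather than masking every store. A plain-C sketch of that wrap-around copy follows; names and sizes are invented, and the SSE two-pointers-per-store variant in the patch is an optimization of the same idea.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define RING_SIZE 8 /* must be a power of two */

struct ring {
        void *slot[RING_SIZE];
        uint16_t head; /* free-running producer counter */
};

/* Copy n pointers (n <= RING_SIZE, guaranteed by the caller) into the
 * ring starting at head, wrapping at most once. */
static void
ring_enqueue(struct ring *r, void *const *src, unsigned int n)
{
        unsigned int idx = r->head & (RING_SIZE - 1);
        unsigned int first = n < RING_SIZE - idx ? n : RING_SIZE - idx;

        memcpy(&r->slot[idx], src, first * sizeof(*src));
        memcpy(&r->slot[0], src + first, (n - first) * sizeof(*src));
        r->head += n;
}

int
main(void)
{
        struct ring r = { .head = 6 }; /* force a wrap after two slots */
        int payload[5];
        void *src[5];
        unsigned int i;

        for (i = 0; i < 5; ++i)
                src[i] = &payload[i];
        ring_enqueue(&r, src, 5);
        for (i = 0; i < RING_SIZE; ++i)
                printf("slot[%u] %s\n", i, r.slot[i] ? "filled" : "empty");
        return 0;
}

Keeping head as a free-running counter masked only at access time is the same change the patch applies to elts_head/elts_tail throughout the Tx paths.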
+ * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n) +{ + struct txq *txq = (struct txq *)dpdk_txq; + uint16_t nb_tx = 0; + + while (pkts_n > nb_tx) { + uint16_t n; + uint16_t ret; + + n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST); + ret = txq_burst_v(txq, &pkts[nb_tx], n, 0); + nb_tx += ret; + if (!ret) + break; + } + return nb_tx; +} + +/** + * DPDK callback for vectorized TX with multi-seg packets and offload. + * + * @param dpdk_txq + * Generic pointer to TX queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct txq *txq = (struct txq *)dpdk_txq; + uint16_t nb_tx = 0; + + while (pkts_n > nb_tx) { + uint8_t cs_flags = 0; + uint16_t n; + uint16_t ret; + + /* Transmit multi-seg packets in the head of pkts list. */ + if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) && + NB_SEGS(pkts[nb_tx]) > 1) + nb_tx += txq_scatter_v(txq, + &pkts[nb_tx], + pkts_n - nb_tx); + n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST); + if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS)) + n = txq_check_multiseg(&pkts[nb_tx], n); + if (!(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS)) + n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags); + ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags); + nb_tx += ret; + if (!ret) + break; + } + return nb_tx; +} + +/** + * Store free buffers to RX SW ring. + * + * @param rxq + * Pointer to RX queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. + */ +static inline void +rxq_copy_mbuf_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t n) +{ + const uint16_t q_mask = (1 << rxq->elts_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + __m128i mbp; + + mbp = _mm_loadu_si128((__m128i *)&elts[pos]); + _mm_storeu_si128((__m128i *)&pkts[pos], mbp); + } + if (n & 1) + pkts[pos] = elts[pos]; +} + +/** + * Replenish buffers for RX in bulk. + * + * @param rxq + * Pointer to RX queue structure. + * @param n + * Number of buffers to be replenished. + */ +static inline void +rxq_replenish_bulk_mbuf(struct rxq *rxq, uint16_t n) +{ + const uint16_t q_n = 1 << rxq->elts_n; + const uint16_t q_mask = q_n - 1; + const uint16_t elts_idx = rxq->rq_ci & q_mask; + struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; + volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx]; + unsigned int i; + + assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH); + assert(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi))); + assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx); + if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { + rxq->stats.rx_nombuf += n; + return; + } + for (i = 0; i < n; ++i) + wq[i].addr = htonll((uintptr_t)elts[i]->buf_addr + + RTE_PKTMBUF_HEADROOM); + rxq->rq_ci += n; + rte_wmb(); + *rxq->rq_db = htonl(rxq->rq_ci); +} + +/** + * Decompress a compressed completion and fill in mbufs in RX SW ring with data + * extracted from the title completion descriptor. + * + * @param rxq + * Pointer to RX queue structure. 
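mlx5_tx_burst_raw_vec() and mlx5_tx_burst_vec() above are thin wrappers that feed txq_burst_v() in chunks of at most MLX5_VPMD_TX_MAX_BURST and stop once a chunk makes no progress. A minimal sketch of that chunking loop follows; the inner routine and the queue-room model are made up for the example.

#include <stdint.h>
#include <stdio.h>

#define MAX_CHUNK 64u /* stand-in for MLX5_VPMD_TX_MAX_BURST */

/* Fake inner routine: pretend the queue only has room for *room packets. */
static uint16_t
burst_chunk(uint16_t n, uint16_t *room)
{
        uint16_t sent = n < *room ? n : *room;

        *room -= sent;
        return sent;
}

static uint16_t
burst(uint16_t pkts_n, uint16_t *room)
{
        uint16_t nb_tx = 0;

        while (pkts_n > nb_tx) {
                uint16_t n = pkts_n - nb_tx;
                uint16_t ret;

                if (n > MAX_CHUNK)
                        n = MAX_CHUNK;
                ret = burst_chunk(n, room);
                nb_tx += ret;
                if (!ret) /* no progress: queue is out of room */
                        break;
        }
        return nb_tx;
}

int
main(void)
{
        uint16_t room = 100;

        printf("sent %u of 150\n", burst(150, &room)); /* prints 100 */
        return 0;
}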
+ * @param cq + * Pointer to completion array having a compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. + */ +static inline void +rxq_cq_decompress_v(struct rxq *rxq, + volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts) +{ + volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1); + struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */ + unsigned int pos; + unsigned int i; + unsigned int inv = 0; + /* Mask to shuffle from extracted mini CQE to mbuf. */ + const __m128i shuf_mask1 = + _mm_set_epi8(0, 1, 2, 3, /* rss, bswap32 */ + -1, -1, /* skip vlan_tci */ + 6, 7, /* data_len, bswap16 */ + -1, -1, 6, 7, /* pkt_len, bswap16 */ + -1, -1, -1, -1 /* skip packet_type */); + const __m128i shuf_mask2 = + _mm_set_epi8(8, 9, 10, 11, /* rss, bswap32 */ + -1, -1, /* skip vlan_tci */ + 14, 15, /* data_len, bswap16 */ + -1, -1, 14, 15, /* pkt_len, bswap16 */ + -1, -1, -1, -1 /* skip packet_type */); + /* Restore the compressed count. Must be 16 bits. */ + const uint16_t mcqe_n = t_pkt->data_len + + (rxq->crc_present * ETHER_CRC_LEN); + const __m128i rearm = + _mm_loadu_si128((__m128i *)&t_pkt->rearm_data); + const __m128i rxdf = + _mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1); + const __m128i crc_adj = + _mm_set_epi16(0, 0, 0, + rxq->crc_present * ETHER_CRC_LEN, + 0, + rxq->crc_present * ETHER_CRC_LEN, + 0, 0); + const uint32_t flow_tag = t_pkt->hash.fdir.hi; +#ifdef MLX5_PMD_SOFT_COUNTERS + const __m128i zero = _mm_setzero_si128(); + const __m128i ones = _mm_cmpeq_epi32(zero, zero); + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const __m128i len_shuf_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 14, 15, 6, 7, + 10, 11, 2, 3); +#endif + + /* Compile time sanity check for this function. */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); + /* + * A. load mCQEs into a 128bit register. + * B. store rearm data to mbuf. + * C. combine data from mCQEs with rx_descriptor_fields1. + * D. store rx_descriptor_fields1. + * E. store flow tag (rte_flow mark). + */ + for (pos = 0; pos < mcqe_n; ) { + __m128i mcqe1, mcqe2; + __m128i rxdf1, rxdf2; +#ifdef MLX5_PMD_SOFT_COUNTERS + __m128i byte_cnt, invalid_mask; +#endif + + if (!(pos & 0x7) && pos + 8 < mcqe_n) + rte_prefetch0((void *)(cq + pos + 8)); + /* A.1 load mCQEs into a 128bit register. */ + mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]); + mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]); + /* B.1 store rearm data to mbuf. */ + _mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm); + _mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm); + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1); + rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2); + rxdf1 = _mm_sub_epi16(rxdf1, crc_adj); + rxdf2 = _mm_sub_epi16(rxdf2, crc_adj); + rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23); + rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23); + /* D.1 store rx_descriptor_fields1. 
*/ + _mm_storeu_si128((__m128i *) + &elts[pos]->rx_descriptor_fields1, + rxdf1); + _mm_storeu_si128((__m128i *) + &elts[pos + 1]->rx_descriptor_fields1, + rxdf2); + /* B.1 store rearm data to mbuf. */ + _mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm); + _mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm); + /* C.1 combine data from mCQEs with rx_descriptor_fields1. */ + rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1); + rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2); + rxdf1 = _mm_sub_epi16(rxdf1, crc_adj); + rxdf2 = _mm_sub_epi16(rxdf2, crc_adj); + rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23); + rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23); + /* D.1 store rx_descriptor_fields1. */ + _mm_storeu_si128((__m128i *) + &elts[pos + 2]->rx_descriptor_fields1, + rxdf1); + _mm_storeu_si128((__m128i *) + &elts[pos + 3]->rx_descriptor_fields1, + rxdf2); +#ifdef MLX5_PMD_SOFT_COUNTERS + invalid_mask = _mm_set_epi64x(0, + (mcqe_n - pos) * + sizeof(uint16_t) * 8); + invalid_mask = _mm_sll_epi64(ones, invalid_mask); + mcqe1 = _mm_srli_si128(mcqe1, 4); + byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc); + byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask); + byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt); + byte_cnt = _mm_hadd_epi16(byte_cnt, zero); + rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero)); +#endif + if (rxq->mark) { + /* E.1 store flow tag (rte_flow mark). */ + elts[pos]->hash.fdir.hi = flow_tag; + elts[pos + 1]->hash.fdir.hi = flow_tag; + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { + mcq = (void *)(cq + pos); + for (i = 0; i < 8; ++i) + cq[inv++].op_own = MLX5_CQE_INVALIDATE; + } + } + /* Invalidate the rest of CQEs. */ + for (; inv < mcqe_n; ++inv) + cq[inv].op_own = MLX5_CQE_INVALIDATE; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += mcqe_n; + rxq->stats.ibytes += rcvd_byte; +#endif + rxq->cq_ci += mcqe_n; +} + +/** + * Calculate packet type and offload flag for mbuf and store it. + * + * @param rxq + * Pointer to RX queue structure. + * @param cqes[4] + * Array of four 16bytes completions extracted from the original completion + * descriptor. + * @param op_err + * Opcode vector having responder error status. Each field is 4B. + * @param pkts + * Pointer to array of packets to be filled. 
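rxq_cq_decompress_v() above expands a compressed completion session: the title CQE supplies the fields shared by the whole burst, and each following slot packs eight mini-CQEs that carry only the per-packet byte count and RSS hash. The plain-C sketch below illustrates that expansion with simplified, invented structures; it omits the vectorization, CRC adjustment and CQE invalidation the driver performs.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the title completion and a mini completion. */
struct title_cqe {
        uint32_t flow_tag;
        uint16_t vlan;
        uint16_t mcqe_cnt; /* number of compressed completions */
};

struct mini_cqe {
        uint32_t byte_cnt;
        uint32_t rss_hash;
};

struct pkt_meta {
        uint32_t flow_tag;
        uint16_t vlan;
        uint32_t len;
        uint32_t rss;
};

/* Expand: replicate the shared title fields, take the per-packet fields
 * from the mini-CQE array. */
static void
decompress(const struct title_cqe *t, const struct mini_cqe *mini,
           struct pkt_meta *out)
{
        uint16_t i;

        for (i = 0; i < t->mcqe_cnt; ++i) {
                out[i].flow_tag = t->flow_tag;
                out[i].vlan = t->vlan;
                out[i].len = mini[i].byte_cnt;
                out[i].rss = mini[i].rss_hash;
        }
}

int
main(void)
{
        struct title_cqe t = { 0xcafe, 100, 3 };
        struct mini_cqe m[3] = { {64, 1}, {128, 2}, {256, 3} };
        struct pkt_meta out[3];
        int i;

        decompress(&t, m, out);
        for (i = 0; i < 3; ++i)
                printf("pkt %d: len=%u rss=%u tag=0x%x\n",
                       i, out[i].len, out[i].rss, out[i].flow_tag);
        return 0;
}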
+ */ +static inline void +rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err, + struct rte_mbuf **pkts) +{ + __m128i pinfo0, pinfo1; + __m128i pinfo, ptype; + __m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH); + __m128i cv_flags; + const __m128i zero = _mm_setzero_si128(); + const __m128i ptype_mask = + _mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06); + const __m128i ptype_ol_mask = + _mm_set_epi32(0x106, 0x106, 0x106, 0x106); + const __m128i pinfo_mask = + _mm_set_epi32(0x3, 0x3, 0x3, 0x3); + const __m128i cv_flag_sel = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, + (uint8_t)((PKT_RX_IP_CKSUM_GOOD | + PKT_RX_L4_CKSUM_GOOD) >> 1), + 0, + (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1), + 0, + (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1), + (uint8_t)(PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED), + 0); + const __m128i cv_mask = + _mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED); + const __m128i mbuf_init = + _mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer); + __m128i rearm0, rearm1, rearm2, rearm3; + + /* Extract pkt_info field. */ + pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1); + /* Extract hdr_type_etc field. */ + pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]); + pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]); + ptype = _mm_unpacklo_epi64(pinfo0, pinfo1); + if (rxq->mark) { + const __m128i pinfo_ft_mask = + _mm_set_epi32(0xffffff00, 0xffffff00, + 0xffffff00, 0xffffff00); + const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR); + const __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID); + __m128i flow_tag, invalid_mask; + + flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask); + /* Check if flow tag is non-zero then set PKT_RX_FDIR. */ + invalid_mask = _mm_cmpeq_epi32(flow_tag, zero); + ol_flags = _mm_or_si128(ol_flags, + _mm_andnot_si128(invalid_mask, + fdir_flags)); + /* Mask out invalid entries. */ + flow_tag = _mm_andnot_si128(invalid_mask, flow_tag); + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = _mm_or_si128(ol_flags, + _mm_andnot_si128( + _mm_cmpeq_epi32(flow_tag, + pinfo_ft_mask), + fdir_id_flags)); + } + /* + * Merge the two fields to generate the following: + * bit[1] = l3_ok + * bit[2] = l4_ok + * bit[8] = cv + * bit[11:10] = l3_hdr_type + * bit[14:12] = l4_hdr_type + * bit[15] = ip_frag + * bit[16] = tunneled + * bit[17] = outer_l3_type + */ + ptype = _mm_and_si128(ptype, ptype_mask); + pinfo = _mm_and_si128(pinfo, pinfo_mask); + pinfo = _mm_slli_epi32(pinfo, 16); + /* Make pinfo has merged fields for ol_flags calculation. */ + pinfo = _mm_or_si128(ptype, pinfo); + ptype = _mm_srli_epi32(pinfo, 10); + ptype = _mm_packs_epi32(ptype, zero); + /* Errored packets will have RTE_PTYPE_ALL_MASK. */ + op_err = _mm_srli_epi16(op_err, 8); + ptype = _mm_or_si128(ptype, op_err); + pkts[0]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 0)]; + pkts[1]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 2)]; + pkts[2]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 4)]; + pkts[3]->packet_type = mlx5_ptype_table[_mm_extract_epi8(ptype, 6)]; + /* Fill flags for checksum and VLAN. 
*/ + pinfo = _mm_and_si128(pinfo, ptype_ol_mask); + pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo); + /* Locate checksum flags at byte[2:1] and merge with VLAN flags. */ + cv_flags = _mm_slli_epi32(pinfo, 9); + cv_flags = _mm_or_si128(pinfo, cv_flags); + /* Move back flags to start from byte[0]. */ + cv_flags = _mm_srli_epi32(cv_flags, 8); + /* Mask out garbage bits. */ + cv_flags = _mm_and_si128(cv_flags, cv_mask); + /* Merge to ol_flags. */ + ol_flags = _mm_or_si128(ol_flags, cv_flags); + /* Merge mbuf_init and ol_flags. */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != + offsetof(struct rte_mbuf, rearm_data) + 8); + rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30); + rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30); + rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30); + rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30); + /* Write 8B rearm_data and 8B ol_flags. */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != + RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); + _mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0); + _mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1); + _mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2); + _mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3); +} + +/** + * Skip error packets. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +static uint16_t +rxq_handle_pending_error(struct rxq *rxq, struct rte_mbuf **pkts, + uint16_t pkts_n) +{ + uint16_t n = 0; + unsigned int i; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t err_bytes = 0; +#endif + + for (i = 0; i < pkts_n; ++i) { + struct rte_mbuf *pkt = pkts[i]; + + if (pkt->packet_type == RTE_PTYPE_ALL_MASK) { +#ifdef MLX5_PMD_SOFT_COUNTERS + err_bytes += PKT_LEN(pkt); +#endif + rte_pktmbuf_free_seg(pkt); + } else { + pkts[n++] = pkt; + } + } + rxq->stats.idropped += (pkts_n - n); +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Correct counters of errored completions. */ + rxq->stats.ipackets -= (pkts_n - n); + rxq->stats.ibytes -= err_bytes; +#endif + rxq->pending_err = 0; + return n; +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets received including errors (<= pkts_n). 
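rxq_handle_pending_error() above frees the mbufs that the burst routine marked with RTE_PTYPE_ALL_MASK and compacts the survivors in place. The sketch below shows the same filter-and-compact pattern on a plain pointer array; the sentinel value and the no-op drop callback are stand-ins for the mbuf marking and rte_pktmbuf_free_seg().

#include <stdio.h>

#define BAD_PKT ((void *)-1) /* hypothetical sentinel for an errored entry */

/* Drop errored entries, keep the rest packed at the front, return count. */
static unsigned int
compact_good(void **pkts, unsigned int n, void (*drop)(void *))
{
        unsigned int i, kept = 0;

        for (i = 0; i < n; ++i) {
                if (pkts[i] == BAD_PKT)
                        drop(pkts[i]);
                else
                        pkts[kept++] = pkts[i];
        }
        return kept;
}

static void
drop_noop(void *p)
{
        (void)p; /* a real driver would free the buffer here */
}

int
main(void)
{
        int a, b, c;
        void *pkts[5] = { &a, BAD_PKT, &b, BAD_PKT, &c };
        unsigned int kept = compact_good(pkts, 5, drop_noop);

        printf("kept %u of 5\n", kept); /* prints 3 */
        return 0;
}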
+ */ +static inline uint16_t +rxq_burst_v(struct rxq *rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint16_t repl_n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); + const __m128i owner_check = + _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL); + const __m128i opcode_check = + _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL); + const __m128i format_check = + _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL); + const __m128i resp_err_check = + _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const __m128i len_shuf_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 12, 13, 8, 9, + 4, 5, 0, 1); +#endif + /* Mask to shuffle from extracted CQE to mbuf. */ + const __m128i shuf_mask = + _mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */ + 12, 13, 14, 15, /* rss, bswap32 */ + 10, 11, /* vlan_tci, bswap16 */ + 4, 5, /* data_len, bswap16 */ + -1, -1, /* zero out 2nd half of pkt_len */ + 4, 5 /* pkt_len, bswap16 */); + /* Mask to blend from the last Qword to the first DQword. */ + const __m128i blend_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, -1); + const __m128i zero = _mm_setzero_si128(); + const __m128i ones = _mm_cmpeq_epi32(zero, zero); + const __m128i crc_adj = + _mm_set_epi16(0, 0, 0, 0, 0, + rxq->crc_present * ETHER_CRC_LEN, + 0, + rxq->crc_present * ETHER_CRC_LEN); + const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0); + + /* Compile time sanity check for this function. */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, pkt_info) != 0); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rx_hash_res) != + offsetof(struct mlx5_cqe, pkt_info) + 12); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rsvd1) + + sizeof(((struct mlx5_cqe *)0)->rsvd1) != + offsetof(struct mlx5_cqe, hdr_type_etc)); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, vlan_info) != + offsetof(struct mlx5_cqe, hdr_type_etc) + 2); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, rsvd2) + + sizeof(((struct mlx5_cqe *)0)->rsvd2) != + offsetof(struct mlx5_cqe, byte_cnt)); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, sop_drop_qpn) != + RTE_ALIGN(offsetof(struct mlx5_cqe, sop_drop_qpn), 8)); + RTE_BUILD_BUG_ON(offsetof(struct mlx5_cqe, op_own) != + offsetof(struct mlx5_cqe, sop_drop_qpn) + 7); + assert(rxq->sges_n == 0); + assert(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + /* + * Order of indexes: + * rq_ci >= cq_ci >= rq_pi + * Definition of indexes: + * rq_ci - cq_ci := # of buffers owned by HW (posted). + * cq_ci - rq_pi := # of buffers not returned to app (decompressed). + * N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished). 
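The index invariant documented just above (rq_ci >= cq_ci >= rq_pi, all free-running 16-bit counters) lets rxq_burst_v() derive every count by plain unsigned subtraction, which stays correct across 16-bit wraparound. A short worked example with made-up counter values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        const uint16_t q_n = 512;  /* ring size, power of two */
        /* Made-up counters chosen to straddle the 16-bit wrap. */
        uint16_t rq_ci = 10;       /* already wrapped past 65535 */
        uint16_t cq_ci = 65500;
        uint16_t rq_pi = 65430;
        uint16_t posted = (uint16_t)(rq_ci - cq_ci);   /* owned by HW */
        uint16_t unret = (uint16_t)(cq_ci - rq_pi);    /* not yet returned */
        uint16_t repl = (uint16_t)(q_n - (uint16_t)(rq_ci - rq_pi));

        printf("posted=%u unreturned=%u to_replenish=%u\n",
               posted, unret, repl); /* prints 46 70 396 */
        return 0;
}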
+ */ + repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); + if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH) + rxq_replenish_bulk_mbuf(rxq, repl_n); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->cq_ci - rxq->rq_pi; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + pkts += rcvd_pkt; + } + elts_idx = rxq->rq_pi & q_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + if (!pkts_n) + return rcvd_pkt; + /* At this point, there shouldn't be any remained packets. */ + assert(rxq->rq_pi == rxq->cq_ci); + /* + * A. load first Qword (8bytes) in one loop. + * B. copy 4 mbuf pointers from elts ring to returing pkts. + * C. load remained CQE data and extract necessary fields. + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint8_t pkt_info; + * uint8_t flow_tag[3]; + * uint16_t byte_cnt; + * uint8_t rsvd4; + * uint8_t op_own; + * uint16_t hdr_type_etc; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * } c; + * D. fill in mbuf. + * E. get valid CQEs. + * F. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + __m128i cqes[MLX5_VPMD_DESCS_PER_LOOP]; + __m128i cqe_tmp1, cqe_tmp2; + __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; + __m128i op_own, op_own_tmp1, op_own_tmp2; + __m128i opcode, owner_mask, invalid_mask; + __m128i comp_mask; + __m128i mask; +#ifdef MLX5_PMD_SOFT_COUNTERS + __m128i byte_cnt; +#endif + __m128i mbp1, mbp2; + __m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + unsigned int p1, p2, p3; + + /* Prefetch next 4 CQEs. */ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]); + } + /* A.0 do not cross the end of CQ. */ + mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + p = _mm_andnot_si128(mask, p); + /* A.1 load cqes. */ + p3 = _mm_extract_epi16(p, 3); + cqes[3] = _mm_loadl_epi64((__m128i *) + &cq[pos + p3].sop_drop_qpn); + rte_compiler_barrier(); + p2 = _mm_extract_epi16(p, 2); + cqes[2] = _mm_loadl_epi64((__m128i *) + &cq[pos + p2].sop_drop_qpn); + rte_compiler_barrier(); + /* B.1 load mbuf pointers. */ + mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]); + mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]); + /* A.1 load a block having op_own. */ + p1 = _mm_extract_epi16(p, 1); + cqes[1] = _mm_loadl_epi64((__m128i *) + &cq[pos + p1].sop_drop_qpn); + rte_compiler_barrier(); + cqes[0] = _mm_loadl_epi64((__m128i *) + &cq[pos].sop_drop_qpn); + /* B.2 copy mbuf pointers. */ + _mm_storeu_si128((__m128i *)&pkts[pos], mbp1); + _mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2); + rte_compiler_barrier(); + /* C.1 load remained CQE data and extract necessary fields. 
*/ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]); + cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask); + cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].rsvd1[3]); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].rsvd1[3]); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd2[10]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd2[10]); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask); + pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj); + pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj); + pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3); + _mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2); + /* E.1 extract op_own field. */ + op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + /* C.1 load remained CQE data and extract necessary fields. */ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]); + cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask); + cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].rsvd1[3]); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].rsvd1[3]); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd2[10]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd2[10]); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask); + pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj); + pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj); + pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj); + /* E.1 extract op_own byte. */ + op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1); + _mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0); + /* E.2 flip owner bit to mark CQEs from last round. */ + owner_mask = _mm_and_si128(op_own, owner_check); + if (ownership) + owner_mask = _mm_xor_si128(owner_mask, owner_check); + owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check); + owner_mask = _mm_packs_epi32(owner_mask, zero); + /* E.3 get mask for invalidated CQEs. */ + opcode = _mm_and_si128(op_own, opcode_check); + invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode); + invalid_mask = _mm_packs_epi32(invalid_mask, zero); + /* E.4 mask out beyond boundary. */ + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.5 merge invalid_mask with invalid owner. 
 */
+ invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
+ /* F.1 find compressed CQE format. */
+ comp_mask = _mm_and_si128(op_own, format_check);
+ comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
+ comp_mask = _mm_packs_epi32(comp_mask, zero);
+ /* F.2 mask out invalid entries. */
+ comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
+ comp_idx = _mm_cvtsi128_si64(comp_mask);
+ /* F.3 get the first compressed CQE. */
+ comp_idx = comp_idx ?
+ __builtin_ctzll(comp_idx) /
+ (sizeof(uint16_t) * 8) :
+ MLX5_VPMD_DESCS_PER_LOOP;
+ /* E.6 mask out entries after the compressed CQE. */
+ mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
+ mask = _mm_sll_epi64(ones, mask);
+ invalid_mask = _mm_or_si128(invalid_mask, mask);
+ /* E.7 count non-compressed valid CQEs. */
+ n = _mm_cvtsi128_si64(invalid_mask);
+ n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+ MLX5_VPMD_DESCS_PER_LOOP;
+ nocmp_n += n;
+ /* D.2 get the final invalid mask. */
+ mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
+ mask = _mm_sll_epi64(ones, mask);
+ invalid_mask = _mm_or_si128(invalid_mask, mask);
+ /* D.3 check error in opcode. */
+ opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
+ opcode = _mm_packs_epi32(opcode, zero);
+ opcode = _mm_andnot_si128(invalid_mask, opcode);
+ /* D.4 mark if any error is set. */
+ rxq->pending_err |= !!_mm_cvtsi128_si64(opcode);
+ /* D.5 fill in mbuf - rearm_data and packet_type. */
+ rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Add up the received byte count. */
+ byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
+ byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
+ byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
+ rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
+#endif
+ /*
+ * Break the loop unless more valid CQEs are expected, or if
+ * there's a compressed CQE.
+ */
+ if (n != MLX5_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ /* If no new CQE is seen, return without updating cq_db. */
+ if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+ return rcvd_pkt;
+ /* Update the consumer indexes for non-compressed CQEs. */
+ assert(nocmp_n <= pkts_n);
+ rxq->cq_ci += nocmp_n;
+ rxq->rq_pi += nocmp_n;
+ rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += nocmp_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ /* Decompress the last CQE if compressed. */
+ if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+ assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+ rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
+ /* Return more packets if needed. */
+ if (nocmp_n < pkts_n) {
+ uint16_t n = rxq->cq_ci - rxq->rq_pi;
+
+ n = RTE_MIN(n, pkts_n - nocmp_n);
+ rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
+ rxq->rq_pi += n;
+ rcvd_pkt += n;
+ }
+ }
+ rte_wmb();
+ *rxq->cq_db = htonl(rxq->cq_ci);
+ return rcvd_pkt;
+}
+
+/**
+ * DPDK callback for vectorized RX.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct rxq *rxq = dpdk_rxq;
+ uint16_t nb_rx;
+
+ nb_rx = rxq_burst_v(rxq, pkts, pkts_n);
+ if (unlikely(rxq->pending_err))
+ nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
+ return nb_rx;
+}
+
+/**
+ * Check whether Tx queue flags are set for raw vectorized Tx.
+ *
+ * @param priv
+ * Pointer to private structure.
+ *
+ * @return
+ * 1 if supported, negative errno value if not.
+ */
+int __attribute__((cold))
+priv_check_raw_vec_tx_support(struct priv *priv)
+{
+ uint16_t i;
+
+ /* All the configured queues should support it. */
+ for (i = 0; i < priv->txqs_n; ++i) {
+ struct txq *txq = (*priv->txqs)[i];
+
+ if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) ||
+ !(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
+ break;
+ }
+ if (i != priv->txqs_n)
+ return -ENOTSUP;
+ return 1;
+}
+
+/**
+ * Check whether a device can support vectorized TX.
+ *
+ * @param priv
+ * Pointer to private structure.
+ *
+ * @return
+ * 1 if supported, negative errno value if not.
+ */
+int __attribute__((cold))
+priv_check_vec_tx_support(struct priv *priv)
+{
+ if (!priv->tx_vec_en ||
+ priv->txqs_n > MLX5_VPMD_MIN_TXQS ||
+ priv->mps != MLX5_MPW_ENHANCED ||
+ priv->tso)
+ return -ENOTSUP;
+ return 1;
+}
+
+/**
+ * Check whether an RX queue can support vectorized RX.
+ *
+ * @param rxq
+ * Pointer to RX queue.
+ *
+ * @return
+ * 1 if supported, negative errno value if not.
+ */
+int __attribute__((cold))
+rxq_check_vec_support(struct rxq *rxq)
+{
+ struct rxq_ctrl *ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+
+ if (!ctrl->priv->rx_vec_en || rxq->sges_n != 0)
+ return -ENOTSUP;
+ return 1;
+}
+
+/**
+ * Check whether a device can support vectorized RX.
+ *
+ * @param priv
+ * Pointer to private structure.
+ *
+ * @return
+ * 1 if supported, negative errno value if not.
+ */
+int __attribute__((cold))
+priv_check_vec_rx_support(struct priv *priv)
+{
+ uint16_t i;
+
+ if (!priv->rx_vec_en)
+ return -ENOTSUP;
+ /* All the configured queues should support it. */
+ for (i = 0; i < priv->rxqs_n; ++i) {
+ struct rxq *rxq = (*priv->rxqs)[i];
+
+ if (rxq_check_vec_support(rxq) < 0)
+ break;
+ }
+ if (i != priv->rxqs_n)
+ return -ENOTSUP;
+ return 1;
+}
+
+/**
+ * Prepare for vectorized RX.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+void
+priv_prep_vec_rx_function(struct priv *priv)
+{
+ uint16_t i;
+
+ for (i = 0; i < priv->rxqs_n; ++i) {
+ struct rxq *rxq = (*priv->rxqs)[i];
+ struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
+ const uint16_t desc = 1 << rxq->elts_n;
+ int j;
+
+ assert(rxq->elts_n == rxq->cqe_n);
+ /* Initialize default rearm_data for vPMD. */
+ mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
+ rte_mbuf_refcnt_set(mbuf_init, 1);
+ mbuf_init->nb_segs = 1;
+ mbuf_init->port = rxq->port_id;
+ /*
+ * Prevent compiler reordering:
+ * rearm_data covers previous fields.
+ */
+ rte_compiler_barrier();
+ rxq->mbuf_initializer =
+ *(uint64_t *)&mbuf_init->rearm_data;
+ /* Padding with a fake mbuf for vectorized Rx. */
+ for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
+ (*rxq->elts)[desc + j] = &rxq->fake_mbuf;
+ /* Mark that it needs to be cleaned up for rxq_alloc_elts(). */
+ rxq->trim_elts = 1;
+ }
+}
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 8c5aa691..595a9e06 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -72,6 +72,9 @@ mlx5_dev_start(struct rte_eth_dev *dev)
 priv_unlock(priv);
 return 0;
 }
+ /* Update Rx/Tx callback.
*/ + priv_select_tx_function(priv); + priv_select_rx_function(priv); DEBUG("%p: allocating and configuring hash RX queues", (void *)dev); err = priv_create_hash_rxqs(priv); if (!err) @@ -94,12 +97,13 @@ mlx5_dev_start(struct rte_eth_dev *dev) (void *)priv, strerror(err)); goto error; } - priv_dev_interrupt_handler_install(priv, dev); - if (dev->data->dev_conf.intr_conf.rxq) { - err = priv_intr_efd_enable(priv); - if (!err) - err = priv_create_intr_vec(priv); + err = priv_rx_intr_vec_enable(priv); + if (err) { + ERROR("%p: RX interrupt vector creation failed", + (void *)priv); + goto error; } + priv_dev_interrupt_handler_install(priv, dev); priv_xstats_init(priv); priv_unlock(priv); return 0; @@ -140,11 +144,8 @@ mlx5_dev_stop(struct rte_eth_dev *dev) priv_destroy_hash_rxqs(priv); priv_fdir_disable(priv); priv_flow_stop(priv); + priv_rx_intr_vec_disable(priv); priv_dev_interrupt_handler_uninstall(priv, dev); - if (priv->dev->data->dev_conf.intr_conf.rxq) { - priv_destroy_intr_vec(priv); - priv_intr_efd_disable(priv); - } priv->started = 0; priv_unlock(priv); } diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index bf72468d..98aaa7ca 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -64,7 +64,6 @@ #include "mlx5.h" #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" -#include "mlx5_defs.h" /** * Allocate TX queue elements. @@ -103,9 +102,10 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n) static void txq_free_elts(struct txq_ctrl *txq_ctrl) { - unsigned int elts_n = 1 << txq_ctrl->txq.elts_n; - unsigned int elts_head = txq_ctrl->txq.elts_head; - unsigned int elts_tail = txq_ctrl->txq.elts_tail; + const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n; + const uint16_t elts_m = elts_n - 1; + uint16_t elts_head = txq_ctrl->txq.elts_head; + uint16_t elts_tail = txq_ctrl->txq.elts_tail; struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts; DEBUG("%p: freeing WRs", (void *)txq_ctrl); @@ -114,18 +114,17 @@ txq_free_elts(struct txq_ctrl *txq_ctrl) txq_ctrl->txq.elts_comp = 0; while (elts_tail != elts_head) { - struct rte_mbuf *elt = (*elts)[elts_tail]; + struct rte_mbuf *elt = (*elts)[elts_tail & elts_m]; assert(elt != NULL); rte_pktmbuf_free_seg(elt); #ifndef NDEBUG /* Poisoning. */ - memset(&(*elts)[elts_tail], + memset(&(*elts)[elts_tail & elts_m], 0x77, - sizeof((*elts)[elts_tail])); + sizeof((*elts)[elts_tail & elts_m])); #endif - if (++elts_tail == elts_n) - elts_tail = 0; + ++elts_tail; } } @@ -149,9 +148,8 @@ txq_cleanup(struct txq_ctrl *txq_ctrl) if (txq_ctrl->cq != NULL) claim_zero(ibv_destroy_cq(txq_ctrl->cq)); for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) { - if (txq_ctrl->txq.mp2mr[i].mp == NULL) + if (txq_ctrl->txq.mp2mr[i].mr == NULL) break; - assert(txq_ctrl->txq.mp2mr[i].mr != NULL); claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[i].mr)); } memset(txq_ctrl, 0, sizeof(*txq_ctrl)); @@ -244,7 +242,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set"); goto error; } - (void)conf; /* Thresholds configuration (ignored). */ + tmpl.txq.flags = conf->txq_flags; assert(desc > MLX5_TX_COMP_THRESH); tmpl.txq.elts_n = log2above(desc); if (priv->mps == MLX5_MPW_ENHANCED) @@ -497,8 +495,6 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, DEBUG("%p: adding TX queue %p to list", (void *)dev, (void *)txq_ctrl); (*priv->txqs)[idx] = &txq_ctrl->txq; - /* Update send callback. 
*/ - priv_select_tx_function(priv); } priv_unlock(priv); return -ret; -- cgit 1.2.3-korg
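
The E.7/F.3 steps in rxq_burst_v() above boil down to moving a packed 16-bit lane
mask into a 64-bit integer and locating the first set lane with a
count-trailing-zeros. A minimal scalar sketch of that index computation, assuming
GCC/Clang builtins; first_set_lane and DESCS_PER_LOOP are illustrative names, not
driver symbols:

#include <stdint.h>

#define DESCS_PER_LOOP 4 /* stands in for MLX5_VPMD_DESCS_PER_LOOP */

/* Each 16-bit lane holds either 0x0000 or 0xffff. */
static inline unsigned int
first_set_lane(uint64_t lane_mask)
{
	return lane_mask ?
	       __builtin_ctzll(lane_mask) / (sizeof(uint16_t) * 8) :
	       DESCS_PER_LOOP;
}

Returning DESCS_PER_LOOP when no lane is set mirrors how comp_idx and n above
default to MLX5_VPMD_DESCS_PER_LOOP when no compressed or invalid CQE is found in
the current group of four.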
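
priv_prep_vec_rx_function() builds a 64-bit rearm_data template per Rx queue so
the vector path can rearm mbufs with a single store instead of writing data_off,
refcnt, nb_segs and port individually. A self-contained sketch of the same
construction, assuming only the public rte_mbuf layout of this release;
build_mbuf_initializer is a hypothetical helper, not driver code:

#include <stdint.h>
#include <string.h>
#include <rte_atomic.h>
#include <rte_mbuf.h>

static uint64_t
build_mbuf_initializer(uint8_t port_id)
{
	struct rte_mbuf mb;

	memset(&mb, 0, sizeof(mb));
	mb.data_off = RTE_PKTMBUF_HEADROOM;
	rte_mbuf_refcnt_set(&mb, 1);
	mb.nb_segs = 1;
	mb.port = port_id;
	/* rearm_data is a marker overlaying the fields written above. */
	rte_compiler_barrier();
	return *(uint64_t *)&mb.rearm_data;
}

The stored template can then be copied into each received mbuf's rearm_data with
one 64-bit store on the hot path.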
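
The new raw-Tx check (priv_check_raw_vec_tx_support() above) only passes when
every Tx queue was configured with ETH_TXQ_FLAGS_NOMULTSEGS and
ETH_TXQ_FLAGS_NOOFFLOADS, which txq_ctrl_setup() now records from
conf->txq_flags. An application-side usage sketch under those assumptions; the
function name, ids and descriptor count are placeholders:

#include <rte_ethdev.h>

static int
setup_vec_friendly_txq(uint8_t port_id, uint16_t queue_id,
		       uint16_t nb_desc, unsigned int socket_id)
{
	struct rte_eth_txconf txconf = {
		.txq_flags = ETH_TXQ_FLAGS_NOMULTSEGS |
			     ETH_TXQ_FLAGS_NOOFFLOADS,
	};

	/* txq_ctrl_setup() stores these flags into txq->flags. */
	return rte_eth_tx_queue_setup(port_id, queue_id, nb_desc,
				      socket_id, &txconf);
}

In practice the defaults from rte_eth_dev_info_get() can be copied first and only
txq_flags adjusted; the sketch leaves the other thresholds at zero for brevity.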