diff options
author | Luca Boccassi <luca.boccassi@gmail.com> | 2017-11-08 14:15:11 +0000 |
---|---|---|
committer | Luca Boccassi <luca.boccassi@gmail.com> | 2017-11-08 14:45:54 +0000 |
commit | 055c52583a2794da8ba1e85a48cce3832372b12f (patch) | |
tree | 8ceb1cb78fbb46a0f341f8ee24feb3c6b5540013 /drivers/net/mlx4 | |
parent | f239aed5e674965691846e8ce3f187dd47523689 (diff) |
New upstream version 17.11-rc3
Change-Id: I6a5baa40612fe0c20f30b5fa773a6cbbac63a685
Signed-off-by: Luca Boccassi <luca.boccassi@gmail.com>
Diffstat (limited to 'drivers/net/mlx4')
-rw-r--r-- | drivers/net/mlx4/Makefile | 46 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4.c | 6139 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4.h | 401 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_ethdev.c | 1047 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_flow.c | 2062 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_flow.h | 73 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_intr.c | 397 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_mr.c | 293 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_prm.h | 177 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_rxq.c | 873 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_rxtx.c | 1071 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_rxtx.h | 214 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_txq.c | 414 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_utils.c | 217 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_utils.h | 133 |
15 files changed, 6318 insertions, 7239 deletions
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile index c045bd79..f1f47c28 100644 --- a/drivers/net/mlx4/Makefile +++ b/drivers/net/mlx4/Makefile @@ -1,7 +1,7 @@ # BSD LICENSE # -# Copyright 2012-2015 6WIND S.A. -# Copyright 2012 Mellanox. +# Copyright 2012 6WIND S.A. +# Copyright 2012 Mellanox # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -36,7 +36,14 @@ LIB = librte_pmd_mlx4.a # Sources. SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_ethdev.c SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_txq.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c # Basic CFLAGS. CFLAGS += -O3 @@ -47,7 +54,10 @@ CFLAGS += -D_BSD_SOURCE CFLAGS += -D_DEFAULT_SOURCE CFLAGS += -D_XOPEN_SOURCE=600 CFLAGS += $(WERROR_FLAGS) -LDLIBS += -libverbs +LDLIBS += -libverbs -lmlx4 +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs +LDLIBS += -lrte_bus_pci # A few warnings cannot be avoided in external headers. CFLAGS += -Wno-error=cast-qual @@ -68,22 +78,10 @@ else CFLAGS += -DNDEBUG -UPEDANTIC endif -ifdef CONFIG_RTE_LIBRTE_MLX4_SGE_WR_N -CFLAGS += -DMLX4_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX4_SGE_WR_N) -endif - -ifdef CONFIG_RTE_LIBRTE_MLX4_MAX_INLINE -CFLAGS += -DMLX4_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX4_MAX_INLINE) -endif - ifdef CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE CFLAGS += -DMLX4_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE) endif -ifdef CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS -CFLAGS += -DMLX4_PMD_SOFT_COUNTERS=$(CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS) -endif - ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DEBUG_BROKEN_VERBS),y) CFLAGS += -DMLX4_PMD_DEBUG_BROKEN_VERBS endif @@ -103,23 +101,7 @@ mlx4_autoconf.h.new: FORCE mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh $Q $(RM) -f -- '$@' - $Q sh -- '$<' '$@' \ - RSS_SUPPORT \ - infiniband/verbs.h \ - enum IBV_EXP_DEVICE_UD_RSS $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - INLINE_RECV \ - infiniband/verbs.h \ - enum IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - HAVE_EXP_QUERY_DEVICE \ - infiniband/verbs.h \ - type 'struct ibv_exp_device_attr' $(AUTOCONF_OUTPUT) - $Q sh -- '$<' '$@' \ - HAVE_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK \ - infiniband/verbs.h \ - enum IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK \ - $(AUTOCONF_OUTPUT) + $Q : > '$@' # Create mlx4_autoconf.h or update it in case it differs from the new one. diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index 055de49a..f9e4f9d7 100644 --- a/drivers/net/mlx4/mlx4.c +++ b/drivers/net/mlx4/mlx4.c @@ -1,8 +1,8 @@ /*- * BSD LICENSE * - * Copyright 2012-2017 6WIND S.A. - * Copyright 2012-2017 Mellanox. + * Copyright 2012 6WIND S.A. + * Copyright 2012 Mellanox * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,95 +31,52 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* - * Known limitations: - * - RSS hash key and options cannot be modified. - * - Hardware counters aren't implemented. +/** + * @file + * mlx4 driver initialization. */ -/* System headers. */ +#include <assert.h> +#include <errno.h> +#include <inttypes.h> #include <stddef.h> +#include <stdint.h> #include <stdio.h> #include <stdlib.h> -#include <stdint.h> -#include <inttypes.h> #include <string.h> -#include <errno.h> -#include <unistd.h> -#include <limits.h> -#include <assert.h> -#include <arpa/inet.h> -#include <net/if.h> -#include <dirent.h> -#include <sys/ioctl.h> -#include <sys/socket.h> -#include <netinet/in.h> -#include <linux/ethtool.h> -#include <linux/sockios.h> -#include <fcntl.h> -#include <rte_ether.h> -#include <rte_ethdev.h> -#include <rte_ethdev_pci.h> +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> #include <rte_dev.h> -#include <rte_mbuf.h> #include <rte_errno.h> -#include <rte_mempool.h> -#include <rte_prefetch.h> -#include <rte_malloc.h> -#include <rte_spinlock.h> -#include <rte_atomic.h> -#include <rte_version.h> -#include <rte_log.h> -#include <rte_alarm.h> -#include <rte_memory.h> +#include <rte_ethdev.h> +#include <rte_ethdev_pci.h> +#include <rte_ether.h> #include <rte_flow.h> -#include <rte_kvargs.h> #include <rte_interrupts.h> +#include <rte_kvargs.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> -/* Generated configuration header. */ -#include "mlx4_autoconf.h" - -/* PMD headers. */ #include "mlx4.h" #include "mlx4_flow.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" -/* Convenience macros for accessing mbuf fields. */ -#define NEXT(m) ((m)->next) -#define DATA_LEN(m) ((m)->data_len) -#define PKT_LEN(m) ((m)->pkt_len) -#define DATA_OFF(m) ((m)->data_off) -#define SET_DATA_OFF(m, o) ((m)->data_off = (o)) -#define NB_SEGS(m) ((m)->nb_segs) -#define PORT(m) ((m)->port) - -/* Work Request ID data type (64 bit). */ -typedef union { - struct { - uint32_t id; - uint16_t offset; - } data; - uint64_t raw; -} wr_id_t; - -#define WR_ID(o) (((wr_id_t *)&(o))->data) - -/* Transpose flags. Useful to convert IBV to DPDK flags. */ -#define TRANSPOSE(val, from, to) \ - (((from) >= (to)) ? \ - (((val) & (from)) / ((from) / (to))) : \ - (((val) & (from)) * ((to) / (from)))) - -/* Local storage for secondary process data. */ -struct mlx4_secondary_data { - struct rte_eth_dev_data data; /* Local device data. */ - struct priv *primary_priv; /* Private structure from primary. */ - struct rte_eth_dev_data *shared_dev_data; /* Shared device data. */ - rte_spinlock_t lock; /* Port configuration lock. */ -} mlx4_secondary_data[RTE_MAX_ETHPORTS]; - +/** Configuration structure for device arguments. */ struct mlx4_conf { - uint8_t active_ports; + struct { + uint32_t present; /**< Bit-field for existing ports. */ + uint32_t enabled; /**< Bit-field for user-enabled ports. */ + } ports; }; /* Available parameters list. */ @@ -128,593 +85,6 @@ const char *pmd_mlx4_init_params[] = { NULL, }; -static int -mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx); - -static int -mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx); - -static int -priv_rx_intr_vec_enable(struct priv *priv); - -static void -priv_rx_intr_vec_disable(struct priv *priv); - -/** - * Check if running as a secondary process. - * - * @return - * Nonzero if running as a secondary process. - */ -static inline int -mlx4_is_secondary(void) -{ - return rte_eal_process_type() != RTE_PROC_PRIMARY; -} - -/** - * Return private structure associated with an Ethernet device. - * - * @param dev - * Pointer to Ethernet device structure. - * - * @return - * Pointer to private structure. - */ -static struct priv * -mlx4_get_priv(struct rte_eth_dev *dev) -{ - struct mlx4_secondary_data *sd; - - if (!mlx4_is_secondary()) - return dev->data->dev_private; - sd = &mlx4_secondary_data[dev->data->port_id]; - return sd->data.dev_private; -} - -/** - * Lock private structure to protect it from concurrent access in the - * control path. - * - * @param priv - * Pointer to private structure. - */ -void priv_lock(struct priv *priv) -{ - rte_spinlock_lock(&priv->lock); -} - -/** - * Unlock private structure. - * - * @param priv - * Pointer to private structure. - */ -void priv_unlock(struct priv *priv) -{ - rte_spinlock_unlock(&priv->lock); -} - -/* Allocate a buffer on the stack and fill it with a printf format string. */ -#define MKSTR(name, ...) \ - char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \ - \ - snprintf(name, sizeof(name), __VA_ARGS__) - -/** - * Get interface name from private structure. - * - * @param[in] priv - * Pointer to private structure. - * @param[out] ifname - * Interface name output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) -{ - DIR *dir; - struct dirent *dent; - unsigned int dev_type = 0; - unsigned int dev_port_prev = ~0u; - char match[IF_NAMESIZE] = ""; - - { - MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); - - dir = opendir(path); - if (dir == NULL) - return -1; - } - while ((dent = readdir(dir)) != NULL) { - char *name = dent->d_name; - FILE *file; - unsigned int dev_port; - int r; - - if ((name[0] == '.') && - ((name[1] == '\0') || - ((name[1] == '.') && (name[2] == '\0')))) - continue; - - MKSTR(path, "%s/device/net/%s/%s", - priv->ctx->device->ibdev_path, name, - (dev_type ? "dev_id" : "dev_port")); - - file = fopen(path, "rb"); - if (file == NULL) { - if (errno != ENOENT) - continue; - /* - * Switch to dev_id when dev_port does not exist as - * is the case with Linux kernel versions < 3.15. - */ -try_dev_id: - match[0] = '\0'; - if (dev_type) - break; - dev_type = 1; - dev_port_prev = ~0u; - rewinddir(dir); - continue; - } - r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); - fclose(file); - if (r != 1) - continue; - /* - * Switch to dev_id when dev_port returns the same value for - * all ports. May happen when using a MOFED release older than - * 3.0 with a Linux kernel >= 3.15. - */ - if (dev_port == dev_port_prev) - goto try_dev_id; - dev_port_prev = dev_port; - if (dev_port == (priv->port - 1u)) - snprintf(match, sizeof(match), "%s", name); - } - closedir(dir); - if (match[0] == '\0') - return -1; - strncpy(*ifname, match, sizeof(*ifname)); - return 0; -} - -/** - * Read from sysfs entry. - * - * @param[in] priv - * Pointer to private structure. - * @param[in] entry - * Entry name relative to sysfs path. - * @param[out] buf - * Data output buffer. - * @param size - * Buffer size. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_sysfs_read(const struct priv *priv, const char *entry, - char *buf, size_t size) -{ - char ifname[IF_NAMESIZE]; - FILE *file; - int ret; - int err; - - if (priv_get_ifname(priv, &ifname)) - return -1; - - MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, - ifname, entry); - - file = fopen(path, "rb"); - if (file == NULL) - return -1; - ret = fread(buf, 1, size, file); - err = errno; - if (((size_t)ret < size) && (ferror(file))) - ret = -1; - else - ret = size; - fclose(file); - errno = err; - return ret; -} - -/** - * Write to sysfs entry. - * - * @param[in] priv - * Pointer to private structure. - * @param[in] entry - * Entry name relative to sysfs path. - * @param[in] buf - * Data buffer. - * @param size - * Buffer size. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_sysfs_write(const struct priv *priv, const char *entry, - char *buf, size_t size) -{ - char ifname[IF_NAMESIZE]; - FILE *file; - int ret; - int err; - - if (priv_get_ifname(priv, &ifname)) - return -1; - - MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, - ifname, entry); - - file = fopen(path, "wb"); - if (file == NULL) - return -1; - ret = fwrite(buf, 1, size, file); - err = errno; - if (((size_t)ret < size) || (ferror(file))) - ret = -1; - else - ret = size; - fclose(file); - errno = err; - return ret; -} - -/** - * Get unsigned long sysfs property. - * - * @param priv - * Pointer to private structure. - * @param[in] name - * Entry name relative to sysfs path. - * @param[out] value - * Value output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) -{ - int ret; - unsigned long value_ret; - char value_str[32]; - - ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); - if (ret == -1) { - DEBUG("cannot read %s value from sysfs: %s", - name, strerror(errno)); - return -1; - } - value_str[ret] = '\0'; - errno = 0; - value_ret = strtoul(value_str, NULL, 0); - if (errno) { - DEBUG("invalid %s value `%s': %s", name, value_str, - strerror(errno)); - return -1; - } - *value = value_ret; - return 0; -} - -/** - * Set unsigned long sysfs property. - * - * @param priv - * Pointer to private structure. - * @param[in] name - * Entry name relative to sysfs path. - * @param value - * Value to set. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) -{ - int ret; - MKSTR(value_str, "%lu", value); - - ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); - if (ret == -1) { - DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", - name, value_str, value, strerror(errno)); - return -1; - } - return 0; -} - -/** - * Perform ifreq ioctl() on associated Ethernet device. - * - * @param[in] priv - * Pointer to private structure. - * @param req - * Request number to pass to ioctl(). - * @param[out] ifr - * Interface request structure output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr) -{ - int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); - int ret = -1; - - if (sock == -1) - return ret; - if (priv_get_ifname(priv, &ifr->ifr_name) == 0) - ret = ioctl(sock, req, ifr); - close(sock); - return ret; -} - -/** - * Get device MTU. - * - * @param priv - * Pointer to private structure. - * @param[out] mtu - * MTU value output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_get_mtu(struct priv *priv, uint16_t *mtu) -{ - unsigned long ulong_mtu; - - if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1) - return -1; - *mtu = ulong_mtu; - return 0; -} - -/** - * Set device MTU. - * - * @param priv - * Pointer to private structure. - * @param mtu - * MTU value to set. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_set_mtu(struct priv *priv, uint16_t mtu) -{ - uint16_t new_mtu; - - if (priv_set_sysfs_ulong(priv, "mtu", mtu) || - priv_get_mtu(priv, &new_mtu)) - return -1; - if (new_mtu == mtu) - return 0; - errno = EINVAL; - return -1; -} - -/** - * Set device flags. - * - * @param priv - * Pointer to private structure. - * @param keep - * Bitmask for flags that must remain untouched. - * @param flags - * Bitmask for flags to modify. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) -{ - unsigned long tmp; - - if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1) - return -1; - tmp &= keep; - tmp |= (flags & (~keep)); - return priv_set_sysfs_ulong(priv, "flags", tmp); -} - -/* Device configuration. */ - -static int -txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, - unsigned int socket, const struct rte_eth_txconf *conf); - -static void -txq_cleanup(struct txq *txq); - -static int -rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, - unsigned int socket, int inactive, - const struct rte_eth_rxconf *conf, - struct rte_mempool *mp, int children_n, - struct rxq *rxq_parent); - -static void -rxq_cleanup(struct rxq *rxq); - -/** - * Create RSS parent queue. - * - * The new parent is inserted in front of the list in the private structure. - * - * @param priv - * Pointer to private structure. - * @param queues - * Queues indices array, if NULL use all Rx queues. - * @param children_n - * The number of entries in queues[]. - * - * @return - * Pointer to a parent rxq structure, NULL on failure. - */ -struct rxq * -priv_parent_create(struct priv *priv, - uint16_t queues[], - uint16_t children_n) -{ - int ret; - uint16_t i; - struct rxq *parent; - - parent = rte_zmalloc("parent queue", - sizeof(*parent), - RTE_CACHE_LINE_SIZE); - if (!parent) { - ERROR("cannot allocate memory for RSS parent queue"); - return NULL; - } - ret = rxq_setup(priv->dev, parent, 0, 0, 0, - NULL, NULL, children_n, NULL); - if (ret) { - rte_free(parent); - return NULL; - } - parent->rss.queues_n = children_n; - if (queues) { - for (i = 0; i < children_n; ++i) - parent->rss.queues[i] = queues[i]; - } else { - /* the default RSS ring case */ - assert(priv->rxqs_n == children_n); - for (i = 0; i < priv->rxqs_n; ++i) - parent->rss.queues[i] = i; - } - LIST_INSERT_HEAD(&priv->parents, parent, next); - return parent; -} - -/** - * Clean up RX queue parent structure. - * - * @param parent - * RX queue parent structure. - */ -void -rxq_parent_cleanup(struct rxq *parent) -{ - LIST_REMOVE(parent, next); - rxq_cleanup(parent); - rte_free(parent); -} - -/** - * Clean up parent structures from the parent list. - * - * @param priv - * Pointer to private structure. - */ -static void -priv_parent_list_cleanup(struct priv *priv) -{ - while (!LIST_EMPTY(&priv->parents)) - rxq_parent_cleanup(LIST_FIRST(&priv->parents)); -} - -/** - * Ethernet device configuration. - * - * Prepare the driver for a given number of TX and RX queues. - * Allocate parent RSS queue when several RX queues are requested. - * - * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success, errno value on failure. - */ -static int -dev_configure(struct rte_eth_dev *dev) -{ - struct priv *priv = dev->data->dev_private; - unsigned int rxqs_n = dev->data->nb_rx_queues; - unsigned int txqs_n = dev->data->nb_tx_queues; - unsigned int tmp; - - priv->rxqs = (void *)dev->data->rx_queues; - priv->txqs = (void *)dev->data->tx_queues; - if (txqs_n != priv->txqs_n) { - INFO("%p: TX queues number update: %u -> %u", - (void *)dev, priv->txqs_n, txqs_n); - priv->txqs_n = txqs_n; - } - if (rxqs_n == priv->rxqs_n) - return 0; - if (!rte_is_power_of_2(rxqs_n) && !priv->isolated) { - unsigned n_active; - - n_active = rte_align32pow2(rxqs_n + 1) >> 1; - WARN("%p: number of RX queues must be a power" - " of 2: %u queues among %u will be active", - (void *)dev, n_active, rxqs_n); - } - - INFO("%p: RX queues number update: %u -> %u", - (void *)dev, priv->rxqs_n, rxqs_n); - /* If RSS is enabled, disable it first. */ - if (priv->rss) { - unsigned int i; - - /* Only if there are no remaining child RX queues. */ - for (i = 0; (i != priv->rxqs_n); ++i) - if ((*priv->rxqs)[i] != NULL) - return EINVAL; - priv_parent_list_cleanup(priv); - priv->rss = 0; - priv->rxqs_n = 0; - } - if (rxqs_n <= 1) { - /* Nothing else to do. */ - priv->rxqs_n = rxqs_n; - return 0; - } - /* Allocate a new RSS parent queue if supported by hardware. */ - if (!priv->hw_rss) { - ERROR("%p: only a single RX queue can be configured when" - " hardware doesn't support RSS", - (void *)dev); - return EINVAL; - } - /* Fail if hardware doesn't support that many RSS queues. */ - if (rxqs_n >= priv->max_rss_tbl_sz) { - ERROR("%p: only %u RX queues can be configured for RSS", - (void *)dev, priv->max_rss_tbl_sz); - return EINVAL; - } - priv->rss = 1; - tmp = priv->rxqs_n; - priv->rxqs_n = rxqs_n; - if (priv->isolated) - return 0; - if (priv_parent_create(priv, NULL, priv->rxqs_n)) - return 0; - /* Failure, rollback. */ - priv->rss = 0; - priv->rxqs_n = tmp; - return ENOMEM; -} - /** * DPDK callback for Ethernet device configuration. * @@ -722,3490 +92,78 @@ dev_configure(struct rte_eth_dev *dev) * Pointer to Ethernet device structure. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, negative errno value otherwise and rte_errno is set. */ static int mlx4_dev_configure(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; + struct rte_flow_error error; int ret; - if (mlx4_is_secondary()) - return -E_RTE_SECONDARY; - priv_lock(priv); - ret = dev_configure(dev); - assert(ret >= 0); - priv_unlock(priv); - return -ret; -} - -static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t); -static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t); - -/** - * Configure secondary process queues from a private data pointer (primary - * or secondary) and update burst callbacks. Can take place only once. - * - * All queues must have been previously created by the primary process to - * avoid undefined behavior. - * - * @param priv - * Private data pointer from either primary or secondary process. - * - * @return - * Private data pointer from secondary process, NULL in case of error. - */ -static struct priv * -mlx4_secondary_data_setup(struct priv *priv) -{ - unsigned int port_id = 0; - struct mlx4_secondary_data *sd; - void **tx_queues; - void **rx_queues; - unsigned int nb_tx_queues; - unsigned int nb_rx_queues; - unsigned int i; - - /* priv must be valid at this point. */ - assert(priv != NULL); - /* priv->dev must also be valid but may point to local memory from - * another process, possibly with the same address and must not - * be dereferenced yet. */ - assert(priv->dev != NULL); - /* Determine port ID by finding out where priv comes from. */ - while (1) { - sd = &mlx4_secondary_data[port_id]; - rte_spinlock_lock(&sd->lock); - /* Primary process? */ - if (sd->primary_priv == priv) - break; - /* Secondary process? */ - if (sd->data.dev_private == priv) - break; - rte_spinlock_unlock(&sd->lock); - if (++port_id == RTE_DIM(mlx4_secondary_data)) - port_id = 0; - } - /* Switch to secondary private structure. If private data has already - * been updated by another thread, there is nothing else to do. */ - priv = sd->data.dev_private; - if (priv->dev->data == &sd->data) - goto end; - /* Sanity checks. Secondary private structure is supposed to point - * to local eth_dev, itself still pointing to the shared device data - * structure allocated by the primary process. */ - assert(sd->shared_dev_data != &sd->data); - assert(sd->data.nb_tx_queues == 0); - assert(sd->data.tx_queues == NULL); - assert(sd->data.nb_rx_queues == 0); - assert(sd->data.rx_queues == NULL); - assert(priv != sd->primary_priv); - assert(priv->dev->data == sd->shared_dev_data); - assert(priv->txqs_n == 0); - assert(priv->txqs == NULL); - assert(priv->rxqs_n == 0); - assert(priv->rxqs == NULL); - nb_tx_queues = sd->shared_dev_data->nb_tx_queues; - nb_rx_queues = sd->shared_dev_data->nb_rx_queues; - /* Allocate local storage for queues. */ - tx_queues = rte_zmalloc("secondary ethdev->tx_queues", - sizeof(sd->data.tx_queues[0]) * nb_tx_queues, - RTE_CACHE_LINE_SIZE); - rx_queues = rte_zmalloc("secondary ethdev->rx_queues", - sizeof(sd->data.rx_queues[0]) * nb_rx_queues, - RTE_CACHE_LINE_SIZE); - if (tx_queues == NULL || rx_queues == NULL) - goto error; - /* Lock to prevent control operations during setup. */ - priv_lock(priv); - /* TX queues. */ - for (i = 0; i != nb_tx_queues; ++i) { - struct txq *primary_txq = (*sd->primary_priv->txqs)[i]; - struct txq *txq; - - if (primary_txq == NULL) - continue; - txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, - primary_txq->socket); - if (txq != NULL) { - if (txq_setup(priv->dev, - txq, - primary_txq->elts_n * MLX4_PMD_SGE_WR_N, - primary_txq->socket, - NULL) == 0) { - txq->stats.idx = primary_txq->stats.idx; - tx_queues[i] = txq; - continue; - } - rte_free(txq); - } - while (i) { - txq = tx_queues[--i]; - txq_cleanup(txq); - rte_free(txq); - } - goto error; - } - /* RX queues. */ - for (i = 0; i != nb_rx_queues; ++i) { - struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i]; - - if (primary_rxq == NULL) - continue; - /* Not supported yet. */ - rx_queues[i] = NULL; - } - /* Update everything. */ - priv->txqs = (void *)tx_queues; - priv->txqs_n = nb_tx_queues; - priv->rxqs = (void *)rx_queues; - priv->rxqs_n = nb_rx_queues; - sd->data.rx_queues = rx_queues; - sd->data.tx_queues = tx_queues; - sd->data.nb_rx_queues = nb_rx_queues; - sd->data.nb_tx_queues = nb_tx_queues; - sd->data.dev_link = sd->shared_dev_data->dev_link; - sd->data.mtu = sd->shared_dev_data->mtu; - memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state, - sizeof(sd->data.rx_queue_state)); - memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state, - sizeof(sd->data.tx_queue_state)); - sd->data.dev_flags = sd->shared_dev_data->dev_flags; - /* Use local data from now on. */ - rte_mb(); - priv->dev->data = &sd->data; - rte_mb(); - priv->dev->tx_pkt_burst = mlx4_tx_burst; - priv->dev->rx_pkt_burst = removed_rx_burst; - priv_unlock(priv); -end: - /* More sanity checks. */ - assert(priv->dev->tx_pkt_burst == mlx4_tx_burst); - assert(priv->dev->rx_pkt_burst == removed_rx_burst); - assert(priv->dev->data == &sd->data); - rte_spinlock_unlock(&sd->lock); - return priv; -error: - priv_unlock(priv); - rte_free(tx_queues); - rte_free(rx_queues); - rte_spinlock_unlock(&sd->lock); - return NULL; -} - -/* TX queues handling. */ - -/** - * Allocate TX queue elements. - * - * @param txq - * Pointer to TX queue structure. - * @param elts_n - * Number of elements to allocate. - * - * @return - * 0 on success, errno value on failure. - */ -static int -txq_alloc_elts(struct txq *txq, unsigned int elts_n) -{ - unsigned int i; - struct txq_elt (*elts)[elts_n] = - rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket); - linear_t (*elts_linear)[elts_n] = - rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0, - txq->socket); - struct ibv_mr *mr_linear = NULL; - int ret = 0; - - if ((elts == NULL) || (elts_linear == NULL)) { - ERROR("%p: can't allocate packets array", (void *)txq); - ret = ENOMEM; - goto error; - } - mr_linear = - ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear), - IBV_ACCESS_LOCAL_WRITE); - if (mr_linear == NULL) { - ERROR("%p: unable to configure MR, ibv_reg_mr() failed", - (void *)txq); - ret = EINVAL; - goto error; - } - for (i = 0; (i != elts_n); ++i) { - struct txq_elt *elt = &(*elts)[i]; - - elt->buf = NULL; - } - DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n); - txq->elts_n = elts_n; - txq->elts = elts; - txq->elts_head = 0; - txq->elts_tail = 0; - txq->elts_comp = 0; - /* Request send completion every MLX4_PMD_TX_PER_COMP_REQ packets or - * at least 4 times per ring. */ - txq->elts_comp_cd_init = - ((MLX4_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ? - MLX4_PMD_TX_PER_COMP_REQ : (elts_n / 4)); - txq->elts_comp_cd = txq->elts_comp_cd_init; - txq->elts_linear = elts_linear; - txq->mr_linear = mr_linear; - assert(ret == 0); - return 0; -error: - if (mr_linear != NULL) - claim_zero(ibv_dereg_mr(mr_linear)); - - rte_free(elts_linear); - rte_free(elts); - - DEBUG("%p: failed, freed everything", (void *)txq); - assert(ret > 0); - return ret; -} - -/** - * Free TX queue elements. - * - * @param txq - * Pointer to TX queue structure. - */ -static void -txq_free_elts(struct txq *txq) -{ - unsigned int elts_n = txq->elts_n; - unsigned int elts_head = txq->elts_head; - unsigned int elts_tail = txq->elts_tail; - struct txq_elt (*elts)[elts_n] = txq->elts; - linear_t (*elts_linear)[elts_n] = txq->elts_linear; - struct ibv_mr *mr_linear = txq->mr_linear; - - DEBUG("%p: freeing WRs", (void *)txq); - txq->elts_n = 0; - txq->elts_head = 0; - txq->elts_tail = 0; - txq->elts_comp = 0; - txq->elts_comp_cd = 0; - txq->elts_comp_cd_init = 0; - txq->elts = NULL; - txq->elts_linear = NULL; - txq->mr_linear = NULL; - if (mr_linear != NULL) - claim_zero(ibv_dereg_mr(mr_linear)); - - rte_free(elts_linear); - if (elts == NULL) - return; - while (elts_tail != elts_head) { - struct txq_elt *elt = &(*elts)[elts_tail]; - - assert(elt->buf != NULL); - rte_pktmbuf_free(elt->buf); -#ifndef NDEBUG - /* Poisoning. */ - memset(elt, 0x77, sizeof(*elt)); -#endif - if (++elts_tail == elts_n) - elts_tail = 0; - } - rte_free(elts); -} - - -/** - * Clean up a TX queue. - * - * Destroy objects, free allocated memory and reset the structure for reuse. - * - * @param txq - * Pointer to TX queue structure. - */ -static void -txq_cleanup(struct txq *txq) -{ - struct ibv_exp_release_intf_params params; - size_t i; - - DEBUG("cleaning up %p", (void *)txq); - txq_free_elts(txq); - if (txq->if_qp != NULL) { - assert(txq->priv != NULL); - assert(txq->priv->ctx != NULL); - assert(txq->qp != NULL); - params = (struct ibv_exp_release_intf_params){ - .comp_mask = 0, - }; - claim_zero(ibv_exp_release_intf(txq->priv->ctx, - txq->if_qp, - ¶ms)); - } - if (txq->if_cq != NULL) { - assert(txq->priv != NULL); - assert(txq->priv->ctx != NULL); - assert(txq->cq != NULL); - params = (struct ibv_exp_release_intf_params){ - .comp_mask = 0, - }; - claim_zero(ibv_exp_release_intf(txq->priv->ctx, - txq->if_cq, - ¶ms)); - } - if (txq->qp != NULL) - claim_zero(ibv_destroy_qp(txq->qp)); - if (txq->cq != NULL) - claim_zero(ibv_destroy_cq(txq->cq)); - if (txq->rd != NULL) { - struct ibv_exp_destroy_res_domain_attr attr = { - .comp_mask = 0, - }; - - assert(txq->priv != NULL); - assert(txq->priv->ctx != NULL); - claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx, - txq->rd, - &attr)); - } - for (i = 0; (i != elemof(txq->mp2mr)); ++i) { - if (txq->mp2mr[i].mp == NULL) - break; - assert(txq->mp2mr[i].mr != NULL); - claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr)); - } - memset(txq, 0, sizeof(*txq)); -} - -/** - * Manage TX completions. - * - * When sending a burst, mlx4_tx_burst() posts several WRs. - * To improve performance, a completion event is only required once every - * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information - * for other WRs, but this information would not be used anyway. - * - * @param txq - * Pointer to TX queue structure. - * - * @return - * 0 on success, -1 on failure. - */ -static int -txq_complete(struct txq *txq) -{ - unsigned int elts_comp = txq->elts_comp; - unsigned int elts_tail = txq->elts_tail; - const unsigned int elts_n = txq->elts_n; - int wcs_n; - - if (unlikely(elts_comp == 0)) - return 0; -#ifdef DEBUG_SEND - DEBUG("%p: processing %u work requests completions", - (void *)txq, elts_comp); -#endif - wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp); - if (unlikely(wcs_n == 0)) - return 0; - if (unlikely(wcs_n < 0)) { - DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)", - (void *)txq, wcs_n); - return -1; - } - elts_comp -= wcs_n; - assert(elts_comp <= txq->elts_comp); - /* - * Assume WC status is successful as nothing can be done about it - * anyway. - */ - elts_tail += wcs_n * txq->elts_comp_cd_init; - if (elts_tail >= elts_n) - elts_tail -= elts_n; - txq->elts_tail = elts_tail; - txq->elts_comp = elts_comp; - return 0; -} - -struct mlx4_check_mempool_data { - int ret; - char *start; - char *end; -}; - -/* Called by mlx4_check_mempool() when iterating the memory chunks. */ -static void mlx4_check_mempool_cb(struct rte_mempool *mp, - void *opaque, struct rte_mempool_memhdr *memhdr, - unsigned mem_idx) -{ - struct mlx4_check_mempool_data *data = opaque; - - (void)mp; - (void)mem_idx; - - /* It already failed, skip the next chunks. */ - if (data->ret != 0) - return; - /* It is the first chunk. */ - if (data->start == NULL && data->end == NULL) { - data->start = memhdr->addr; - data->end = data->start + memhdr->len; - return; - } - if (data->end == memhdr->addr) { - data->end += memhdr->len; - return; - } - if (data->start == (char *)memhdr->addr + memhdr->len) { - data->start -= memhdr->len; - return; - } - /* Error, mempool is not virtually contigous. */ - data->ret = -1; -} - -/** - * Check if a mempool can be used: it must be virtually contiguous. - * - * @param[in] mp - * Pointer to memory pool. - * @param[out] start - * Pointer to the start address of the mempool virtual memory area - * @param[out] end - * Pointer to the end address of the mempool virtual memory area - * - * @return - * 0 on success (mempool is virtually contiguous), -1 on error. - */ -static int mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, - uintptr_t *end) -{ - struct mlx4_check_mempool_data data; - - memset(&data, 0, sizeof(data)); - rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &data); - *start = (uintptr_t)data.start; - *end = (uintptr_t)data.end; - - return data.ret; -} - -/* For best performance, this function should not be inlined. */ -static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *) - __rte_noinline; - -/** - * Register mempool as a memory region. - * - * @param pd - * Pointer to protection domain. - * @param mp - * Pointer to memory pool. - * - * @return - * Memory region pointer, NULL in case of error. - */ -static struct ibv_mr * -mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp) -{ - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - uintptr_t start; - uintptr_t end; - unsigned int i; - - if (mlx4_check_mempool(mp, &start, &end) != 0) { - ERROR("mempool %p: not virtually contiguous", - (void *)mp); - return NULL; - } - - DEBUG("mempool %p area start=%p end=%p size=%zu", - (void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - /* Round start and end to page boundary if found in memory segments. */ - for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) { - uintptr_t addr = (uintptr_t)ms[i].addr; - size_t len = ms[i].len; - unsigned int align = ms[i].hugepage_sz; - - if ((start > addr) && (start < addr + len)) - start = RTE_ALIGN_FLOOR(start, align); - if ((end > addr) && (end < addr + len)) - end = RTE_ALIGN_CEIL(end, align); - } - DEBUG("mempool %p using start=%p end=%p size=%zu for MR", - (void *)mp, (void *)start, (void *)end, - (size_t)(end - start)); - return ibv_reg_mr(pd, - (void *)start, - end - start, - IBV_ACCESS_LOCAL_WRITE); -} - -/** - * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which - * the cloned mbuf is allocated is returned instead. - * - * @param buf - * Pointer to mbuf. - * - * @return - * Memory pool where data is located for given mbuf. - */ -static struct rte_mempool * -txq_mb2mp(struct rte_mbuf *buf) -{ - if (unlikely(RTE_MBUF_INDIRECT(buf))) - return rte_mbuf_from_indirect(buf)->pool; - return buf->pool; -} - -/** - * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, - * remove an entry first. - * - * @param txq - * Pointer to TX queue structure. - * @param[in] mp - * Memory Pool for which a Memory Region lkey must be returned. - * - * @return - * mr->lkey on success, (uint32_t)-1 on failure. - */ -static uint32_t -txq_mp2mr(struct txq *txq, struct rte_mempool *mp) -{ - unsigned int i; - struct ibv_mr *mr; - - for (i = 0; (i != elemof(txq->mp2mr)); ++i) { - if (unlikely(txq->mp2mr[i].mp == NULL)) { - /* Unknown MP, add a new MR for it. */ - break; - } - if (txq->mp2mr[i].mp == mp) { - assert(txq->mp2mr[i].lkey != (uint32_t)-1); - assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey); - return txq->mp2mr[i].lkey; - } - } - /* Add a new entry, register MR first. */ - DEBUG("%p: discovered new memory pool \"%s\" (%p)", - (void *)txq, mp->name, (void *)mp); - mr = mlx4_mp2mr(txq->priv->pd, mp); - if (unlikely(mr == NULL)) { - DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.", - (void *)txq); - return (uint32_t)-1; - } - if (unlikely(i == elemof(txq->mp2mr))) { - /* Table is full, remove oldest entry. */ - DEBUG("%p: MR <-> MP table full, dropping oldest entry.", - (void *)txq); - --i; - claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr)); - memmove(&txq->mp2mr[0], &txq->mp2mr[1], - (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0]))); - } - /* Store the new entry. */ - txq->mp2mr[i].mp = mp; - txq->mp2mr[i].mr = mr; - txq->mp2mr[i].lkey = mr->lkey; - DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, - (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey); - return txq->mp2mr[i].lkey; -} - -struct txq_mp2mr_mbuf_check_data { - int ret; -}; - -/** - * Callback function for rte_mempool_obj_iter() to check whether a given - * mempool object looks like a mbuf. - * - * @param[in] mp - * The mempool pointer - * @param[in] arg - * Context data (struct txq_mp2mr_mbuf_check_data). Contains the - * return value. - * @param[in] obj - * Object address. - * @param index - * Object index, unused. - */ -static void -txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj, - uint32_t index __rte_unused) -{ - struct txq_mp2mr_mbuf_check_data *data = arg; - struct rte_mbuf *buf = obj; - - /* Check whether mbuf structure fits element size and whether mempool - * pointer is valid. */ - if (sizeof(*buf) > mp->elt_size || buf->pool != mp) - data->ret = -1; -} - -/** - * Iterator function for rte_mempool_walk() to register existing mempools and - * fill the MP to MR cache of a TX queue. - * - * @param[in] mp - * Memory Pool to register. - * @param *arg - * Pointer to TX queue structure. - */ -static void -txq_mp2mr_iter(struct rte_mempool *mp, void *arg) -{ - struct txq *txq = arg; - struct txq_mp2mr_mbuf_check_data data = { - .ret = 0, - }; - - /* Register mempool only if the first element looks like a mbuf. */ - if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 || - data.ret == -1) - return; - txq_mp2mr(txq, mp); -} - -#if MLX4_PMD_SGE_WR_N > 1 - -/** - * Copy scattered mbuf contents to a single linear buffer. - * - * @param[out] linear - * Linear output buffer. - * @param[in] buf - * Scattered input buffer. - * - * @return - * Number of bytes copied to the output buffer or 0 if not large enough. - */ -static unsigned int -linearize_mbuf(linear_t *linear, struct rte_mbuf *buf) -{ - unsigned int size = 0; - unsigned int offset; - - do { - unsigned int len = DATA_LEN(buf); - - offset = size; - size += len; - if (unlikely(size > sizeof(*linear))) - return 0; - memcpy(&(*linear)[offset], - rte_pktmbuf_mtod(buf, uint8_t *), - len); - buf = NEXT(buf); - } while (buf != NULL); - return size; -} - -/** - * Handle scattered buffers for mlx4_tx_burst(). - * - * @param txq - * TX queue structure. - * @param segs - * Number of segments in buf. - * @param elt - * TX queue element to fill. - * @param[in] buf - * Buffer to process. - * @param elts_head - * Index of the linear buffer to use if necessary (normally txq->elts_head). - * @param[out] sges - * Array filled with SGEs on success. - * - * @return - * A structure containing the processed packet size in bytes and the - * number of SGEs. Both fields are set to (unsigned int)-1 in case of - * failure. - */ -static struct tx_burst_sg_ret { - unsigned int length; - unsigned int num; -} -tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt, - struct rte_mbuf *buf, unsigned int elts_head, - struct ibv_sge (*sges)[MLX4_PMD_SGE_WR_N]) -{ - unsigned int sent_size = 0; - unsigned int j; - int linearize = 0; - - /* When there are too many segments, extra segments are - * linearized in the last SGE. */ - if (unlikely(segs > elemof(*sges))) { - segs = (elemof(*sges) - 1); - linearize = 1; - } - /* Update element. */ - elt->buf = buf; - /* Register segments as SGEs. */ - for (j = 0; (j != segs); ++j) { - struct ibv_sge *sge = &(*sges)[j]; - uint32_t lkey; - - /* Retrieve Memory Region key for this memory pool. */ - lkey = txq_mp2mr(txq, txq_mb2mp(buf)); - if (unlikely(lkey == (uint32_t)-1)) { - /* MR does not exist. */ - DEBUG("%p: unable to get MP <-> MR association", - (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; - } - /* Update SGE. */ - sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); - if (txq->priv->vf) - rte_prefetch0((volatile void *) - (uintptr_t)sge->addr); - sge->length = DATA_LEN(buf); - sge->lkey = lkey; - sent_size += sge->length; - buf = NEXT(buf); - } - /* If buf is not NULL here and is not going to be linearized, - * nb_segs is not valid. */ - assert(j == segs); - assert((buf == NULL) || (linearize)); - /* Linearize extra segments. */ - if (linearize) { - struct ibv_sge *sge = &(*sges)[segs]; - linear_t *linear = &(*txq->elts_linear)[elts_head]; - unsigned int size = linearize_mbuf(linear, buf); - - assert(segs == (elemof(*sges) - 1)); - if (size == 0) { - /* Invalid packet. */ - DEBUG("%p: packet too large to be linearized.", - (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; - } - /* If MLX4_PMD_SGE_WR_N is 1, free mbuf immediately. */ - if (elemof(*sges) == 1) { - do { - struct rte_mbuf *next = NEXT(buf); - - rte_pktmbuf_free_seg(buf); - buf = next; - } while (buf != NULL); - elt->buf = NULL; - } - /* Update SGE. */ - sge->addr = (uintptr_t)&(*linear)[0]; - sge->length = size; - sge->lkey = txq->mr_linear->lkey; - sent_size += size; - /* Include last segment. */ - segs++; - } - return (struct tx_burst_sg_ret){ - .length = sent_size, - .num = segs, - }; -stop: - return (struct tx_burst_sg_ret){ - .length = -1, - .num = -1, - }; -} - -#endif /* MLX4_PMD_SGE_WR_N > 1 */ - -/** - * DPDK callback for TX. - * - * @param dpdk_txq - * Generic pointer to TX queue structure. - * @param[in] pkts - * Packets to transmit. - * @param pkts_n - * Number of packets in array. - * - * @return - * Number of packets successfully transmitted (<= pkts_n). - */ -static uint16_t -mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) -{ - struct txq *txq = (struct txq *)dpdk_txq; - unsigned int elts_head = txq->elts_head; - const unsigned int elts_n = txq->elts_n; - unsigned int elts_comp_cd = txq->elts_comp_cd; - unsigned int elts_comp = 0; - unsigned int i; - unsigned int max; - int err; - - assert(elts_comp_cd != 0); - txq_complete(txq); - max = (elts_n - (elts_head - txq->elts_tail)); - if (max > elts_n) - max -= elts_n; - assert(max >= 1); - assert(max <= elts_n); - /* Always leave one free entry in the ring. */ - --max; - if (max == 0) - return 0; - if (max > pkts_n) - max = pkts_n; - for (i = 0; (i != max); ++i) { - struct rte_mbuf *buf = pkts[i]; - unsigned int elts_head_next = - (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); - struct txq_elt *elt_next = &(*txq->elts)[elts_head_next]; - struct txq_elt *elt = &(*txq->elts)[elts_head]; - unsigned int segs = NB_SEGS(buf); -#ifdef MLX4_PMD_SOFT_COUNTERS - unsigned int sent_size = 0; -#endif - uint32_t send_flags = 0; - - /* Clean up old buffer. */ - if (likely(elt->buf != NULL)) { - struct rte_mbuf *tmp = elt->buf; - -#ifndef NDEBUG - /* Poisoning. */ - memset(elt, 0x66, sizeof(*elt)); -#endif - /* Faster than rte_pktmbuf_free(). */ - do { - struct rte_mbuf *next = NEXT(tmp); - - rte_pktmbuf_free_seg(tmp); - tmp = next; - } while (tmp != NULL); - } - /* Request TX completion. */ - if (unlikely(--elts_comp_cd == 0)) { - elts_comp_cd = txq->elts_comp_cd_init; - ++elts_comp; - send_flags |= IBV_EXP_QP_BURST_SIGNALED; - } - /* Should we enable HW CKSUM offload */ - if (buf->ol_flags & - (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { - send_flags |= IBV_EXP_QP_BURST_IP_CSUM; - /* HW does not support checksum offloads at arbitrary - * offsets but automatically recognizes the packet - * type. For inner L3/L4 checksums, only VXLAN (UDP) - * tunnels are currently supported. */ - if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type)) - send_flags |= IBV_EXP_QP_BURST_TUNNEL; - } - if (likely(segs == 1)) { - uintptr_t addr; - uint32_t length; - uint32_t lkey; - - /* Retrieve buffer information. */ - addr = rte_pktmbuf_mtod(buf, uintptr_t); - length = DATA_LEN(buf); - /* Retrieve Memory Region key for this memory pool. */ - lkey = txq_mp2mr(txq, txq_mb2mp(buf)); - if (unlikely(lkey == (uint32_t)-1)) { - /* MR does not exist. */ - DEBUG("%p: unable to get MP <-> MR" - " association", (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; - } - /* Update element. */ - elt->buf = buf; - if (txq->priv->vf) - rte_prefetch0((volatile void *) - (uintptr_t)addr); - RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); - /* Put packet into send queue. */ -#if MLX4_PMD_MAX_INLINE > 0 - if (length <= txq->max_inline) - err = txq->if_qp->send_pending_inline - (txq->qp, - (void *)addr, - length, - send_flags); - else -#endif - err = txq->if_qp->send_pending - (txq->qp, - addr, - length, - lkey, - send_flags); - if (unlikely(err)) - goto stop; -#ifdef MLX4_PMD_SOFT_COUNTERS - sent_size += length; -#endif - } else { -#if MLX4_PMD_SGE_WR_N > 1 - struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; - struct tx_burst_sg_ret ret; - - ret = tx_burst_sg(txq, segs, elt, buf, elts_head, - &sges); - if (ret.length == (unsigned int)-1) - goto stop; - RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); - /* Put SG list into send queue. */ - err = txq->if_qp->send_pending_sg_list - (txq->qp, - sges, - ret.num, - send_flags); - if (unlikely(err)) - goto stop; -#ifdef MLX4_PMD_SOFT_COUNTERS - sent_size += ret.length; -#endif -#else /* MLX4_PMD_SGE_WR_N > 1 */ - DEBUG("%p: TX scattered buffers support not" - " compiled in", (void *)txq); - goto stop; -#endif /* MLX4_PMD_SGE_WR_N > 1 */ - } - elts_head = elts_head_next; -#ifdef MLX4_PMD_SOFT_COUNTERS - /* Increment sent bytes counter. */ - txq->stats.obytes += sent_size; -#endif - } -stop: - /* Take a shortcut if nothing must be sent. */ - if (unlikely(i == 0)) - return 0; -#ifdef MLX4_PMD_SOFT_COUNTERS - /* Increment sent packets counter. */ - txq->stats.opackets += i; -#endif - /* Ring QP doorbell. */ - err = txq->if_qp->send_flush(txq->qp); - if (unlikely(err)) { - /* A nonzero value is not supposed to be returned. - * Nothing can be done about it. */ - DEBUG("%p: send_flush() failed with error %d", - (void *)txq, err); - } - txq->elts_head = elts_head; - txq->elts_comp += elts_comp; - txq->elts_comp_cd = elts_comp_cd; - return i; -} - -/** - * DPDK callback for TX in secondary processes. - * - * This function configures all queues from primary process information - * if necessary before reverting to the normal TX burst callback. - * - * @param dpdk_txq - * Generic pointer to TX queue structure. - * @param[in] pkts - * Packets to transmit. - * @param pkts_n - * Number of packets in array. - * - * @return - * Number of packets successfully transmitted (<= pkts_n). - */ -static uint16_t -mlx4_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts, - uint16_t pkts_n) -{ - struct txq *txq = dpdk_txq; - struct priv *priv = mlx4_secondary_data_setup(txq->priv); - struct priv *primary_priv; - unsigned int index; - - if (priv == NULL) - return 0; - primary_priv = - mlx4_secondary_data[priv->dev->data->port_id].primary_priv; - /* Look for queue index in both private structures. */ - for (index = 0; index != priv->txqs_n; ++index) - if (((*primary_priv->txqs)[index] == txq) || - ((*priv->txqs)[index] == txq)) - break; - if (index == priv->txqs_n) - return 0; - txq = (*priv->txqs)[index]; - return priv->dev->tx_pkt_burst(txq, pkts, pkts_n); -} - -/** - * Configure a TX queue. - * - * @param dev - * Pointer to Ethernet device structure. - * @param txq - * Pointer to TX queue structure. - * @param desc - * Number of descriptors to configure in queue. - * @param socket - * NUMA socket on which memory must be allocated. - * @param[in] conf - * Thresholds parameters. - * - * @return - * 0 on success, errno value on failure. - */ -static int -txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, - unsigned int socket, const struct rte_eth_txconf *conf) -{ - struct priv *priv = mlx4_get_priv(dev); - struct txq tmpl = { - .priv = priv, - .socket = socket - }; - union { - struct ibv_exp_query_intf_params params; - struct ibv_exp_qp_init_attr init; - struct ibv_exp_res_domain_init_attr rd; - struct ibv_exp_cq_init_attr cq; - struct ibv_exp_qp_attr mod; - } attr; - enum ibv_exp_query_intf_status status; - int ret = 0; - - (void)conf; /* Thresholds configuration (ignored). */ - if (priv == NULL) - return EINVAL; - if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) { - ERROR("%p: invalid number of TX descriptors (must be a" - " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N); - return EINVAL; - } - desc /= MLX4_PMD_SGE_WR_N; - /* MRs will be registered in mp2mr[] later. */ - attr.rd = (struct ibv_exp_res_domain_init_attr){ - .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | - IBV_EXP_RES_DOMAIN_MSG_MODEL), - .thread_model = IBV_EXP_THREAD_SINGLE, - .msg_model = IBV_EXP_MSG_HIGH_BW, - }; - tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd); - if (tmpl.rd == NULL) { - ret = ENOMEM; - ERROR("%p: RD creation failure: %s", - (void *)dev, strerror(ret)); - goto error; - } - attr.cq = (struct ibv_exp_cq_init_attr){ - .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, - .res_domain = tmpl.rd, - }; - tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq); - if (tmpl.cq == NULL) { - ret = ENOMEM; - ERROR("%p: CQ creation failure: %s", - (void *)dev, strerror(ret)); - goto error; - } - DEBUG("priv->device_attr.max_qp_wr is %d", - priv->device_attr.max_qp_wr); - DEBUG("priv->device_attr.max_sge is %d", - priv->device_attr.max_sge); - attr.init = (struct ibv_exp_qp_init_attr){ - /* CQ to be associated with the send queue. */ - .send_cq = tmpl.cq, - /* CQ to be associated with the receive queue. */ - .recv_cq = tmpl.cq, - .cap = { - /* Max number of outstanding WRs. */ - .max_send_wr = ((priv->device_attr.max_qp_wr < desc) ? - priv->device_attr.max_qp_wr : - desc), - /* Max number of scatter/gather elements in a WR. */ - .max_send_sge = ((priv->device_attr.max_sge < - MLX4_PMD_SGE_WR_N) ? - priv->device_attr.max_sge : - MLX4_PMD_SGE_WR_N), -#if MLX4_PMD_MAX_INLINE > 0 - .max_inline_data = MLX4_PMD_MAX_INLINE, -#endif - }, - .qp_type = IBV_QPT_RAW_PACKET, - /* Do *NOT* enable this, completions events are managed per - * TX burst. */ - .sq_sig_all = 0, - .pd = priv->pd, - .res_domain = tmpl.rd, - .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | - IBV_EXP_QP_INIT_ATTR_RES_DOMAIN), - }; - tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init); - if (tmpl.qp == NULL) { - ret = (errno ? errno : EINVAL); - ERROR("%p: QP creation failure: %s", - (void *)dev, strerror(ret)); - goto error; - } -#if MLX4_PMD_MAX_INLINE > 0 - /* ibv_create_qp() updates this value. */ - tmpl.max_inline = attr.init.cap.max_inline_data; -#endif - attr.mod = (struct ibv_exp_qp_attr){ - /* Move the QP to this state. */ - .qp_state = IBV_QPS_INIT, - /* Primary port number. */ - .port_num = priv->port - }; - ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, - (IBV_EXP_QP_STATE | IBV_EXP_QP_PORT)); - if (ret) { - ERROR("%p: QP state to IBV_QPS_INIT failed: %s", - (void *)dev, strerror(ret)); - goto error; - } - ret = txq_alloc_elts(&tmpl, desc); - if (ret) { - ERROR("%p: TXQ allocation failed: %s", - (void *)dev, strerror(ret)); - goto error; - } - attr.mod = (struct ibv_exp_qp_attr){ - .qp_state = IBV_QPS_RTR - }; - ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE); - if (ret) { - ERROR("%p: QP state to IBV_QPS_RTR failed: %s", - (void *)dev, strerror(ret)); - goto error; - } - attr.mod.qp_state = IBV_QPS_RTS; - ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE); - if (ret) { - ERROR("%p: QP state to IBV_QPS_RTS failed: %s", - (void *)dev, strerror(ret)); - goto error; - } - attr.params = (struct ibv_exp_query_intf_params){ - .intf_scope = IBV_EXP_INTF_GLOBAL, - .intf = IBV_EXP_INTF_CQ, - .obj = tmpl.cq, - }; - tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status); - if (tmpl.if_cq == NULL) { - ERROR("%p: CQ interface family query failed with status %d", - (void *)dev, status); - goto error; - } - attr.params = (struct ibv_exp_query_intf_params){ - .intf_scope = IBV_EXP_INTF_GLOBAL, - .intf = IBV_EXP_INTF_QP_BURST, - .obj = tmpl.qp, -#ifdef HAVE_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK - /* MC loopback must be disabled when not using a VF. */ - .family_flags = - (!priv->vf ? - IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK : - 0), -#endif - }; - tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status); - if (tmpl.if_qp == NULL) { - ERROR("%p: QP interface family query failed with status %d", - (void *)dev, status); - goto error; - } - /* Clean up txq in case we're reinitializing it. */ - DEBUG("%p: cleaning-up old txq just in case", (void *)txq); - txq_cleanup(txq); - *txq = tmpl; - DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl); - /* Pre-register known mempools. */ - rte_mempool_walk(txq_mp2mr_iter, txq); - assert(ret == 0); - return 0; -error: - txq_cleanup(&tmpl); - assert(ret > 0); - return ret; -} - -/** - * DPDK callback to configure a TX queue. - * - * @param dev - * Pointer to Ethernet device structure. - * @param idx - * TX queue index. - * @param desc - * Number of descriptors to configure in queue. - * @param socket - * NUMA socket on which memory must be allocated. - * @param[in] conf - * Thresholds parameters. - * - * @return - * 0 on success, negative errno value on failure. - */ -static int -mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, - unsigned int socket, const struct rte_eth_txconf *conf) -{ - struct priv *priv = dev->data->dev_private; - struct txq *txq = (*priv->txqs)[idx]; - int ret; - - if (mlx4_is_secondary()) - return -E_RTE_SECONDARY; - priv_lock(priv); - DEBUG("%p: configuring queue %u for %u descriptors", - (void *)dev, idx, desc); - if (idx >= priv->txqs_n) { - ERROR("%p: queue index out of range (%u >= %u)", - (void *)dev, idx, priv->txqs_n); - priv_unlock(priv); - return -EOVERFLOW; - } - if (txq != NULL) { - DEBUG("%p: reusing already allocated queue index %u (%p)", - (void *)dev, idx, (void *)txq); - if (priv->started) { - priv_unlock(priv); - return -EEXIST; - } - (*priv->txqs)[idx] = NULL; - txq_cleanup(txq); - } else { - txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket); - if (txq == NULL) { - ERROR("%p: unable to allocate queue index %u", - (void *)dev, idx); - priv_unlock(priv); - return -ENOMEM; - } - } - ret = txq_setup(dev, txq, desc, socket, conf); - if (ret) - rte_free(txq); - else { - txq->stats.idx = idx; - DEBUG("%p: adding TX queue %p to list", - (void *)dev, (void *)txq); - (*priv->txqs)[idx] = txq; - /* Update send callback. */ - dev->tx_pkt_burst = mlx4_tx_burst; - } - priv_unlock(priv); - return -ret; -} - -/** - * DPDK callback to release a TX queue. - * - * @param dpdk_txq - * Generic TX queue pointer. - */ -static void -mlx4_tx_queue_release(void *dpdk_txq) -{ - struct txq *txq = (struct txq *)dpdk_txq; - struct priv *priv; - unsigned int i; - - if (mlx4_is_secondary()) - return; - if (txq == NULL) - return; - priv = txq->priv; - priv_lock(priv); - for (i = 0; (i != priv->txqs_n); ++i) - if ((*priv->txqs)[i] == txq) { - DEBUG("%p: removing TX queue %p from list", - (void *)priv->dev, (void *)txq); - (*priv->txqs)[i] = NULL; - break; - } - txq_cleanup(txq); - rte_free(txq); - priv_unlock(priv); -} - -/* RX queues handling. */ - -/** - * Allocate RX queue elements with scattered packets support. - * - * @param rxq - * Pointer to RX queue structure. - * @param elts_n - * Number of elements to allocate. - * @param[in] pool - * If not NULL, fetch buffers from this array instead of allocating them - * with rte_pktmbuf_alloc(). - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n, - struct rte_mbuf **pool) -{ - unsigned int i; - struct rxq_elt_sp (*elts)[elts_n] = - rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, - rxq->socket); - int ret = 0; - - if (elts == NULL) { - ERROR("%p: can't allocate packets array", (void *)rxq); - ret = ENOMEM; - goto error; - } - /* For each WR (packet). */ - for (i = 0; (i != elts_n); ++i) { - unsigned int j; - struct rxq_elt_sp *elt = &(*elts)[i]; - struct ibv_recv_wr *wr = &elt->wr; - struct ibv_sge (*sges)[(elemof(elt->sges))] = &elt->sges; - - /* These two arrays must have the same size. */ - assert(elemof(elt->sges) == elemof(elt->bufs)); - /* Configure WR. */ - wr->wr_id = i; - wr->next = &(*elts)[(i + 1)].wr; - wr->sg_list = &(*sges)[0]; - wr->num_sge = elemof(*sges); - /* For each SGE (segment). */ - for (j = 0; (j != elemof(elt->bufs)); ++j) { - struct ibv_sge *sge = &(*sges)[j]; - struct rte_mbuf *buf; - - if (pool != NULL) { - buf = *(pool++); - assert(buf != NULL); - rte_pktmbuf_reset(buf); - } else - buf = rte_pktmbuf_alloc(rxq->mp); - if (buf == NULL) { - assert(pool == NULL); - ERROR("%p: empty mbuf pool", (void *)rxq); - ret = ENOMEM; - goto error; - } - elt->bufs[j] = buf; - /* Headroom is reserved by rte_pktmbuf_alloc(). */ - assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); - /* Buffer is supposed to be empty. */ - assert(rte_pktmbuf_data_len(buf) == 0); - assert(rte_pktmbuf_pkt_len(buf) == 0); - /* sge->addr must be able to store a pointer. */ - assert(sizeof(sge->addr) >= sizeof(uintptr_t)); - if (j == 0) { - /* The first SGE keeps its headroom. */ - sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); - sge->length = (buf->buf_len - - RTE_PKTMBUF_HEADROOM); - } else { - /* Subsequent SGEs lose theirs. */ - assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); - SET_DATA_OFF(buf, 0); - sge->addr = (uintptr_t)buf->buf_addr; - sge->length = buf->buf_len; - } - sge->lkey = rxq->mr->lkey; - /* Redundant check for tailroom. */ - assert(sge->length == rte_pktmbuf_tailroom(buf)); - } - } - /* The last WR pointer must be NULL. */ - (*elts)[(i - 1)].wr.next = NULL; - DEBUG("%p: allocated and configured %u WRs (%zu segments)", - (void *)rxq, elts_n, (elts_n * elemof((*elts)[0].sges))); - rxq->elts_n = elts_n; - rxq->elts_head = 0; - rxq->elts.sp = elts; - assert(ret == 0); - return 0; -error: - if (elts != NULL) { - assert(pool == NULL); - for (i = 0; (i != elemof(*elts)); ++i) { - unsigned int j; - struct rxq_elt_sp *elt = &(*elts)[i]; - - for (j = 0; (j != elemof(elt->bufs)); ++j) { - struct rte_mbuf *buf = elt->bufs[j]; - - if (buf != NULL) - rte_pktmbuf_free_seg(buf); - } - } - rte_free(elts); - } - DEBUG("%p: failed, freed everything", (void *)rxq); - assert(ret > 0); - return ret; -} - -/** - * Free RX queue elements with scattered packets support. - * - * @param rxq - * Pointer to RX queue structure. - */ -static void -rxq_free_elts_sp(struct rxq *rxq) -{ - unsigned int i; - unsigned int elts_n = rxq->elts_n; - struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp; - - DEBUG("%p: freeing WRs", (void *)rxq); - rxq->elts_n = 0; - rxq->elts.sp = NULL; - if (elts == NULL) - return; - for (i = 0; (i != elemof(*elts)); ++i) { - unsigned int j; - struct rxq_elt_sp *elt = &(*elts)[i]; - - for (j = 0; (j != elemof(elt->bufs)); ++j) { - struct rte_mbuf *buf = elt->bufs[j]; - - if (buf != NULL) - rte_pktmbuf_free_seg(buf); - } - } - rte_free(elts); -} - -/** - * Allocate RX queue elements. - * - * @param rxq - * Pointer to RX queue structure. - * @param elts_n - * Number of elements to allocate. - * @param[in] pool - * If not NULL, fetch buffers from this array instead of allocating them - * with rte_pktmbuf_alloc(). - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool) -{ - unsigned int i; - struct rxq_elt (*elts)[elts_n] = - rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, - rxq->socket); - int ret = 0; - - if (elts == NULL) { - ERROR("%p: can't allocate packets array", (void *)rxq); - ret = ENOMEM; - goto error; - } - /* For each WR (packet). */ - for (i = 0; (i != elts_n); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct ibv_recv_wr *wr = &elt->wr; - struct ibv_sge *sge = &(*elts)[i].sge; - struct rte_mbuf *buf; - - if (pool != NULL) { - buf = *(pool++); - assert(buf != NULL); - rte_pktmbuf_reset(buf); - } else - buf = rte_pktmbuf_alloc(rxq->mp); - if (buf == NULL) { - assert(pool == NULL); - ERROR("%p: empty mbuf pool", (void *)rxq); - ret = ENOMEM; - goto error; - } - /* Configure WR. Work request ID contains its own index in - * the elts array and the offset between SGE buffer header and - * its data. */ - WR_ID(wr->wr_id).id = i; - WR_ID(wr->wr_id).offset = - (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) - - (uintptr_t)buf); - wr->next = &(*elts)[(i + 1)].wr; - wr->sg_list = sge; - wr->num_sge = 1; - /* Headroom is reserved by rte_pktmbuf_alloc(). */ - assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); - /* Buffer is supposed to be empty. */ - assert(rte_pktmbuf_data_len(buf) == 0); - assert(rte_pktmbuf_pkt_len(buf) == 0); - /* sge->addr must be able to store a pointer. */ - assert(sizeof(sge->addr) >= sizeof(uintptr_t)); - /* SGE keeps its headroom. */ - sge->addr = (uintptr_t) - ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM); - sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM); - sge->lkey = rxq->mr->lkey; - /* Redundant check for tailroom. */ - assert(sge->length == rte_pktmbuf_tailroom(buf)); - /* Make sure elts index and SGE mbuf pointer can be deduced - * from WR ID. */ - if ((WR_ID(wr->wr_id).id != i) || - ((void *)((uintptr_t)sge->addr - - WR_ID(wr->wr_id).offset) != buf)) { - ERROR("%p: cannot store index and offset in WR ID", - (void *)rxq); - sge->addr = 0; - rte_pktmbuf_free(buf); - ret = EOVERFLOW; - goto error; - } - } - /* The last WR pointer must be NULL. */ - (*elts)[(i - 1)].wr.next = NULL; - DEBUG("%p: allocated and configured %u single-segment WRs", - (void *)rxq, elts_n); - rxq->elts_n = elts_n; - rxq->elts_head = 0; - rxq->elts.no_sp = elts; - assert(ret == 0); - return 0; -error: - if (elts != NULL) { - assert(pool == NULL); - for (i = 0; (i != elemof(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf; - - if (elt->sge.addr == 0) - continue; - assert(WR_ID(elt->wr.wr_id).id == i); - buf = (void *)((uintptr_t)elt->sge.addr - - WR_ID(elt->wr.wr_id).offset); - rte_pktmbuf_free_seg(buf); - } - rte_free(elts); - } - DEBUG("%p: failed, freed everything", (void *)rxq); - assert(ret > 0); - return ret; -} - -/** - * Free RX queue elements. - * - * @param rxq - * Pointer to RX queue structure. - */ -static void -rxq_free_elts(struct rxq *rxq) -{ - unsigned int i; - unsigned int elts_n = rxq->elts_n; - struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp; - - DEBUG("%p: freeing WRs", (void *)rxq); - rxq->elts_n = 0; - rxq->elts.no_sp = NULL; - if (elts == NULL) - return; - for (i = 0; (i != elemof(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf; - - if (elt->sge.addr == 0) - continue; - assert(WR_ID(elt->wr.wr_id).id == i); - buf = (void *)((uintptr_t)elt->sge.addr - - WR_ID(elt->wr.wr_id).offset); - rte_pktmbuf_free_seg(buf); - } - rte_free(elts); -} - -/** - * Delete flow steering rule. - * - * @param rxq - * Pointer to RX queue structure. - * @param mac_index - * MAC address index. - * @param vlan_index - * VLAN index. - */ -static void -rxq_del_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index) -{ -#ifndef NDEBUG - struct priv *priv = rxq->priv; - const uint8_t (*mac)[ETHER_ADDR_LEN] = - (const uint8_t (*)[ETHER_ADDR_LEN]) - priv->mac[mac_index].addr_bytes; -#endif - assert(rxq->mac_flow[mac_index][vlan_index] != NULL); - DEBUG("%p: removing MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u" - " (VLAN ID %" PRIu16 ")", - (void *)rxq, - (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5], - mac_index, priv->vlan_filter[vlan_index].id); - claim_zero(ibv_destroy_flow(rxq->mac_flow[mac_index][vlan_index])); - rxq->mac_flow[mac_index][vlan_index] = NULL; -} - -/** - * Unregister a MAC address from a RX queue. - * - * @param rxq - * Pointer to RX queue structure. - * @param mac_index - * MAC address index. - */ -static void -rxq_mac_addr_del(struct rxq *rxq, unsigned int mac_index) -{ - struct priv *priv = rxq->priv; - unsigned int i; - unsigned int vlans = 0; - - assert(mac_index < elemof(priv->mac)); - if (!BITFIELD_ISSET(rxq->mac_configured, mac_index)) - return; - for (i = 0; (i != elemof(priv->vlan_filter)); ++i) { - if (!priv->vlan_filter[i].enabled) - continue; - rxq_del_flow(rxq, mac_index, i); - vlans++; - } - if (!vlans) { - rxq_del_flow(rxq, mac_index, 0); - } - BITFIELD_RESET(rxq->mac_configured, mac_index); -} - -/** - * Unregister all MAC addresses from a RX queue. - * - * @param rxq - * Pointer to RX queue structure. - */ -static void -rxq_mac_addrs_del(struct rxq *rxq) -{ - struct priv *priv = rxq->priv; - unsigned int i; - - for (i = 0; (i != elemof(priv->mac)); ++i) - rxq_mac_addr_del(rxq, i); -} - -static int rxq_promiscuous_enable(struct rxq *); -static void rxq_promiscuous_disable(struct rxq *); - -/** - * Add single flow steering rule. - * - * @param rxq - * Pointer to RX queue structure. - * @param mac_index - * MAC address index to register. - * @param vlan_index - * VLAN index. Use -1 for a flow without VLAN. - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_add_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index) -{ - struct ibv_flow *flow; - struct priv *priv = rxq->priv; - const uint8_t (*mac)[ETHER_ADDR_LEN] = - (const uint8_t (*)[ETHER_ADDR_LEN]) - priv->mac[mac_index].addr_bytes; - - /* Allocate flow specification on the stack. */ - struct __attribute__((packed)) { - struct ibv_flow_attr attr; - struct ibv_flow_spec_eth spec; - } data; - struct ibv_flow_attr *attr = &data.attr; - struct ibv_flow_spec_eth *spec = &data.spec; - - assert(mac_index < elemof(priv->mac)); - assert((vlan_index < elemof(priv->vlan_filter)) || (vlan_index == -1u)); - /* - * No padding must be inserted by the compiler between attr and spec. - * This layout is expected by libibverbs. - */ - assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec); - *attr = (struct ibv_flow_attr){ - .type = IBV_FLOW_ATTR_NORMAL, - .priority = 3, - .num_of_specs = 1, - .port = priv->port, - .flags = 0 - }; - *spec = (struct ibv_flow_spec_eth){ - .type = IBV_FLOW_SPEC_ETH, - .size = sizeof(*spec), - .val = { - .dst_mac = { - (*mac)[0], (*mac)[1], (*mac)[2], - (*mac)[3], (*mac)[4], (*mac)[5] - }, - .vlan_tag = ((vlan_index != -1u) ? - htons(priv->vlan_filter[vlan_index].id) : - 0), - }, - .mask = { - .dst_mac = "\xff\xff\xff\xff\xff\xff", - .vlan_tag = ((vlan_index != -1u) ? htons(0xfff) : 0), - } - }; - DEBUG("%p: adding MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u" - " (VLAN %s %" PRIu16 ")", - (void *)rxq, - (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5], - mac_index, - ((vlan_index != -1u) ? "ID" : "index"), - ((vlan_index != -1u) ? priv->vlan_filter[vlan_index].id : -1u)); - /* Create related flow. */ - errno = 0; - flow = ibv_create_flow(rxq->qp, attr); - if (flow == NULL) { - /* It's not clear whether errno is always set in this case. */ - ERROR("%p: flow configuration failed, errno=%d: %s", - (void *)rxq, errno, - (errno ? strerror(errno) : "Unknown error")); - if (errno) - return errno; - return EINVAL; - } - if (vlan_index == -1u) - vlan_index = 0; - assert(rxq->mac_flow[mac_index][vlan_index] == NULL); - rxq->mac_flow[mac_index][vlan_index] = flow; - return 0; -} - -/** - * Register a MAC address in a RX queue. - * - * @param rxq - * Pointer to RX queue structure. - * @param mac_index - * MAC address index to register. - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_mac_addr_add(struct rxq *rxq, unsigned int mac_index) -{ - struct priv *priv = rxq->priv; - unsigned int i; - unsigned int vlans = 0; - int ret; - - assert(mac_index < elemof(priv->mac)); - if (BITFIELD_ISSET(rxq->mac_configured, mac_index)) - rxq_mac_addr_del(rxq, mac_index); - /* Fill VLAN specifications. */ - for (i = 0; (i != elemof(priv->vlan_filter)); ++i) { - if (!priv->vlan_filter[i].enabled) - continue; - /* Create related flow. */ - ret = rxq_add_flow(rxq, mac_index, i); - if (!ret) { - vlans++; - continue; - } - /* Failure, rollback. */ - while (i != 0) - if (priv->vlan_filter[--i].enabled) - rxq_del_flow(rxq, mac_index, i); - assert(ret > 0); - return ret; - } - /* In case there is no VLAN filter. */ - if (!vlans) { - ret = rxq_add_flow(rxq, mac_index, -1); - if (ret) - return ret; - } - BITFIELD_SET(rxq->mac_configured, mac_index); - return 0; -} - -/** - * Register all MAC addresses in a RX queue. - * - * @param rxq - * Pointer to RX queue structure. - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_mac_addrs_add(struct rxq *rxq) -{ - struct priv *priv = rxq->priv; - unsigned int i; - int ret; - - for (i = 0; (i != elemof(priv->mac)); ++i) { - if (!BITFIELD_ISSET(priv->mac_configured, i)) - continue; - ret = rxq_mac_addr_add(rxq, i); - if (!ret) - continue; - /* Failure, rollback. */ - while (i != 0) - rxq_mac_addr_del(rxq, --i); - assert(ret > 0); - return ret; - } - return 0; -} - -/** - * Unregister a MAC address. - * - * In RSS mode, the MAC address is unregistered from the parent queue, - * otherwise it is unregistered from each queue directly. - * - * @param priv - * Pointer to private structure. - * @param mac_index - * MAC address index. - */ -static void -priv_mac_addr_del(struct priv *priv, unsigned int mac_index) -{ - unsigned int i; - - assert(!priv->isolated); - assert(mac_index < elemof(priv->mac)); - if (!BITFIELD_ISSET(priv->mac_configured, mac_index)) - return; - if (priv->rss) { - rxq_mac_addr_del(LIST_FIRST(&priv->parents), mac_index); - goto end; - } - for (i = 0; (i != priv->dev->data->nb_rx_queues); ++i) - rxq_mac_addr_del((*priv->rxqs)[i], mac_index); -end: - BITFIELD_RESET(priv->mac_configured, mac_index); -} - -/** - * Register a MAC address. - * - * In RSS mode, the MAC address is registered in the parent queue, - * otherwise it is registered in each queue directly. - * - * @param priv - * Pointer to private structure. - * @param mac_index - * MAC address index to use. - * @param mac - * MAC address to register. - * - * @return - * 0 on success, errno value on failure. - */ -static int -priv_mac_addr_add(struct priv *priv, unsigned int mac_index, - const uint8_t (*mac)[ETHER_ADDR_LEN]) -{ - unsigned int i; - int ret; - - assert(mac_index < elemof(priv->mac)); - /* First, make sure this address isn't already configured. */ - for (i = 0; (i != elemof(priv->mac)); ++i) { - /* Skip this index, it's going to be reconfigured. */ - if (i == mac_index) - continue; - if (!BITFIELD_ISSET(priv->mac_configured, i)) - continue; - if (memcmp(priv->mac[i].addr_bytes, *mac, sizeof(*mac))) - continue; - /* Address already configured elsewhere, return with error. */ - return EADDRINUSE; - } - if (BITFIELD_ISSET(priv->mac_configured, mac_index)) - priv_mac_addr_del(priv, mac_index); - priv->mac[mac_index] = (struct ether_addr){ - { - (*mac)[0], (*mac)[1], (*mac)[2], - (*mac)[3], (*mac)[4], (*mac)[5] - } - }; - /* If device isn't started, this is all we need to do. */ - if (!priv->started) { -#ifndef NDEBUG - /* Verify that all queues have this index disabled. */ - for (i = 0; (i != priv->rxqs_n); ++i) { - if ((*priv->rxqs)[i] == NULL) - continue; - assert(!BITFIELD_ISSET - ((*priv->rxqs)[i]->mac_configured, mac_index)); - } -#endif - goto end; - } - if (priv->rss) { - ret = rxq_mac_addr_add(LIST_FIRST(&priv->parents), mac_index); - if (ret) - return ret; - goto end; - } - for (i = 0; (i != priv->rxqs_n); ++i) { - if ((*priv->rxqs)[i] == NULL) - continue; - ret = rxq_mac_addr_add((*priv->rxqs)[i], mac_index); - if (!ret) - continue; - /* Failure, rollback. */ - while (i != 0) - if ((*priv->rxqs)[(--i)] != NULL) - rxq_mac_addr_del((*priv->rxqs)[i], mac_index); - return ret; - } -end: - BITFIELD_SET(priv->mac_configured, mac_index); - return 0; -} - -/** - * Enable allmulti mode in a RX queue. - * - * @param rxq - * Pointer to RX queue structure. - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_allmulticast_enable(struct rxq *rxq) -{ - struct ibv_flow *flow; - struct ibv_flow_attr attr = { - .type = IBV_FLOW_ATTR_MC_DEFAULT, - .num_of_specs = 0, - .port = rxq->priv->port, - .flags = 0 - }; - - DEBUG("%p: enabling allmulticast mode", (void *)rxq); - if (rxq->allmulti_flow != NULL) - return EBUSY; - errno = 0; - flow = ibv_create_flow(rxq->qp, &attr); - if (flow == NULL) { - /* It's not clear whether errno is always set in this case. */ - ERROR("%p: flow configuration failed, errno=%d: %s", - (void *)rxq, errno, - (errno ? strerror(errno) : "Unknown error")); - if (errno) - return errno; - return EINVAL; - } - rxq->allmulti_flow = flow; - DEBUG("%p: allmulticast mode enabled", (void *)rxq); - return 0; -} - -/** - * Disable allmulti mode in a RX queue. - * - * @param rxq - * Pointer to RX queue structure. - */ -static void -rxq_allmulticast_disable(struct rxq *rxq) -{ - DEBUG("%p: disabling allmulticast mode", (void *)rxq); - if (rxq->allmulti_flow == NULL) - return; - claim_zero(ibv_destroy_flow(rxq->allmulti_flow)); - rxq->allmulti_flow = NULL; - DEBUG("%p: allmulticast mode disabled", (void *)rxq); -} - -/** - * Enable promiscuous mode in a RX queue. - * - * @param rxq - * Pointer to RX queue structure. - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_promiscuous_enable(struct rxq *rxq) -{ - struct ibv_flow *flow; - struct ibv_flow_attr attr = { - .type = IBV_FLOW_ATTR_ALL_DEFAULT, - .num_of_specs = 0, - .port = rxq->priv->port, - .flags = 0 - }; - - if (rxq->priv->vf) - return 0; - DEBUG("%p: enabling promiscuous mode", (void *)rxq); - if (rxq->promisc_flow != NULL) - return EBUSY; - errno = 0; - flow = ibv_create_flow(rxq->qp, &attr); - if (flow == NULL) { - /* It's not clear whether errno is always set in this case. */ - ERROR("%p: flow configuration failed, errno=%d: %s", - (void *)rxq, errno, - (errno ? strerror(errno) : "Unknown error")); - if (errno) - return errno; - return EINVAL; - } - rxq->promisc_flow = flow; - DEBUG("%p: promiscuous mode enabled", (void *)rxq); - return 0; -} - -/** - * Disable promiscuous mode in a RX queue. - * - * @param rxq - * Pointer to RX queue structure. - */ -static void -rxq_promiscuous_disable(struct rxq *rxq) -{ - if (rxq->priv->vf) - return; - DEBUG("%p: disabling promiscuous mode", (void *)rxq); - if (rxq->promisc_flow == NULL) - return; - claim_zero(ibv_destroy_flow(rxq->promisc_flow)); - rxq->promisc_flow = NULL; - DEBUG("%p: promiscuous mode disabled", (void *)rxq); -} - -/** - * Clean up a RX queue. - * - * Destroy objects, free allocated memory and reset the structure for reuse. - * - * @param rxq - * Pointer to RX queue structure. - */ -static void -rxq_cleanup(struct rxq *rxq) -{ - struct ibv_exp_release_intf_params params; - - DEBUG("cleaning up %p", (void *)rxq); - if (rxq->sp) - rxq_free_elts_sp(rxq); - else - rxq_free_elts(rxq); - if (rxq->if_qp != NULL) { - assert(rxq->priv != NULL); - assert(rxq->priv->ctx != NULL); - assert(rxq->qp != NULL); - params = (struct ibv_exp_release_intf_params){ - .comp_mask = 0, - }; - claim_zero(ibv_exp_release_intf(rxq->priv->ctx, - rxq->if_qp, - ¶ms)); - } - if (rxq->if_cq != NULL) { - assert(rxq->priv != NULL); - assert(rxq->priv->ctx != NULL); - assert(rxq->cq != NULL); - params = (struct ibv_exp_release_intf_params){ - .comp_mask = 0, - }; - claim_zero(ibv_exp_release_intf(rxq->priv->ctx, - rxq->if_cq, - ¶ms)); - } - if (rxq->qp != NULL && !rxq->priv->isolated) { - rxq_promiscuous_disable(rxq); - rxq_allmulticast_disable(rxq); - rxq_mac_addrs_del(rxq); - } - if (rxq->qp != NULL) - claim_zero(ibv_destroy_qp(rxq->qp)); - if (rxq->cq != NULL) - claim_zero(ibv_destroy_cq(rxq->cq)); - if (rxq->channel != NULL) - claim_zero(ibv_destroy_comp_channel(rxq->channel)); - if (rxq->rd != NULL) { - struct ibv_exp_destroy_res_domain_attr attr = { - .comp_mask = 0, - }; - - assert(rxq->priv != NULL); - assert(rxq->priv->ctx != NULL); - claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx, - rxq->rd, - &attr)); - } - if (rxq->mr != NULL) - claim_zero(ibv_dereg_mr(rxq->mr)); - memset(rxq, 0, sizeof(*rxq)); -} - -/** - * Translate RX completion flags to packet type. - * - * @param flags - * RX completion flags returned by poll_length_flags(). - * - * @note: fix mlx4_dev_supported_ptypes_get() if any change here. - * - * @return - * Packet type for struct rte_mbuf. - */ -static inline uint32_t -rxq_cq_to_pkt_type(uint32_t flags) -{ - uint32_t pkt_type; - - if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) - pkt_type = - TRANSPOSE(flags, - IBV_EXP_CQ_RX_OUTER_IPV4_PACKET, - RTE_PTYPE_L3_IPV4_EXT_UNKNOWN) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_OUTER_IPV6_PACKET, - RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV4_PACKET, - RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV6_PACKET, - RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN); - else - pkt_type = - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV4_PACKET, - RTE_PTYPE_L3_IPV4_EXT_UNKNOWN) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IPV6_PACKET, - RTE_PTYPE_L3_IPV6_EXT_UNKNOWN); - return pkt_type; -} - -/** - * Translate RX completion flags to offload flags. - * - * @param[in] rxq - * Pointer to RX queue structure. - * @param flags - * RX completion flags returned by poll_length_flags(). - * - * @return - * Offload flags (ol_flags) for struct rte_mbuf. - */ -static inline uint32_t -rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) -{ - uint32_t ol_flags = 0; - - if (rxq->csum) - ol_flags |= - TRANSPOSE(flags, - IBV_EXP_CQ_RX_IP_CSUM_OK, - PKT_RX_IP_CKSUM_GOOD) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK, - PKT_RX_L4_CKSUM_GOOD); - if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) - ol_flags |= - TRANSPOSE(flags, - IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK, - PKT_RX_IP_CKSUM_GOOD) | - TRANSPOSE(flags, - IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK, - PKT_RX_L4_CKSUM_GOOD); - return ol_flags; -} - -static uint16_t -mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); - -/** - * DPDK callback for RX with scattered packets support. - * - * @param dpdk_rxq - * Generic pointer to RX queue structure. - * @param[out] pkts - * Array to store received packets. - * @param pkts_n - * Maximum number of packets in array. - * - * @return - * Number of packets successfully received (<= pkts_n). - */ -static uint16_t -mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) -{ - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - struct ibv_recv_wr head; - struct ibv_recv_wr **next = &head.next; - struct ibv_recv_wr *bad_wr; - unsigned int i; - unsigned int pkts_ret = 0; - int ret; - - if (unlikely(!rxq->sp)) - return mlx4_rx_burst(dpdk_rxq, pkts, pkts_n); - if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */ - return 0; - for (i = 0; (i != pkts_n); ++i) { - struct rxq_elt_sp *elt = &(*elts)[elts_head]; - struct ibv_recv_wr *wr = &elt->wr; - uint64_t wr_id = wr->wr_id; - unsigned int len; - unsigned int pkt_buf_len; - struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */ - struct rte_mbuf **pkt_buf_next = &pkt_buf; - unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM; - unsigned int j = 0; - uint32_t flags; - - /* Sanity checks. */ -#ifdef NDEBUG - (void)wr_id; -#endif - assert(wr_id < rxq->elts_n); - assert(wr->sg_list == elt->sges); - assert(wr->num_sge == elemof(elt->sges)); - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); - ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL, - &flags); - if (unlikely(ret < 0)) { - struct ibv_wc wc; - int wcs_n; - - DEBUG("rxq=%p, poll_length() failed (ret=%d)", - (void *)rxq, ret); - /* ibv_poll_cq() must be used in case of failure. */ - wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); - if (unlikely(wcs_n == 0)) - break; - if (unlikely(wcs_n < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", - (void *)rxq, wcs_n); - break; - } - assert(wcs_n == 1); - if (unlikely(wc.status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" - " completion status (%d): %s", - (void *)rxq, wc.wr_id, wc.status, - ibv_wc_status_str(wc.status)); -#ifdef MLX4_PMD_SOFT_COUNTERS - /* Increment dropped packets counter. */ - ++rxq->stats.idropped; -#endif - /* Link completed WRs together for repost. */ - *next = wr; - next = &wr->next; - goto repost; - } - ret = wc.byte_len; - } - if (ret == 0) - break; - len = ret; - pkt_buf_len = len; - /* Link completed WRs together for repost. */ - *next = wr; - next = &wr->next; - /* - * Replace spent segments with new ones, concatenate and - * return them as pkt_buf. - */ - while (1) { - struct ibv_sge *sge = &elt->sges[j]; - struct rte_mbuf *seg = elt->bufs[j]; - struct rte_mbuf *rep; - unsigned int seg_tailroom; - - /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. - */ - rte_prefetch0(seg); - rep = rte_mbuf_raw_alloc(rxq->mp); - if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ":" - " can't allocate a new mbuf", - (void *)rxq, wr_id); - if (pkt_buf != NULL) { - *pkt_buf_next = NULL; - rte_pktmbuf_free(pkt_buf); - } - /* Increase out of memory counters. */ - ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - goto repost; - } -#ifndef NDEBUG - /* Poison user-modifiable fields in rep. */ - NEXT(rep) = (void *)((uintptr_t)-1); - SET_DATA_OFF(rep, 0xdead); - DATA_LEN(rep) = 0xd00d; - PKT_LEN(rep) = 0xdeadd00d; - NB_SEGS(rep) = 0x2a; - PORT(rep) = 0x2a; - rep->ol_flags = -1; - /* - * Clear special flags in mbuf to avoid - * crashing while freeing. - */ - rep->ol_flags &= - ~(uint64_t)(IND_ATTACHED_MBUF | - CTRL_MBUF_FLAG); -#endif - assert(rep->buf_len == seg->buf_len); - /* Reconfigure sge to use rep instead of seg. */ - assert(sge->lkey == rxq->mr->lkey); - sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom); - elt->bufs[j] = rep; - ++j; - /* Update pkt_buf if it's the first segment, or link - * seg to the previous one and update pkt_buf_next. */ - *pkt_buf_next = seg; - pkt_buf_next = &NEXT(seg); - /* Update seg information. */ - seg_tailroom = (seg->buf_len - seg_headroom); - assert(sge->length == seg_tailroom); - SET_DATA_OFF(seg, seg_headroom); - if (likely(len <= seg_tailroom)) { - /* Last segment. */ - DATA_LEN(seg) = len; - PKT_LEN(seg) = len; - /* Sanity check. */ - assert(rte_pktmbuf_headroom(seg) == - seg_headroom); - assert(rte_pktmbuf_tailroom(seg) == - (seg_tailroom - len)); - break; - } - DATA_LEN(seg) = seg_tailroom; - PKT_LEN(seg) = seg_tailroom; - /* Sanity check. */ - assert(rte_pktmbuf_headroom(seg) == seg_headroom); - assert(rte_pktmbuf_tailroom(seg) == 0); - /* Fix len and clear headroom for next segments. */ - len -= seg_tailroom; - seg_headroom = 0; - } - /* Update head and tail segments. */ - *pkt_buf_next = NULL; - assert(pkt_buf != NULL); - assert(j != 0); - NB_SEGS(pkt_buf) = j; - PORT(pkt_buf) = rxq->port_id; - PKT_LEN(pkt_buf) = pkt_buf_len; - pkt_buf->packet_type = rxq_cq_to_pkt_type(flags); - pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags); - - /* Return packet. */ - *(pkts++) = pkt_buf; - ++pkts_ret; -#ifdef MLX4_PMD_SOFT_COUNTERS - /* Increase bytes counter. */ - rxq->stats.ibytes += pkt_buf_len; -#endif -repost: - if (++elts_head >= elts_n) - elts_head = 0; - continue; - } - if (unlikely(i == 0)) - return 0; - *next = NULL; - /* Repost WRs. */ -#ifdef DEBUG_RECV - DEBUG("%p: reposting %d WRs", (void *)rxq, i); -#endif - ret = ibv_post_recv(rxq->qp, head.next, &bad_wr); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. */ - DEBUG("%p: ibv_post_recv(): failed for WR %p: %s", - (void *)rxq->priv, - (void *)bad_wr, - strerror(ret)); - abort(); - } - rxq->elts_head = elts_head; -#ifdef MLX4_PMD_SOFT_COUNTERS - /* Increase packets counter. */ - rxq->stats.ipackets += pkts_ret; -#endif - return pkts_ret; -} - -/** - * DPDK callback for RX. - * - * The following function is the same as mlx4_rx_burst_sp(), except it doesn't - * manage scattered packets. Improves performance when MRU is lower than the - * size of the first segment. - * - * @param dpdk_rxq - * Generic pointer to RX queue structure. - * @param[out] pkts - * Array to store received packets. - * @param pkts_n - * Maximum number of packets in array. - * - * @return - * Number of packets successfully received (<= pkts_n). - */ -static uint16_t -mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) -{ - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - struct ibv_sge sges[pkts_n]; - unsigned int i; - unsigned int pkts_ret = 0; - int ret; - - if (unlikely(rxq->sp)) - return mlx4_rx_burst_sp(dpdk_rxq, pkts, pkts_n); - for (i = 0; (i != pkts_n); ++i) { - struct rxq_elt *elt = &(*elts)[elts_head]; - struct ibv_recv_wr *wr = &elt->wr; - uint64_t wr_id = wr->wr_id; - unsigned int len; - struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr - - WR_ID(wr_id).offset); - struct rte_mbuf *rep; - uint32_t flags; - - /* Sanity checks. */ - assert(WR_ID(wr_id).id < rxq->elts_n); - assert(wr->sg_list == &elt->sge); - assert(wr->num_sge == 1); - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); - /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. - */ - rte_mbuf_prefetch_part1(seg); - rte_mbuf_prefetch_part2(seg); - ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL, - &flags); - if (unlikely(ret < 0)) { - struct ibv_wc wc; - int wcs_n; - - DEBUG("rxq=%p, poll_length() failed (ret=%d)", - (void *)rxq, ret); - /* ibv_poll_cq() must be used in case of failure. */ - wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); - if (unlikely(wcs_n == 0)) - break; - if (unlikely(wcs_n < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", - (void *)rxq, wcs_n); - break; - } - assert(wcs_n == 1); - if (unlikely(wc.status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" - " completion status (%d): %s", - (void *)rxq, wc.wr_id, wc.status, - ibv_wc_status_str(wc.status)); -#ifdef MLX4_PMD_SOFT_COUNTERS - /* Increment dropped packets counter. */ - ++rxq->stats.idropped; -#endif - /* Add SGE to array for repost. */ - sges[i] = elt->sge; - goto repost; - } - ret = wc.byte_len; - } - if (ret == 0) - break; - len = ret; - rep = rte_mbuf_raw_alloc(rxq->mp); - if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p, wr_id=%" PRIu32 ":" - " can't allocate a new mbuf", - (void *)rxq, WR_ID(wr_id).id); - /* Increase out of memory counters. */ - ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - /* Add SGE to array for repost. */ - sges[i] = elt->sge; - goto repost; - } - - /* Reconfigure sge to use rep instead of seg. */ - elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM; - assert(elt->sge.lkey == rxq->mr->lkey); - WR_ID(wr->wr_id).offset = - (((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) - - (uintptr_t)rep); - assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id); - - /* Add SGE to array for repost. */ - sges[i] = elt->sge; - - /* Update seg information. */ - SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM); - NB_SEGS(seg) = 1; - PORT(seg) = rxq->port_id; - NEXT(seg) = NULL; - PKT_LEN(seg) = len; - DATA_LEN(seg) = len; - seg->packet_type = rxq_cq_to_pkt_type(flags); - seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags); - - /* Return packet. */ - *(pkts++) = seg; - ++pkts_ret; -#ifdef MLX4_PMD_SOFT_COUNTERS - /* Increase bytes counter. */ - rxq->stats.ibytes += len; -#endif -repost: - if (++elts_head >= elts_n) - elts_head = 0; - continue; - } - if (unlikely(i == 0)) - return 0; - /* Repost WRs. */ -#ifdef DEBUG_RECV - DEBUG("%p: reposting %u WRs", (void *)rxq, i); -#endif - ret = rxq->if_qp->recv_burst(rxq->qp, sges, i); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. */ - DEBUG("%p: recv_burst(): failed (ret=%d)", - (void *)rxq->priv, - ret); - abort(); - } - rxq->elts_head = elts_head; -#ifdef MLX4_PMD_SOFT_COUNTERS - /* Increase packets counter. */ - rxq->stats.ipackets += pkts_ret; -#endif - return pkts_ret; -} - -/** - * DPDK callback for RX in secondary processes. - * - * This function configures all queues from primary process information - * if necessary before reverting to the normal RX burst callback. - * - * @param dpdk_rxq - * Generic pointer to RX queue structure. - * @param[out] pkts - * Array to store received packets. - * @param pkts_n - * Maximum number of packets in array. - * - * @return - * Number of packets successfully received (<= pkts_n). - */ -static uint16_t -mlx4_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts, - uint16_t pkts_n) -{ - struct rxq *rxq = dpdk_rxq; - struct priv *priv = mlx4_secondary_data_setup(rxq->priv); - struct priv *primary_priv; - unsigned int index; - - if (priv == NULL) - return 0; - primary_priv = - mlx4_secondary_data[priv->dev->data->port_id].primary_priv; - /* Look for queue index in both private structures. */ - for (index = 0; index != priv->rxqs_n; ++index) - if (((*primary_priv->rxqs)[index] == rxq) || - ((*priv->rxqs)[index] == rxq)) - break; - if (index == priv->rxqs_n) - return 0; - rxq = (*priv->rxqs)[index]; - return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n); -} - -/** - * Allocate a Queue Pair. - * Optionally setup inline receive if supported. - * - * @param priv - * Pointer to private structure. - * @param cq - * Completion queue to associate with QP. - * @param desc - * Number of descriptors in QP (hint only). - * - * @return - * QP pointer or NULL in case of error. - */ -static struct ibv_qp * -rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc, - struct ibv_exp_res_domain *rd) -{ - struct ibv_exp_qp_init_attr attr = { - /* CQ to be associated with the send queue. */ - .send_cq = cq, - /* CQ to be associated with the receive queue. */ - .recv_cq = cq, - .cap = { - /* Max number of outstanding WRs. */ - .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ? - priv->device_attr.max_qp_wr : - desc), - /* Max number of scatter/gather elements in a WR. */ - .max_recv_sge = ((priv->device_attr.max_sge < - MLX4_PMD_SGE_WR_N) ? - priv->device_attr.max_sge : - MLX4_PMD_SGE_WR_N), - }, - .qp_type = IBV_QPT_RAW_PACKET, - .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | - IBV_EXP_QP_INIT_ATTR_RES_DOMAIN), - .pd = priv->pd, - .res_domain = rd, - }; - -#ifdef INLINE_RECV - attr.max_inl_recv = priv->inl_recv_size; - attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV; -#endif - return ibv_exp_create_qp(priv->ctx, &attr); -} - -#ifdef RSS_SUPPORT - -/** - * Allocate a RSS Queue Pair. - * Optionally setup inline receive if supported. - * - * @param priv - * Pointer to private structure. - * @param cq - * Completion queue to associate with QP. - * @param desc - * Number of descriptors in QP (hint only). - * @param children_n - * If nonzero, a number of children for parent QP and zero for a child. - * @param rxq_parent - * Pointer for a parent in a child case, NULL otherwise. - * - * @return - * QP pointer or NULL in case of error. - */ -static struct ibv_qp * -rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc, - int children_n, struct ibv_exp_res_domain *rd, - struct rxq *rxq_parent) -{ - struct ibv_exp_qp_init_attr attr = { - /* CQ to be associated with the send queue. */ - .send_cq = cq, - /* CQ to be associated with the receive queue. */ - .recv_cq = cq, - .cap = { - /* Max number of outstanding WRs. */ - .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ? - priv->device_attr.max_qp_wr : - desc), - /* Max number of scatter/gather elements in a WR. */ - .max_recv_sge = ((priv->device_attr.max_sge < - MLX4_PMD_SGE_WR_N) ? - priv->device_attr.max_sge : - MLX4_PMD_SGE_WR_N), - }, - .qp_type = IBV_QPT_RAW_PACKET, - .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | - IBV_EXP_QP_INIT_ATTR_RES_DOMAIN | - IBV_EXP_QP_INIT_ATTR_QPG), - .pd = priv->pd, - .res_domain = rd, - }; - -#ifdef INLINE_RECV - attr.max_inl_recv = priv->inl_recv_size, - attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV; -#endif - if (children_n > 0) { - attr.qpg.qpg_type = IBV_EXP_QPG_PARENT; - /* TSS isn't necessary. */ - attr.qpg.parent_attrib.tss_child_count = 0; - attr.qpg.parent_attrib.rss_child_count = - rte_align32pow2(children_n + 1) >> 1; - DEBUG("initializing parent RSS queue"); - } else { - attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX; - attr.qpg.qpg_parent = rxq_parent->qp; - DEBUG("initializing child RSS queue"); - } - return ibv_exp_create_qp(priv->ctx, &attr); -} - -#endif /* RSS_SUPPORT */ - -/** - * Reconfigure a RX queue with new parameters. - * - * rxq_rehash() does not allocate mbufs, which, if not done from the right - * thread (such as a control thread), may corrupt the pool. - * In case of failure, the queue is left untouched. - * - * @param dev - * Pointer to Ethernet device structure. - * @param rxq - * RX queue pointer. - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq) -{ - struct priv *priv = rxq->priv; - struct rxq tmpl = *rxq; - unsigned int mbuf_n; - unsigned int desc_n; - struct rte_mbuf **pool; - unsigned int i, k; - struct ibv_exp_qp_attr mod; - struct ibv_recv_wr *bad_wr; - unsigned int mb_len; - int err; - - mb_len = rte_pktmbuf_data_room_size(rxq->mp); - DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq); - /* Number of descriptors and mbufs currently allocated. */ - desc_n = (tmpl.elts_n * (tmpl.sp ? MLX4_PMD_SGE_WR_N : 1)); - mbuf_n = desc_n; - /* Toggle RX checksum offload if hardware supports it. */ - if (priv->hw_csum) { - tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum; - rxq->csum = tmpl.csum; - } - if (priv->hw_csum_l2tun) { - tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum; - rxq->csum_l2tun = tmpl.csum_l2tun; - } - /* Enable scattered packets support for this queue if necessary. */ - assert(mb_len >= RTE_PKTMBUF_HEADROOM); - if (dev->data->dev_conf.rxmode.enable_scatter && - (dev->data->dev_conf.rxmode.max_rx_pkt_len > - (mb_len - RTE_PKTMBUF_HEADROOM))) { - tmpl.sp = 1; - desc_n /= MLX4_PMD_SGE_WR_N; - } else - tmpl.sp = 0; - DEBUG("%p: %s scattered packets support (%u WRs)", - (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n); - /* If scatter mode is the same as before, nothing to do. */ - if (tmpl.sp == rxq->sp) { - DEBUG("%p: nothing to do", (void *)dev); - return 0; - } - /* Remove attached flows if RSS is disabled (no parent queue). */ - if (!priv->rss && !priv->isolated) { - rxq_allmulticast_disable(&tmpl); - rxq_promiscuous_disable(&tmpl); - rxq_mac_addrs_del(&tmpl); - /* Update original queue in case of failure. */ - rxq->allmulti_flow = tmpl.allmulti_flow; - rxq->promisc_flow = tmpl.promisc_flow; - memcpy(rxq->mac_configured, tmpl.mac_configured, - sizeof(rxq->mac_configured)); - memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow)); - } - /* From now on, any failure will render the queue unusable. - * Reinitialize QP. */ - if (!tmpl.qp) - goto skip_init; - mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RESET }; - err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE); - if (err) { - ERROR("%p: cannot reset QP: %s", (void *)dev, strerror(err)); - assert(err > 0); - return err; - } - mod = (struct ibv_exp_qp_attr){ - /* Move the QP to this state. */ - .qp_state = IBV_QPS_INIT, - /* Primary port number. */ - .port_num = priv->port - }; - err = ibv_exp_modify_qp(tmpl.qp, &mod, - (IBV_EXP_QP_STATE | - IBV_EXP_QP_PORT)); - if (err) { - ERROR("%p: QP state to IBV_QPS_INIT failed: %s", - (void *)dev, strerror(err)); - assert(err > 0); - return err; - }; -skip_init: - err = ibv_resize_cq(tmpl.cq, desc_n); - if (err) { - ERROR("%p: cannot resize CQ: %s", (void *)dev, strerror(err)); - assert(err > 0); - return err; - } - /* Reconfigure flows. Do not care for errors. */ - if (!priv->rss && !priv->isolated) { - rxq_mac_addrs_add(&tmpl); - if (priv->promisc) - rxq_promiscuous_enable(&tmpl); - if (priv->allmulti) - rxq_allmulticast_enable(&tmpl); - /* Update original queue in case of failure. */ - rxq->allmulti_flow = tmpl.allmulti_flow; - rxq->promisc_flow = tmpl.promisc_flow; - memcpy(rxq->mac_configured, tmpl.mac_configured, - sizeof(rxq->mac_configured)); - memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow)); - } - /* Allocate pool. */ - pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0); - if (pool == NULL) { - ERROR("%p: cannot allocate memory", (void *)dev); - return ENOBUFS; - } - /* Snatch mbufs from original queue. */ - k = 0; - if (rxq->sp) { - struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; - - for (i = 0; (i != elemof(*elts)); ++i) { - struct rxq_elt_sp *elt = &(*elts)[i]; - unsigned int j; - - for (j = 0; (j != elemof(elt->bufs)); ++j) { - assert(elt->bufs[j] != NULL); - pool[k++] = elt->bufs[j]; - } - } - } else { - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp; - - for (i = 0; (i != elemof(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf = (void *) - ((uintptr_t)elt->sge.addr - - WR_ID(elt->wr.wr_id).offset); - - assert(WR_ID(elt->wr.wr_id).id == i); - pool[k++] = buf; - } - } - assert(k == mbuf_n); - tmpl.elts_n = 0; - tmpl.elts.sp = NULL; - assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp); - err = ((tmpl.sp) ? - rxq_alloc_elts_sp(&tmpl, desc_n, pool) : - rxq_alloc_elts(&tmpl, desc_n, pool)); - if (err) { - ERROR("%p: cannot reallocate WRs, aborting", (void *)dev); - rte_free(pool); - assert(err > 0); - return err; - } - assert(tmpl.elts_n == desc_n); - assert(tmpl.elts.sp != NULL); - rte_free(pool); - /* Clean up original data. */ - rxq->elts_n = 0; - rte_free(rxq->elts.sp); - rxq->elts.sp = NULL; - if (!tmpl.qp) - goto skip_rtr; - /* Post WRs. */ - err = ibv_post_recv(tmpl.qp, - (tmpl.sp ? - &(*tmpl.elts.sp)[0].wr : - &(*tmpl.elts.no_sp)[0].wr), - &bad_wr); - if (err) { - ERROR("%p: ibv_post_recv() failed for WR %p: %s", - (void *)dev, - (void *)bad_wr, - strerror(err)); - goto skip_rtr; - } - mod = (struct ibv_exp_qp_attr){ - .qp_state = IBV_QPS_RTR - }; - err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE); - if (err) - ERROR("%p: QP state to IBV_QPS_RTR failed: %s", - (void *)dev, strerror(err)); -skip_rtr: - *rxq = tmpl; - assert(err >= 0); - return err; -} - -/** - * Create verbs QP resources associated with a rxq. - * - * @param rxq - * Pointer to RX queue structure. - * @param desc - * Number of descriptors to configure in queue. - * @param inactive - * If true, the queue is disabled because its index is higher or - * equal to the real number of queues, which must be a power of 2. - * @param children_n - * The number of children in a parent case, zero for a child. - * @param rxq_parent - * The pointer to a parent RX structure for a child in RSS case, - * NULL for parent. - * - * @return - * 0 on success, errno value on failure. - */ -int -rxq_create_qp(struct rxq *rxq, - uint16_t desc, - int inactive, - int children_n, - struct rxq *rxq_parent) -{ - int ret; - struct ibv_exp_qp_attr mod; - struct ibv_exp_query_intf_params params; - enum ibv_exp_query_intf_status status; - struct ibv_recv_wr *bad_wr; - int parent = (children_n > 0); - struct priv *priv = rxq->priv; - -#ifdef RSS_SUPPORT - if (priv->rss && !inactive && (rxq_parent || parent)) - rxq->qp = rxq_setup_qp_rss(priv, rxq->cq, desc, - children_n, rxq->rd, - rxq_parent); - else -#endif /* RSS_SUPPORT */ - rxq->qp = rxq_setup_qp(priv, rxq->cq, desc, rxq->rd); - if (rxq->qp == NULL) { - ret = (errno ? errno : EINVAL); - ERROR("QP creation failure: %s", - strerror(ret)); - return ret; - } - mod = (struct ibv_exp_qp_attr){ - /* Move the QP to this state. */ - .qp_state = IBV_QPS_INIT, - /* Primary port number. */ - .port_num = priv->port - }; - ret = ibv_exp_modify_qp(rxq->qp, &mod, - (IBV_EXP_QP_STATE | -#ifdef RSS_SUPPORT - (parent ? IBV_EXP_QP_GROUP_RSS : 0) | -#endif /* RSS_SUPPORT */ - IBV_EXP_QP_PORT)); - if (ret) { - ERROR("QP state to IBV_QPS_INIT failed: %s", - strerror(ret)); - return ret; - } - if (!priv->isolated && (parent || !priv->rss)) { - /* Configure MAC and broadcast addresses. */ - ret = rxq_mac_addrs_add(rxq); - if (ret) { - ERROR("QP flow attachment failed: %s", - strerror(ret)); - return ret; - } - } - if (!parent) { - ret = ibv_post_recv(rxq->qp, - (rxq->sp ? - &(*rxq->elts.sp)[0].wr : - &(*rxq->elts.no_sp)[0].wr), - &bad_wr); - if (ret) { - ERROR("ibv_post_recv() failed for WR %p: %s", - (void *)bad_wr, - strerror(ret)); - return ret; - } - } - mod = (struct ibv_exp_qp_attr){ - .qp_state = IBV_QPS_RTR - }; - ret = ibv_exp_modify_qp(rxq->qp, &mod, IBV_EXP_QP_STATE); - if (ret) { - ERROR("QP state to IBV_QPS_RTR failed: %s", - strerror(ret)); - return ret; - } - params = (struct ibv_exp_query_intf_params){ - .intf_scope = IBV_EXP_INTF_GLOBAL, - .intf = IBV_EXP_INTF_QP_BURST, - .obj = rxq->qp, - }; - rxq->if_qp = ibv_exp_query_intf(priv->ctx, ¶ms, &status); - if (rxq->if_qp == NULL) { - ERROR("QP interface family query failed with status %d", - status); - return errno; - } - return 0; -} - -/** - * Configure a RX queue. - * - * @param dev - * Pointer to Ethernet device structure. - * @param rxq - * Pointer to RX queue structure. - * @param desc - * Number of descriptors to configure in queue. - * @param socket - * NUMA socket on which memory must be allocated. - * @param inactive - * If true, the queue is disabled because its index is higher or - * equal to the real number of queues, which must be a power of 2. - * @param[in] conf - * Thresholds parameters. - * @param mp - * Memory pool for buffer allocations. - * @param children_n - * The number of children in a parent case, zero for a child. - * @param rxq_parent - * The pointer to a parent RX structure (or NULL) in a child case, - * NULL for parent. - * - * @return - * 0 on success, errno value on failure. - */ -static int -rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, - unsigned int socket, int inactive, - const struct rte_eth_rxconf *conf, - struct rte_mempool *mp, int children_n, - struct rxq *rxq_parent) -{ - struct priv *priv = dev->data->dev_private; - struct rxq tmpl = { - .priv = priv, - .mp = mp, - .socket = socket - }; - union { - struct ibv_exp_query_intf_params params; - struct ibv_exp_cq_init_attr cq; - struct ibv_exp_res_domain_init_attr rd; - } attr; - enum ibv_exp_query_intf_status status; - unsigned int mb_len; - int ret = 0; - int parent = (children_n > 0); - - (void)conf; /* Thresholds configuration (ignored). */ - /* - * If this is a parent queue, hardware must support RSS and - * RSS must be enabled. - */ - assert((!parent) || ((priv->hw_rss) && (priv->rss))); - if (parent) { - /* Even if unused, ibv_create_cq() requires at least one - * descriptor. */ - desc = 1; - goto skip_mr; - } - mb_len = rte_pktmbuf_data_room_size(mp); - if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) { - ERROR("%p: invalid number of RX descriptors (must be a" - " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N); - return EINVAL; - } - /* Toggle RX checksum offload if hardware supports it. */ - if (priv->hw_csum) - tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum; - if (priv->hw_csum_l2tun) - tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum; - /* Enable scattered packets support for this queue if necessary. */ - assert(mb_len >= RTE_PKTMBUF_HEADROOM); - if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= - (mb_len - RTE_PKTMBUF_HEADROOM)) { - tmpl.sp = 0; - } else if (dev->data->dev_conf.rxmode.enable_scatter) { - tmpl.sp = 1; - desc /= MLX4_PMD_SGE_WR_N; - } else { - WARN("%p: the requested maximum Rx packet size (%u) is" - " larger than a single mbuf (%u) and scattered" - " mode has not been requested", - (void *)dev, - dev->data->dev_conf.rxmode.max_rx_pkt_len, - mb_len - RTE_PKTMBUF_HEADROOM); - } - DEBUG("%p: %s scattered packets support (%u WRs)", - (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc); - /* Use the entire RX mempool as the memory region. */ - tmpl.mr = mlx4_mp2mr(priv->pd, mp); - if (tmpl.mr == NULL) { - ret = EINVAL; - ERROR("%p: MR creation failure: %s", - (void *)dev, strerror(ret)); - goto error; - } -skip_mr: - attr.rd = (struct ibv_exp_res_domain_init_attr){ - .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | - IBV_EXP_RES_DOMAIN_MSG_MODEL), - .thread_model = IBV_EXP_THREAD_SINGLE, - .msg_model = IBV_EXP_MSG_HIGH_BW, - }; - tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd); - if (tmpl.rd == NULL) { - ret = ENOMEM; - ERROR("%p: RD creation failure: %s", - (void *)dev, strerror(ret)); - goto error; - } - if (dev->data->dev_conf.intr_conf.rxq) { - tmpl.channel = ibv_create_comp_channel(priv->ctx); - if (tmpl.channel == NULL) { - ret = ENOMEM; - ERROR("%p: Rx interrupt completion channel creation" - " failure: %s", - (void *)dev, strerror(ret)); - goto error; - } - } - attr.cq = (struct ibv_exp_cq_init_attr){ - .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, - .res_domain = tmpl.rd, - }; - tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, tmpl.channel, 0, - &attr.cq); - if (tmpl.cq == NULL) { - ret = ENOMEM; - ERROR("%p: CQ creation failure: %s", - (void *)dev, strerror(ret)); - goto error; - } - DEBUG("priv->device_attr.max_qp_wr is %d", - priv->device_attr.max_qp_wr); - DEBUG("priv->device_attr.max_sge is %d", - priv->device_attr.max_sge); - /* Allocate descriptors for RX queues, except for the RSS parent. */ - if (parent) - goto skip_alloc; - if (tmpl.sp) - ret = rxq_alloc_elts_sp(&tmpl, desc, NULL); - else - ret = rxq_alloc_elts(&tmpl, desc, NULL); + /* Prepare internal flow rules. */ + ret = mlx4_flow_sync(priv, &error); if (ret) { - ERROR("%p: RXQ allocation failed: %s", - (void *)dev, strerror(ret)); - return ret; + ERROR("cannot set up internal flow rules (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + -ret, strerror(-ret), error.type, error.cause, + error.message ? error.message : "(unspecified)"); } -skip_alloc: - if (parent || rxq_parent || !priv->rss) { - ret = rxq_create_qp(&tmpl, desc, inactive, - children_n, rxq_parent); - if (ret) - goto error; - } - /* Save port ID. */ - tmpl.port_id = dev->data->port_id; - DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id); - attr.params = (struct ibv_exp_query_intf_params){ - .intf_scope = IBV_EXP_INTF_GLOBAL, - .intf = IBV_EXP_INTF_CQ, - .obj = tmpl.cq, - }; - tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status); - if (tmpl.if_cq == NULL) { - ret = EINVAL; - ERROR("%p: CQ interface family query failed with status %d", - (void *)dev, status); - goto error; - } - /* Clean up rxq in case we're reinitializing it. */ - DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq); - rxq_cleanup(rxq); - *rxq = tmpl; - DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl); - assert(ret == 0); - return 0; -error: - rxq_cleanup(&tmpl); - assert(ret > 0); return ret; } /** - * DPDK callback to configure a RX queue. - * - * @param dev - * Pointer to Ethernet device structure. - * @param idx - * RX queue index. - * @param desc - * Number of descriptors to configure in queue. - * @param socket - * NUMA socket on which memory must be allocated. - * @param[in] conf - * Thresholds parameters. - * @param mp - * Memory pool for buffer allocations. - * - * @return - * 0 on success, negative errno value on failure. - */ -static int -mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, - unsigned int socket, const struct rte_eth_rxconf *conf, - struct rte_mempool *mp) -{ - struct rxq *parent; - struct priv *priv = dev->data->dev_private; - struct rxq *rxq = (*priv->rxqs)[idx]; - int inactive = 0; - int ret; - - if (mlx4_is_secondary()) - return -E_RTE_SECONDARY; - priv_lock(priv); - DEBUG("%p: configuring queue %u for %u descriptors", - (void *)dev, idx, desc); - if (idx >= priv->rxqs_n) { - ERROR("%p: queue index out of range (%u >= %u)", - (void *)dev, idx, priv->rxqs_n); - priv_unlock(priv); - return -EOVERFLOW; - } - if (rxq != NULL) { - DEBUG("%p: reusing already allocated queue index %u (%p)", - (void *)dev, idx, (void *)rxq); - if (priv->started) { - priv_unlock(priv); - return -EEXIST; - } - (*priv->rxqs)[idx] = NULL; - rxq_cleanup(rxq); - } else { - rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket); - if (rxq == NULL) { - ERROR("%p: unable to allocate queue index %u", - (void *)dev, idx); - priv_unlock(priv); - return -ENOMEM; - } - } - if (priv->rss && !priv->isolated) { - /* The list consists of the single default one. */ - parent = LIST_FIRST(&priv->parents); - if (idx >= rte_align32pow2(priv->rxqs_n + 1) >> 1) - inactive = 1; - } else { - parent = NULL; - } - ret = rxq_setup(dev, rxq, desc, socket, - inactive, conf, mp, 0, parent); - if (ret) - rte_free(rxq); - else { - rxq->stats.idx = idx; - DEBUG("%p: adding RX queue %p to list", - (void *)dev, (void *)rxq); - (*priv->rxqs)[idx] = rxq; - /* Update receive callback. */ - if (rxq->sp) - dev->rx_pkt_burst = mlx4_rx_burst_sp; - else - dev->rx_pkt_burst = mlx4_rx_burst; - } - priv_unlock(priv); - return -ret; -} - -/** - * DPDK callback to release a RX queue. - * - * @param dpdk_rxq - * Generic RX queue pointer. - */ -static void -mlx4_rx_queue_release(void *dpdk_rxq) -{ - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct priv *priv; - unsigned int i; - - if (mlx4_is_secondary()) - return; - if (rxq == NULL) - return; - priv = rxq->priv; - priv_lock(priv); - for (i = 0; (i != priv->rxqs_n); ++i) - if ((*priv->rxqs)[i] == rxq) { - DEBUG("%p: removing RX queue %p from list", - (void *)priv->dev, (void *)rxq); - (*priv->rxqs)[i] = NULL; - break; - } - rxq_cleanup(rxq); - rte_free(rxq); - priv_unlock(priv); -} - -static int -priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *); - -static int -priv_dev_removal_interrupt_handler_install(struct priv *, struct rte_eth_dev *); - -static int -priv_dev_link_interrupt_handler_install(struct priv *, struct rte_eth_dev *); - -/** * DPDK callback to start the device. * - * Simulate device start by attaching all configured flows. + * Simulate device start by initializing common RSS resources and attaching + * all configured flows. * * @param dev * Pointer to Ethernet device structure. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, negative errno value otherwise and rte_errno is set. */ static int mlx4_dev_start(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; - unsigned int i = 0; - unsigned int r; - struct rxq *rxq; + struct rte_flow_error error; int ret; - if (mlx4_is_secondary()) - return -E_RTE_SECONDARY; - priv_lock(priv); - if (priv->started) { - priv_unlock(priv); + if (priv->started) return 0; - } DEBUG("%p: attaching configured flows to all RX queues", (void *)dev); priv->started = 1; - if (priv->isolated) { - rxq = NULL; - r = 1; - } else if (priv->rss) { - rxq = LIST_FIRST(&priv->parents); - r = 1; - } else { - rxq = (*priv->rxqs)[0]; - r = priv->rxqs_n; - } - /* Iterate only once when RSS is enabled. */ - do { - /* Ignore nonexistent RX queues. */ - if (rxq == NULL) - continue; - ret = rxq_mac_addrs_add(rxq); - if (!ret && priv->promisc) - ret = rxq_promiscuous_enable(rxq); - if (!ret && priv->allmulti) - ret = rxq_allmulticast_enable(rxq); - if (!ret) - continue; - WARN("%p: QP flow attachment failed: %s", - (void *)dev, strerror(ret)); - goto err; - } while ((--r) && ((rxq = (*priv->rxqs)[++i]), i)); - ret = priv_dev_link_interrupt_handler_install(priv, dev); + ret = mlx4_rss_init(priv); if (ret) { - ERROR("%p: LSC handler install failed", - (void *)dev); + ERROR("%p: cannot initialize RSS resources: %s", + (void *)dev, strerror(-ret)); goto err; } - ret = priv_dev_removal_interrupt_handler_install(priv, dev); + ret = mlx4_intr_install(priv); if (ret) { - ERROR("%p: RMV handler install failed", + ERROR("%p: interrupt handler installation failed", (void *)dev); goto err; } - ret = priv_rx_intr_vec_enable(priv); - if (ret) { - ERROR("%p: Rx interrupt vector creation failed", - (void *)dev); - goto err; - } - ret = mlx4_priv_flow_start(priv); + ret = mlx4_flow_sync(priv, &error); if (ret) { - ERROR("%p: flow start failed: %s", - (void *)dev, strerror(ret)); + ERROR("%p: cannot attach flow rules (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + (void *)dev, + -ret, strerror(-ret), error.type, error.cause, + error.message ? error.message : "(unspecified)"); goto err; } - priv_unlock(priv); + rte_wmb(); + dev->tx_pkt_burst = mlx4_tx_burst; + dev->rx_pkt_burst = mlx4_rx_burst; return 0; err: /* Rollback. */ - while (i != 0) { - rxq = (*priv->rxqs)[i--]; - if (rxq != NULL) { - rxq_allmulticast_disable(rxq); - rxq_promiscuous_disable(rxq); - rxq_mac_addrs_del(rxq); - } - } priv->started = 0; - priv_unlock(priv); - return -ret; + return ret; } /** @@ -4220,102 +178,19 @@ static void mlx4_dev_stop(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; - unsigned int i = 0; - unsigned int r; - struct rxq *rxq; - if (mlx4_is_secondary()) - return; - priv_lock(priv); - if (!priv->started) { - priv_unlock(priv); + if (!priv->started) return; - } DEBUG("%p: detaching flows from all RX queues", (void *)dev); priv->started = 0; - if (priv->isolated) { - rxq = NULL; - r = 1; - } else if (priv->rss) { - rxq = LIST_FIRST(&priv->parents); - r = 1; - } else { - rxq = (*priv->rxqs)[0]; - r = priv->rxqs_n; - } - mlx4_priv_flow_stop(priv); - /* Iterate only once when RSS is enabled. */ - do { - /* Ignore nonexistent RX queues. */ - if (rxq == NULL) - continue; - rxq_allmulticast_disable(rxq); - rxq_promiscuous_disable(rxq); - rxq_mac_addrs_del(rxq); - } while ((--r) && ((rxq = (*priv->rxqs)[++i]), i)); - priv_unlock(priv); -} - -/** - * Dummy DPDK callback for TX. - * - * This function is used to temporarily replace the real callback during - * unsafe control operations on the queue, or in case of error. - * - * @param dpdk_txq - * Generic pointer to TX queue structure. - * @param[in] pkts - * Packets to transmit. - * @param pkts_n - * Number of packets in array. - * - * @return - * Number of packets successfully transmitted (<= pkts_n). - */ -static uint16_t -removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) -{ - (void)dpdk_txq; - (void)pkts; - (void)pkts_n; - return 0; -} - -/** - * Dummy DPDK callback for RX. - * - * This function is used to temporarily replace the real callback during - * unsafe control operations on the queue, or in case of error. - * - * @param dpdk_rxq - * Generic pointer to RX queue structure. - * @param[out] pkts - * Array to store received packets. - * @param pkts_n - * Maximum number of packets in array. - * - * @return - * Number of packets successfully received (<= pkts_n). - */ -static uint16_t -removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) -{ - (void)dpdk_rxq; - (void)pkts; - (void)pkts_n; - return 0; + dev->tx_pkt_burst = mlx4_tx_burst_removed; + dev->rx_pkt_burst = mlx4_rx_burst_removed; + rte_wmb(); + mlx4_flow_sync(priv, NULL); + mlx4_intr_uninstall(priv); + mlx4_rss_deinit(priv); } -static int -priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *); - -static int -priv_dev_removal_interrupt_handler_uninstall(struct priv *, - struct rte_eth_dev *); - -static int -priv_dev_link_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *); - /** * DPDK callback to close the device. * @@ -4327,1047 +202,58 @@ priv_dev_link_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *); static void mlx4_dev_close(struct rte_eth_dev *dev) { - struct priv *priv = mlx4_get_priv(dev); - void *tmp; + struct priv *priv = dev->data->dev_private; unsigned int i; - if (priv == NULL) - return; - priv_lock(priv); DEBUG("%p: closing device \"%s\"", (void *)dev, ((priv->ctx != NULL) ? priv->ctx->device->name : "")); - /* Prevent crashes when queues are still in use. This is unfortunately - * still required for DPDK 1.3 because some programs (such as testpmd) - * never release them before closing the device. */ - dev->rx_pkt_burst = removed_rx_burst; - dev->tx_pkt_burst = removed_tx_burst; - if (priv->rxqs != NULL) { - /* XXX race condition if mlx4_rx_burst() is still running. */ - usleep(1000); - for (i = 0; (i != priv->rxqs_n); ++i) { - tmp = (*priv->rxqs)[i]; - if (tmp == NULL) - continue; - (*priv->rxqs)[i] = NULL; - rxq_cleanup(tmp); - rte_free(tmp); - } - priv->rxqs_n = 0; - priv->rxqs = NULL; - } - if (priv->txqs != NULL) { - /* XXX race condition if mlx4_tx_burst() is still running. */ - usleep(1000); - for (i = 0; (i != priv->txqs_n); ++i) { - tmp = (*priv->txqs)[i]; - if (tmp == NULL) - continue; - (*priv->txqs)[i] = NULL; - txq_cleanup(tmp); - rte_free(tmp); - } - priv->txqs_n = 0; - priv->txqs = NULL; - } - if (priv->rss) - priv_parent_list_cleanup(priv); + dev->rx_pkt_burst = mlx4_rx_burst_removed; + dev->tx_pkt_burst = mlx4_tx_burst_removed; + rte_wmb(); + mlx4_flow_clean(priv); + for (i = 0; i != dev->data->nb_rx_queues; ++i) + mlx4_rx_queue_release(dev->data->rx_queues[i]); + for (i = 0; i != dev->data->nb_tx_queues; ++i) + mlx4_tx_queue_release(dev->data->tx_queues[i]); if (priv->pd != NULL) { assert(priv->ctx != NULL); claim_zero(ibv_dealloc_pd(priv->pd)); claim_zero(ibv_close_device(priv->ctx)); } else assert(priv->ctx == NULL); - priv_dev_removal_interrupt_handler_uninstall(priv, dev); - priv_dev_link_interrupt_handler_uninstall(priv, dev); - priv_rx_intr_vec_disable(priv); - priv_unlock(priv); + mlx4_intr_uninstall(priv); memset(priv, 0, sizeof(*priv)); } -/** - * Change the link state (UP / DOWN). - * - * @param priv - * Pointer to Ethernet device private data. - * @param up - * Nonzero for link up, otherwise link down. - * - * @return - * 0 on success, errno value on failure. - */ -static int -priv_set_link(struct priv *priv, int up) -{ - struct rte_eth_dev *dev = priv->dev; - int err; - unsigned int i; - - if (up) { - err = priv_set_flags(priv, ~IFF_UP, IFF_UP); - if (err) - return err; - for (i = 0; i < priv->rxqs_n; i++) - if ((*priv->rxqs)[i]->sp) - break; - /* Check if an sp queue exists. - * Note: Some old frames might be received. - */ - if (i == priv->rxqs_n) - dev->rx_pkt_burst = mlx4_rx_burst; - else - dev->rx_pkt_burst = mlx4_rx_burst_sp; - dev->tx_pkt_burst = mlx4_tx_burst; - } else { - err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP); - if (err) - return err; - dev->rx_pkt_burst = removed_rx_burst; - dev->tx_pkt_burst = removed_tx_burst; - } - return 0; -} - -/** - * DPDK callback to bring the link DOWN. - * - * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success, errno value on failure. - */ -static int -mlx4_set_link_down(struct rte_eth_dev *dev) -{ - struct priv *priv = dev->data->dev_private; - int err; - - priv_lock(priv); - err = priv_set_link(priv, 0); - priv_unlock(priv); - return err; -} - -/** - * DPDK callback to bring the link UP. - * - * @param dev - * Pointer to Ethernet device structure. - * - * @return - * 0 on success, errno value on failure. - */ -static int -mlx4_set_link_up(struct rte_eth_dev *dev) -{ - struct priv *priv = dev->data->dev_private; - int err; - - priv_lock(priv); - err = priv_set_link(priv, 1); - priv_unlock(priv); - return err; -} -/** - * DPDK callback to get information about the device. - * - * @param dev - * Pointer to Ethernet device structure. - * @param[out] info - * Info structure output buffer. - */ -static void -mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) -{ - struct priv *priv = mlx4_get_priv(dev); - unsigned int max; - char ifname[IF_NAMESIZE]; - - info->pci_dev = RTE_ETH_DEV_TO_PCI(dev); - - if (priv == NULL) - return; - priv_lock(priv); - /* FIXME: we should ask the device for these values. */ - info->min_rx_bufsize = 32; - info->max_rx_pktlen = 65536; - /* - * Since we need one CQ per QP, the limit is the minimum number - * between the two values. - */ - max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? - priv->device_attr.max_qp : priv->device_attr.max_cq); - /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ - if (max >= 65535) - max = 65535; - info->max_rx_queues = max; - info->max_tx_queues = max; - /* Last array entry is reserved for broadcast. */ - info->max_mac_addrs = (elemof(priv->mac) - 1); - info->rx_offload_capa = - (priv->hw_csum ? - (DEV_RX_OFFLOAD_IPV4_CKSUM | - DEV_RX_OFFLOAD_UDP_CKSUM | - DEV_RX_OFFLOAD_TCP_CKSUM) : - 0); - info->tx_offload_capa = - (priv->hw_csum ? - (DEV_TX_OFFLOAD_IPV4_CKSUM | - DEV_TX_OFFLOAD_UDP_CKSUM | - DEV_TX_OFFLOAD_TCP_CKSUM) : - 0); - if (priv_get_ifname(priv, &ifname) == 0) - info->if_index = if_nametoindex(ifname); - info->speed_capa = - ETH_LINK_SPEED_1G | - ETH_LINK_SPEED_10G | - ETH_LINK_SPEED_20G | - ETH_LINK_SPEED_40G | - ETH_LINK_SPEED_56G; - priv_unlock(priv); -} - -static const uint32_t * -mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev) -{ - static const uint32_t ptypes[] = { - /* refers to rxq_cq_to_pkt_type() */ - RTE_PTYPE_L3_IPV4, - RTE_PTYPE_L3_IPV6, - RTE_PTYPE_INNER_L3_IPV4, - RTE_PTYPE_INNER_L3_IPV6, - RTE_PTYPE_UNKNOWN - }; - - if (dev->rx_pkt_burst == mlx4_rx_burst || - dev->rx_pkt_burst == mlx4_rx_burst_sp) - return ptypes; - return NULL; -} - -/** - * DPDK callback to get device statistics. - * - * @param dev - * Pointer to Ethernet device structure. - * @param[out] stats - * Stats structure output buffer. - */ -static void -mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) -{ - struct priv *priv = mlx4_get_priv(dev); - struct rte_eth_stats tmp = {0}; - unsigned int i; - unsigned int idx; - - if (priv == NULL) - return; - priv_lock(priv); - /* Add software counters. */ - for (i = 0; (i != priv->rxqs_n); ++i) { - struct rxq *rxq = (*priv->rxqs)[i]; - - if (rxq == NULL) - continue; - idx = rxq->stats.idx; - if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { -#ifdef MLX4_PMD_SOFT_COUNTERS - tmp.q_ipackets[idx] += rxq->stats.ipackets; - tmp.q_ibytes[idx] += rxq->stats.ibytes; -#endif - tmp.q_errors[idx] += (rxq->stats.idropped + - rxq->stats.rx_nombuf); - } -#ifdef MLX4_PMD_SOFT_COUNTERS - tmp.ipackets += rxq->stats.ipackets; - tmp.ibytes += rxq->stats.ibytes; -#endif - tmp.ierrors += rxq->stats.idropped; - tmp.rx_nombuf += rxq->stats.rx_nombuf; - } - for (i = 0; (i != priv->txqs_n); ++i) { - struct txq *txq = (*priv->txqs)[i]; - - if (txq == NULL) - continue; - idx = txq->stats.idx; - if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { -#ifdef MLX4_PMD_SOFT_COUNTERS - tmp.q_opackets[idx] += txq->stats.opackets; - tmp.q_obytes[idx] += txq->stats.obytes; -#endif - tmp.q_errors[idx] += txq->stats.odropped; - } -#ifdef MLX4_PMD_SOFT_COUNTERS - tmp.opackets += txq->stats.opackets; - tmp.obytes += txq->stats.obytes; -#endif - tmp.oerrors += txq->stats.odropped; - } -#ifndef MLX4_PMD_SOFT_COUNTERS - /* FIXME: retrieve and add hardware counters. */ -#endif - *stats = tmp; - priv_unlock(priv); -} - -/** - * DPDK callback to clear device statistics. - * - * @param dev - * Pointer to Ethernet device structure. - */ -static void -mlx4_stats_reset(struct rte_eth_dev *dev) -{ - struct priv *priv = mlx4_get_priv(dev); - unsigned int i; - unsigned int idx; - - if (priv == NULL) - return; - priv_lock(priv); - for (i = 0; (i != priv->rxqs_n); ++i) { - if ((*priv->rxqs)[i] == NULL) - continue; - idx = (*priv->rxqs)[i]->stats.idx; - (*priv->rxqs)[i]->stats = - (struct mlx4_rxq_stats){ .idx = idx }; - } - for (i = 0; (i != priv->txqs_n); ++i) { - if ((*priv->txqs)[i] == NULL) - continue; - idx = (*priv->txqs)[i]->stats.idx; - (*priv->txqs)[i]->stats = - (struct mlx4_txq_stats){ .idx = idx }; - } -#ifndef MLX4_PMD_SOFT_COUNTERS - /* FIXME: reset hardware counters. */ -#endif - priv_unlock(priv); -} - -/** - * DPDK callback to remove a MAC address. - * - * @param dev - * Pointer to Ethernet device structure. - * @param index - * MAC address index. - */ -static void -mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) -{ - struct priv *priv = dev->data->dev_private; - - if (mlx4_is_secondary()) - return; - priv_lock(priv); - if (priv->isolated) - goto end; - DEBUG("%p: removing MAC address from index %" PRIu32, - (void *)dev, index); - /* Last array entry is reserved for broadcast. */ - if (index >= (elemof(priv->mac) - 1)) - goto end; - priv_mac_addr_del(priv, index); -end: - priv_unlock(priv); -} - -/** - * DPDK callback to add a MAC address. - * - * @param dev - * Pointer to Ethernet device structure. - * @param mac_addr - * MAC address to register. - * @param index - * MAC address index. - * @param vmdq - * VMDq pool index to associate address with (ignored). - */ -static int -mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, - uint32_t index, uint32_t vmdq) -{ - struct priv *priv = dev->data->dev_private; - int re; - - if (mlx4_is_secondary()) - return -ENOTSUP; - (void)vmdq; - priv_lock(priv); - if (priv->isolated) { - DEBUG("%p: cannot add MAC address, " - "device is in isolated mode", (void *)dev); - re = EPERM; - goto end; - } - DEBUG("%p: adding MAC address at index %" PRIu32, - (void *)dev, index); - /* Last array entry is reserved for broadcast. */ - if (index >= (elemof(priv->mac) - 1)) { - re = EINVAL; - goto end; - } - re = priv_mac_addr_add(priv, index, - (const uint8_t (*)[ETHER_ADDR_LEN]) - mac_addr->addr_bytes); -end: - priv_unlock(priv); - return -re; -} - -/** - * DPDK callback to set the primary MAC address. - * - * @param dev - * Pointer to Ethernet device structure. - * @param mac_addr - * MAC address to register. - */ -static void -mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr) -{ - DEBUG("%p: setting primary MAC address", (void *)dev); - mlx4_mac_addr_remove(dev, 0); - mlx4_mac_addr_add(dev, mac_addr, 0, 0); -} - -/** - * DPDK callback to enable promiscuous mode. - * - * @param dev - * Pointer to Ethernet device structure. - */ -static void -mlx4_promiscuous_enable(struct rte_eth_dev *dev) -{ - struct priv *priv = dev->data->dev_private; - unsigned int i; - int ret; - - if (mlx4_is_secondary()) - return; - priv_lock(priv); - if (priv->isolated) { - DEBUG("%p: cannot enable promiscuous, " - "device is in isolated mode", (void *)dev); - priv_unlock(priv); - return; - } - if (priv->promisc) { - priv_unlock(priv); - return; - } - /* If device isn't started, this is all we need to do. */ - if (!priv->started) - goto end; - if (priv->rss) { - ret = rxq_promiscuous_enable(LIST_FIRST(&priv->parents)); - if (ret) { - priv_unlock(priv); - return; - } - goto end; - } - for (i = 0; (i != priv->rxqs_n); ++i) { - if ((*priv->rxqs)[i] == NULL) - continue; - ret = rxq_promiscuous_enable((*priv->rxqs)[i]); - if (!ret) - continue; - /* Failure, rollback. */ - while (i != 0) - if ((*priv->rxqs)[--i] != NULL) - rxq_promiscuous_disable((*priv->rxqs)[i]); - priv_unlock(priv); - return; - } -end: - priv->promisc = 1; - priv_unlock(priv); -} - -/** - * DPDK callback to disable promiscuous mode. - * - * @param dev - * Pointer to Ethernet device structure. - */ -static void -mlx4_promiscuous_disable(struct rte_eth_dev *dev) -{ - struct priv *priv = dev->data->dev_private; - unsigned int i; - - if (mlx4_is_secondary()) - return; - priv_lock(priv); - if (!priv->promisc || priv->isolated) { - priv_unlock(priv); - return; - } - if (priv->rss) { - rxq_promiscuous_disable(LIST_FIRST(&priv->parents)); - goto end; - } - for (i = 0; (i != priv->rxqs_n); ++i) - if ((*priv->rxqs)[i] != NULL) - rxq_promiscuous_disable((*priv->rxqs)[i]); -end: - priv->promisc = 0; - priv_unlock(priv); -} - -/** - * DPDK callback to enable allmulti mode. - * - * @param dev - * Pointer to Ethernet device structure. - */ -static void -mlx4_allmulticast_enable(struct rte_eth_dev *dev) -{ - struct priv *priv = dev->data->dev_private; - unsigned int i; - int ret; - - if (mlx4_is_secondary()) - return; - priv_lock(priv); - if (priv->isolated) { - DEBUG("%p: cannot enable allmulticast, " - "device is in isolated mode", (void *)dev); - priv_unlock(priv); - return; - } - if (priv->allmulti) { - priv_unlock(priv); - return; - } - /* If device isn't started, this is all we need to do. */ - if (!priv->started) - goto end; - if (priv->rss) { - ret = rxq_allmulticast_enable(LIST_FIRST(&priv->parents)); - if (ret) { - priv_unlock(priv); - return; - } - goto end; - } - for (i = 0; (i != priv->rxqs_n); ++i) { - if ((*priv->rxqs)[i] == NULL) - continue; - ret = rxq_allmulticast_enable((*priv->rxqs)[i]); - if (!ret) - continue; - /* Failure, rollback. */ - while (i != 0) - if ((*priv->rxqs)[--i] != NULL) - rxq_allmulticast_disable((*priv->rxqs)[i]); - priv_unlock(priv); - return; - } -end: - priv->allmulti = 1; - priv_unlock(priv); -} - -/** - * DPDK callback to disable allmulti mode. - * - * @param dev - * Pointer to Ethernet device structure. - */ -static void -mlx4_allmulticast_disable(struct rte_eth_dev *dev) -{ - struct priv *priv = dev->data->dev_private; - unsigned int i; - - if (mlx4_is_secondary()) - return; - priv_lock(priv); - if (!priv->allmulti || priv->isolated) { - priv_unlock(priv); - return; - } - if (priv->rss) { - rxq_allmulticast_disable(LIST_FIRST(&priv->parents)); - goto end; - } - for (i = 0; (i != priv->rxqs_n); ++i) - if ((*priv->rxqs)[i] != NULL) - rxq_allmulticast_disable((*priv->rxqs)[i]); -end: - priv->allmulti = 0; - priv_unlock(priv); -} - -/** - * DPDK callback to retrieve physical link information. - * - * @param dev - * Pointer to Ethernet device structure. - * @param wait_to_complete - * Wait for request completion (ignored). - */ -static int -mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete) -{ - const struct priv *priv = mlx4_get_priv(dev); - struct ethtool_cmd edata = { - .cmd = ETHTOOL_GSET - }; - struct ifreq ifr; - struct rte_eth_link dev_link; - int link_speed = 0; - - /* priv_lock() is not taken to allow concurrent calls. */ - - if (priv == NULL) - return -EINVAL; - (void)wait_to_complete; - if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) { - WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno)); - return -1; - } - memset(&dev_link, 0, sizeof(dev_link)); - dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && - (ifr.ifr_flags & IFF_RUNNING)); - ifr.ifr_data = (void *)&edata; - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", - strerror(errno)); - return -1; - } - link_speed = ethtool_cmd_speed(&edata); - if (link_speed == -1) - dev_link.link_speed = 0; - else - dev_link.link_speed = link_speed; - dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? - ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); - dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & - ETH_LINK_SPEED_FIXED); - if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) { - /* Link status changed. */ - dev->data->dev_link = dev_link; - return 0; - } - /* Link status is still the same. */ - return -1; -} - -static int -mlx4_ibv_device_to_pci_addr(const struct ibv_device *device, - struct rte_pci_addr *pci_addr); - -/** - * DPDK callback to change the MTU. - * - * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be - * received). Use this as a hint to enable/disable scattered packets support - * and improve performance when not needed. - * Since failure is not an option, reconfiguring queues on the fly is not - * recommended. - * - * @param dev - * Pointer to Ethernet device structure. - * @param in_mtu - * New MTU. - * - * @return - * 0 on success, negative errno value on failure. - */ -static int -mlx4_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) -{ - struct priv *priv = dev->data->dev_private; - int ret = 0; - unsigned int i; - uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = - mlx4_rx_burst; - - if (mlx4_is_secondary()) - return -E_RTE_SECONDARY; - priv_lock(priv); - /* Set kernel interface MTU first. */ - if (priv_set_mtu(priv, mtu)) { - ret = errno; - WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, - strerror(ret)); - goto out; - } else - DEBUG("adapter port %u MTU set to %u", priv->port, mtu); - priv->mtu = mtu; - /* Temporarily replace RX handler with a fake one, assuming it has not - * been copied elsewhere. */ - dev->rx_pkt_burst = removed_rx_burst; - /* Make sure everyone has left mlx4_rx_burst() and uses - * removed_rx_burst() instead. */ - rte_wmb(); - usleep(1000); - /* Reconfigure each RX queue. */ - for (i = 0; (i != priv->rxqs_n); ++i) { - struct rxq *rxq = (*priv->rxqs)[i]; - unsigned int max_frame_len; - - if (rxq == NULL) - continue; - /* Calculate new maximum frame length according to MTU. */ - max_frame_len = (priv->mtu + ETHER_HDR_LEN + - (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); - /* Provide new values to rxq_setup(). */ - dev->data->dev_conf.rxmode.jumbo_frame = - (max_frame_len > ETHER_MAX_LEN); - dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; - ret = rxq_rehash(dev, rxq); - if (ret) { - /* Force SP RX if that queue requires it and abort. */ - if (rxq->sp) - rx_func = mlx4_rx_burst_sp; - break; - } - /* Reenable non-RSS queue attributes. No need to check - * for errors at this stage. */ - if (!priv->rss && !priv->isolated) { - rxq_mac_addrs_add(rxq); - if (priv->promisc) - rxq_promiscuous_enable(rxq); - if (priv->allmulti) - rxq_allmulticast_enable(rxq); - } - /* Scattered burst function takes priority. */ - if (rxq->sp) - rx_func = mlx4_rx_burst_sp; - } - /* Burst functions can now be called again. */ - rte_wmb(); - dev->rx_pkt_burst = rx_func; -out: - priv_unlock(priv); - assert(ret >= 0); - return -ret; -} - -/** - * DPDK callback to get flow control status. - * - * @param dev - * Pointer to Ethernet device structure. - * @param[out] fc_conf - * Flow control output buffer. - * - * @return - * 0 on success, negative errno value on failure. - */ -static int -mlx4_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) -{ - struct priv *priv = dev->data->dev_private; - struct ifreq ifr; - struct ethtool_pauseparam ethpause = { - .cmd = ETHTOOL_GPAUSEPARAM - }; - int ret; - - if (mlx4_is_secondary()) - return -E_RTE_SECONDARY; - ifr.ifr_data = (void *)ðpause; - priv_lock(priv); - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - ret = errno; - WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" - " failed: %s", - strerror(ret)); - goto out; - } - - fc_conf->autoneg = ethpause.autoneg; - if (ethpause.rx_pause && ethpause.tx_pause) - fc_conf->mode = RTE_FC_FULL; - else if (ethpause.rx_pause) - fc_conf->mode = RTE_FC_RX_PAUSE; - else if (ethpause.tx_pause) - fc_conf->mode = RTE_FC_TX_PAUSE; - else - fc_conf->mode = RTE_FC_NONE; - ret = 0; - -out: - priv_unlock(priv); - assert(ret >= 0); - return -ret; -} - -/** - * DPDK callback to modify flow control parameters. - * - * @param dev - * Pointer to Ethernet device structure. - * @param[in] fc_conf - * Flow control parameters. - * - * @return - * 0 on success, negative errno value on failure. - */ -static int -mlx4_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) -{ - struct priv *priv = dev->data->dev_private; - struct ifreq ifr; - struct ethtool_pauseparam ethpause = { - .cmd = ETHTOOL_SPAUSEPARAM - }; - int ret; - - if (mlx4_is_secondary()) - return -E_RTE_SECONDARY; - ifr.ifr_data = (void *)ðpause; - ethpause.autoneg = fc_conf->autoneg; - if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || - (fc_conf->mode & RTE_FC_RX_PAUSE)) - ethpause.rx_pause = 1; - else - ethpause.rx_pause = 0; - - if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || - (fc_conf->mode & RTE_FC_TX_PAUSE)) - ethpause.tx_pause = 1; - else - ethpause.tx_pause = 0; - - priv_lock(priv); - if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) { - ret = errno; - WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" - " failed: %s", - strerror(ret)); - goto out; - } - ret = 0; - -out: - priv_unlock(priv); - assert(ret >= 0); - return -ret; -} - -/** - * Configure a VLAN filter. - * - * @param dev - * Pointer to Ethernet device structure. - * @param vlan_id - * VLAN ID to filter. - * @param on - * Toggle filter. - * - * @return - * 0 on success, errno value on failure. - */ -static int -vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) -{ - struct priv *priv = dev->data->dev_private; - unsigned int i; - unsigned int j = -1; - - DEBUG("%p: %s VLAN filter ID %" PRIu16, - (void *)dev, (on ? "enable" : "disable"), vlan_id); - for (i = 0; (i != elemof(priv->vlan_filter)); ++i) { - if (!priv->vlan_filter[i].enabled) { - /* Unused index, remember it. */ - j = i; - continue; - } - if (priv->vlan_filter[i].id != vlan_id) - continue; - /* This VLAN ID is already known, use its index. */ - j = i; - break; - } - /* Check if there's room for another VLAN filter. */ - if (j == (unsigned int)-1) - return ENOMEM; - /* - * VLAN filters apply to all configured MAC addresses, flow - * specifications must be reconfigured accordingly. - */ - priv->vlan_filter[j].id = vlan_id; - if ((on) && (!priv->vlan_filter[j].enabled)) { - /* - * Filter is disabled, enable it. - * Rehashing flows in all RX queues is necessary. - */ - if (priv->rss) - rxq_mac_addrs_del(LIST_FIRST(&priv->parents)); - else - for (i = 0; (i != priv->rxqs_n); ++i) - if ((*priv->rxqs)[i] != NULL) - rxq_mac_addrs_del((*priv->rxqs)[i]); - priv->vlan_filter[j].enabled = 1; - if (priv->started) { - if (priv->rss) - rxq_mac_addrs_add(LIST_FIRST(&priv->parents)); - else - for (i = 0; (i != priv->rxqs_n); ++i) { - if ((*priv->rxqs)[i] == NULL) - continue; - rxq_mac_addrs_add((*priv->rxqs)[i]); - } - } - } else if ((!on) && (priv->vlan_filter[j].enabled)) { - /* - * Filter is enabled, disable it. - * Rehashing flows in all RX queues is necessary. - */ - if (priv->rss) - rxq_mac_addrs_del(LIST_FIRST(&priv->parents)); - else - for (i = 0; (i != priv->rxqs_n); ++i) - if ((*priv->rxqs)[i] != NULL) - rxq_mac_addrs_del((*priv->rxqs)[i]); - priv->vlan_filter[j].enabled = 0; - if (priv->started) { - if (priv->rss) - rxq_mac_addrs_add(LIST_FIRST(&priv->parents)); - else - for (i = 0; (i != priv->rxqs_n); ++i) { - if ((*priv->rxqs)[i] == NULL) - continue; - rxq_mac_addrs_add((*priv->rxqs)[i]); - } - } - } - return 0; -} - -/** - * DPDK callback to configure a VLAN filter. - * - * @param dev - * Pointer to Ethernet device structure. - * @param vlan_id - * VLAN ID to filter. - * @param on - * Toggle filter. - * - * @return - * 0 on success, negative errno value on failure. - */ -static int -mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) -{ - struct priv *priv = dev->data->dev_private; - int ret; - - if (mlx4_is_secondary()) - return -E_RTE_SECONDARY; - priv_lock(priv); - if (priv->isolated) { - DEBUG("%p: cannot set vlan filter, " - "device is in isolated mode", (void *)dev); - priv_unlock(priv); - return -EINVAL; - } - ret = vlan_filter_set(dev, vlan_id, on); - priv_unlock(priv); - assert(ret >= 0); - return -ret; -} - -const struct rte_flow_ops mlx4_flow_ops = { - .validate = mlx4_flow_validate, - .create = mlx4_flow_create, - .destroy = mlx4_flow_destroy, - .flush = mlx4_flow_flush, - .query = NULL, - .isolate = mlx4_flow_isolate, -}; - -/** - * Manage filter operations. - * - * @param dev - * Pointer to Ethernet device structure. - * @param filter_type - * Filter type. - * @param filter_op - * Operation to perform. - * @param arg - * Pointer to operation-specific structure. - * - * @return - * 0 on success, negative errno value on failure. - */ -static int -mlx4_dev_filter_ctrl(struct rte_eth_dev *dev, - enum rte_filter_type filter_type, - enum rte_filter_op filter_op, - void *arg) -{ - int ret = EINVAL; - - switch (filter_type) { - case RTE_ETH_FILTER_GENERIC: - if (filter_op != RTE_ETH_FILTER_GET) - return -EINVAL; - *(const void **)arg = &mlx4_flow_ops; - return 0; - case RTE_ETH_FILTER_FDIR: - DEBUG("%p: filter type FDIR is not supported by this PMD", - (void *)dev); - break; - default: - ERROR("%p: filter type (%d) not supported", - (void *)dev, filter_type); - break; - } - return -ret; -} - static const struct eth_dev_ops mlx4_dev_ops = { .dev_configure = mlx4_dev_configure, .dev_start = mlx4_dev_start, .dev_stop = mlx4_dev_stop, - .dev_set_link_down = mlx4_set_link_down, - .dev_set_link_up = mlx4_set_link_up, + .dev_set_link_down = mlx4_dev_set_link_down, + .dev_set_link_up = mlx4_dev_set_link_up, .dev_close = mlx4_dev_close, + .link_update = mlx4_link_update, .promiscuous_enable = mlx4_promiscuous_enable, .promiscuous_disable = mlx4_promiscuous_disable, .allmulticast_enable = mlx4_allmulticast_enable, .allmulticast_disable = mlx4_allmulticast_disable, - .link_update = mlx4_link_update, + .mac_addr_remove = mlx4_mac_addr_remove, + .mac_addr_add = mlx4_mac_addr_add, + .mac_addr_set = mlx4_mac_addr_set, .stats_get = mlx4_stats_get, .stats_reset = mlx4_stats_reset, - .queue_stats_mapping_set = NULL, .dev_infos_get = mlx4_dev_infos_get, .dev_supported_ptypes_get = mlx4_dev_supported_ptypes_get, .vlan_filter_set = mlx4_vlan_filter_set, - .vlan_tpid_set = NULL, - .vlan_strip_queue_set = NULL, - .vlan_offload_set = NULL, .rx_queue_setup = mlx4_rx_queue_setup, .tx_queue_setup = mlx4_tx_queue_setup, .rx_queue_release = mlx4_rx_queue_release, .tx_queue_release = mlx4_tx_queue_release, - .dev_led_on = NULL, - .dev_led_off = NULL, - .flow_ctrl_get = mlx4_dev_get_flow_ctrl, - .flow_ctrl_set = mlx4_dev_set_flow_ctrl, - .priority_flow_ctrl_set = NULL, - .mac_addr_remove = mlx4_mac_addr_remove, - .mac_addr_add = mlx4_mac_addr_add, - .mac_addr_set = mlx4_mac_addr_set, - .mtu_set = mlx4_dev_set_mtu, - .filter_ctrl = mlx4_dev_filter_ctrl, + .flow_ctrl_get = mlx4_flow_ctrl_get, + .flow_ctrl_set = mlx4_flow_ctrl_set, + .mtu_set = mlx4_mtu_set, + .filter_ctrl = mlx4_filter_ctrl, .rx_queue_intr_enable = mlx4_rx_intr_enable, .rx_queue_intr_disable = mlx4_rx_intr_disable, }; @@ -5381,7 +267,7 @@ static const struct eth_dev_ops mlx4_dev_ops = { * PCI bus address output buffer. * * @return - * 0 on success, -1 on failure and errno is set. + * 0 on success, negative errno value otherwise and rte_errno is set. */ static int mlx4_ibv_device_to_pci_addr(const struct ibv_device *device, @@ -5392,8 +278,10 @@ mlx4_ibv_device_to_pci_addr(const struct ibv_device *device, MKSTR(path, "%s/device/uevent", device->ibdev_path); file = fopen(path, "rb"); - if (file == NULL) - return -1; + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } while (fgets(line, sizeof(line), file) == line) { size_t len = strlen(line); int ret; @@ -5423,572 +311,48 @@ mlx4_ibv_device_to_pci_addr(const struct ibv_device *device, } /** - * Get MAC address by querying netdevice. - * - * @param[in] priv - * struct priv for the requested device. - * @param[out] mac - * MAC address output buffer. - * - * @return - * 0 on success, -1 on failure and errno is set. - */ -static int -priv_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]) -{ - struct ifreq request; - - if (priv_ifreq(priv, SIOCGIFHWADDR, &request)) - return -1; - memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); - return 0; -} - -/* Support up to 32 adapters. */ -static struct { - struct rte_pci_addr pci_addr; /* associated PCI address */ - uint32_t ports; /* physical ports bitfield. */ -} mlx4_dev[32]; - -/** - * Get device index in mlx4_dev[] from PCI bus address. - * - * @param[in] pci_addr - * PCI bus address to look for. - * - * @return - * mlx4_dev[] index on success, -1 on failure. - */ -static int -mlx4_dev_idx(struct rte_pci_addr *pci_addr) -{ - unsigned int i; - int ret = -1; - - assert(pci_addr != NULL); - for (i = 0; (i != elemof(mlx4_dev)); ++i) { - if ((mlx4_dev[i].pci_addr.domain == pci_addr->domain) && - (mlx4_dev[i].pci_addr.bus == pci_addr->bus) && - (mlx4_dev[i].pci_addr.devid == pci_addr->devid) && - (mlx4_dev[i].pci_addr.function == pci_addr->function)) - return i; - if ((mlx4_dev[i].ports == 0) && (ret == -1)) - ret = i; - } - return ret; -} - -/** - * Retrieve integer value from environment variable. - * - * @param[in] name - * Environment variable name. - * - * @return - * Integer value, 0 if the variable is not set. - */ -static int -mlx4_getenv_int(const char *name) -{ - const char *val = getenv(name); - - if (val == NULL) - return 0; - return atoi(val); -} - -static void -mlx4_dev_link_status_handler(void *); -static void -mlx4_dev_interrupt_handler(void *); - -/** - * Link/device status handler. - * - * @param priv - * Pointer to private structure. - * @param dev - * Pointer to the rte_eth_dev structure. - * @param events - * Pointer to event flags holder. - * - * @return - * Number of events - */ -static int -priv_dev_status_handler(struct priv *priv, struct rte_eth_dev *dev, - uint32_t *events) -{ - struct ibv_async_event event; - int port_change = 0; - struct rte_eth_link *link = &dev->data->dev_link; - int ret = 0; - - *events = 0; - /* Read all message and acknowledge them. */ - for (;;) { - if (ibv_get_async_event(priv->ctx, &event)) - break; - if ((event.event_type == IBV_EVENT_PORT_ACTIVE || - event.event_type == IBV_EVENT_PORT_ERR) && - (priv->intr_conf.lsc == 1)) { - port_change = 1; - ret++; - } else if (event.event_type == IBV_EVENT_DEVICE_FATAL && - priv->intr_conf.rmv == 1) { - *events |= (1 << RTE_ETH_EVENT_INTR_RMV); - ret++; - } else - DEBUG("event type %d on port %d not handled", - event.event_type, event.element.port_num); - ibv_ack_async_event(&event); - } - if (!port_change) - return ret; - mlx4_link_update(dev, 0); - if (((link->link_speed == 0) && link->link_status) || - ((link->link_speed != 0) && !link->link_status)) { - if (!priv->pending_alarm) { - /* Inconsistent status, check again later. */ - priv->pending_alarm = 1; - rte_eal_alarm_set(MLX4_ALARM_TIMEOUT_US, - mlx4_dev_link_status_handler, - dev); - } - } else { - *events |= (1 << RTE_ETH_EVENT_INTR_LSC); - } - return ret; -} - -/** - * Handle delayed link status event. - * - * @param arg - * Registered argument. - */ -static void -mlx4_dev_link_status_handler(void *arg) -{ - struct rte_eth_dev *dev = arg; - struct priv *priv = dev->data->dev_private; - uint32_t events; - int ret; - - priv_lock(priv); - assert(priv->pending_alarm == 1); - priv->pending_alarm = 0; - ret = priv_dev_status_handler(priv, dev, &events); - priv_unlock(priv); - if (ret > 0 && events & (1 << RTE_ETH_EVENT_INTR_LSC)) - _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL, - NULL); -} - -/** - * Handle interrupts from the NIC. - * - * @param[in] intr_handle - * Interrupt handler. - * @param cb_arg - * Callback argument. - */ -static void -mlx4_dev_interrupt_handler(void *cb_arg) -{ - struct rte_eth_dev *dev = cb_arg; - struct priv *priv = dev->data->dev_private; - int ret; - uint32_t ev; - int i; - - priv_lock(priv); - ret = priv_dev_status_handler(priv, dev, &ev); - priv_unlock(priv); - if (ret > 0) { - for (i = RTE_ETH_EVENT_UNKNOWN; - i < RTE_ETH_EVENT_MAX; - i++) { - if (ev & (1 << i)) { - ev &= ~(1 << i); - _rte_eth_dev_callback_process(dev, i, NULL, - NULL); - ret--; - } - } - if (ret) - WARN("%d event%s not processed", ret, - (ret > 1 ? "s were" : " was")); - } -} - -/** - * Uninstall interrupt handler. - * - * @param priv - * Pointer to private structure. - * @param dev - * Pointer to the rte_eth_dev structure. - * @return - * 0 on success, negative errno value on failure. - */ -static int -priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) -{ - int ret; - - if (priv->intr_conf.lsc || - priv->intr_conf.rmv) - return 0; - ret = rte_intr_callback_unregister(&priv->intr_handle, - mlx4_dev_interrupt_handler, - dev); - if (ret < 0) { - ERROR("rte_intr_callback_unregister failed with %d" - "%s%s%s", ret, - (errno ? " (errno: " : ""), - (errno ? strerror(errno) : ""), - (errno ? ")" : "")); - } - priv->intr_handle.fd = 0; - priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; - return ret; -} - -/** - * Install interrupt handler. - * - * @param priv - * Pointer to private structure. - * @param dev - * Pointer to the rte_eth_dev structure. - * @return - * 0 on success, negative errno value on failure. - */ -static int -priv_dev_interrupt_handler_install(struct priv *priv, - struct rte_eth_dev *dev) -{ - int flags; - int rc; - - /* Check whether the interrupt handler has already been installed - * for either type of interrupt - */ - if (priv->intr_conf.lsc && - priv->intr_conf.rmv && - priv->intr_handle.fd) - return 0; - assert(priv->ctx->async_fd > 0); - flags = fcntl(priv->ctx->async_fd, F_GETFL); - rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); - if (rc < 0) { - INFO("failed to change file descriptor async event queue"); - dev->data->dev_conf.intr_conf.lsc = 0; - dev->data->dev_conf.intr_conf.rmv = 0; - return -errno; - } else { - priv->intr_handle.fd = priv->ctx->async_fd; - priv->intr_handle.type = RTE_INTR_HANDLE_EXT; - rc = rte_intr_callback_register(&priv->intr_handle, - mlx4_dev_interrupt_handler, - dev); - if (rc) { - ERROR("rte_intr_callback_register failed " - " (errno: %s)", strerror(errno)); - return rc; - } - } - return 0; -} - -/** - * Uninstall interrupt handler. - * - * @param priv - * Pointer to private structure. - * @param dev - * Pointer to the rte_eth_dev structure. - * @return - * 0 on success, negative value on error. - */ -static int -priv_dev_removal_interrupt_handler_uninstall(struct priv *priv, - struct rte_eth_dev *dev) -{ - if (dev->data->dev_conf.intr_conf.rmv) { - priv->intr_conf.rmv = 0; - return priv_dev_interrupt_handler_uninstall(priv, dev); - } - return 0; -} - -/** - * Uninstall interrupt handler. - * - * @param priv - * Pointer to private structure. - * @param dev - * Pointer to the rte_eth_dev structure. - * @return - * 0 on success, negative value on error, - */ -static int -priv_dev_link_interrupt_handler_uninstall(struct priv *priv, - struct rte_eth_dev *dev) -{ - int ret = 0; - - if (dev->data->dev_conf.intr_conf.lsc) { - priv->intr_conf.lsc = 0; - ret = priv_dev_interrupt_handler_uninstall(priv, dev); - if (ret) - return ret; - } - if (priv->pending_alarm) - if (rte_eal_alarm_cancel(mlx4_dev_link_status_handler, - dev)) { - ERROR("rte_eal_alarm_cancel failed " - " (errno: %s)", strerror(rte_errno)); - return -rte_errno; - } - priv->pending_alarm = 0; - return 0; -} - -/** - * Install link interrupt handler. - * - * @param priv - * Pointer to private structure. - * @param dev - * Pointer to the rte_eth_dev structure. - * @return - * 0 on success, negative value on error. - */ -static int -priv_dev_link_interrupt_handler_install(struct priv *priv, - struct rte_eth_dev *dev) -{ - int ret; - - if (dev->data->dev_conf.intr_conf.lsc) { - ret = priv_dev_interrupt_handler_install(priv, dev); - if (ret) - return ret; - priv->intr_conf.lsc = 1; - } - return 0; -} - -/** - * Install removal interrupt handler. - * - * @param priv - * Pointer to private structure. - * @param dev - * Pointer to the rte_eth_dev structure. - * @return - * 0 on success, negative value on error. - */ -static int -priv_dev_removal_interrupt_handler_install(struct priv *priv, - struct rte_eth_dev *dev) -{ - int ret; - - if (dev->data->dev_conf.intr_conf.rmv) { - ret = priv_dev_interrupt_handler_install(priv, dev); - if (ret) - return ret; - priv->intr_conf.rmv = 1; - } - return 0; -} - -/** - * Allocate queue vector and fill epoll fd list for Rx interrupts. - * - * @param priv - * Pointer to private structure. - * - * @return - * 0 on success, negative on failure. - */ -static int -priv_rx_intr_vec_enable(struct priv *priv) -{ - unsigned int i; - unsigned int rxqs_n = priv->rxqs_n; - unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); - unsigned int count = 0; - struct rte_intr_handle *intr_handle = priv->dev->intr_handle; - - if (!priv->dev->data->dev_conf.intr_conf.rxq) - return 0; - priv_rx_intr_vec_disable(priv); - intr_handle->intr_vec = malloc(sizeof(intr_handle->intr_vec[rxqs_n])); - if (intr_handle->intr_vec == NULL) { - ERROR("failed to allocate memory for interrupt vector," - " Rx interrupts will not be supported"); - return -ENOMEM; - } - intr_handle->type = RTE_INTR_HANDLE_EXT; - for (i = 0; i != n; ++i) { - struct rxq *rxq = (*priv->rxqs)[i]; - int fd; - int flags; - int rc; - - /* Skip queues that cannot request interrupts. */ - if (!rxq || !rxq->channel) { - /* Use invalid intr_vec[] index to disable entry. */ - intr_handle->intr_vec[i] = - RTE_INTR_VEC_RXTX_OFFSET + - RTE_MAX_RXTX_INTR_VEC_ID; - continue; - } - if (count >= RTE_MAX_RXTX_INTR_VEC_ID) { - ERROR("too many Rx queues for interrupt vector size" - " (%d), Rx interrupts cannot be enabled", - RTE_MAX_RXTX_INTR_VEC_ID); - priv_rx_intr_vec_disable(priv); - return -1; - } - fd = rxq->channel->fd; - flags = fcntl(fd, F_GETFL); - rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK); - if (rc < 0) { - ERROR("failed to make Rx interrupt file descriptor" - " %d non-blocking for queue index %d", fd, i); - priv_rx_intr_vec_disable(priv); - return rc; - } - intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count; - intr_handle->efds[count] = fd; - count++; - } - if (!count) - priv_rx_intr_vec_disable(priv); - else - intr_handle->nb_efd = count; - return 0; -} - -/** - * Clean up Rx interrupts handler. - * - * @param priv - * Pointer to private structure. - */ -static void -priv_rx_intr_vec_disable(struct priv *priv) -{ - struct rte_intr_handle *intr_handle = priv->dev->intr_handle; - - rte_intr_free_epoll_fd(intr_handle); - free(intr_handle->intr_vec); - intr_handle->nb_efd = 0; - intr_handle->intr_vec = NULL; -} - -/** - * DPDK callback for Rx queue interrupt enable. - * - * @param dev - * Pointer to Ethernet device structure. - * @param idx - * Rx queue index. - * - * @return - * 0 on success, negative on failure. - */ -static int -mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx) -{ - struct priv *priv = dev->data->dev_private; - struct rxq *rxq = (*priv->rxqs)[idx]; - int ret; - - if (!rxq || !rxq->channel) - ret = EINVAL; - else - ret = ibv_req_notify_cq(rxq->cq, 0); - if (ret) - WARN("unable to arm interrupt on rx queue %d", idx); - return -ret; -} - -/** - * DPDK callback for Rx queue interrupt disable. - * - * @param dev - * Pointer to Ethernet device structure. - * @param idx - * Rx queue index. - * - * @return - * 0 on success, negative on failure. - */ -static int -mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx) -{ - struct priv *priv = dev->data->dev_private; - struct rxq *rxq = (*priv->rxqs)[idx]; - struct ibv_cq *ev_cq; - void *ev_ctx; - int ret; - - if (!rxq || !rxq->channel) { - ret = EINVAL; - } else { - ret = ibv_get_cq_event(rxq->cq->channel, &ev_cq, &ev_ctx); - if (ret || ev_cq != rxq->cq) - ret = EINVAL; - } - if (ret) - WARN("unable to disable interrupt on rx queue %d", - idx); - else - ibv_ack_cq_events(rxq->cq, 1); - return -ret; -} - -/** * Verify and store value for device argument. * * @param[in] key * Key argument to verify. * @param[in] val * Value associated with key. - * @param out - * User data. + * @param[in, out] conf + * Shared configuration data. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, negative errno value otherwise and rte_errno is set. */ static int -mlx4_arg_parse(const char *key, const char *val, void *out) +mlx4_arg_parse(const char *key, const char *val, struct mlx4_conf *conf) { - struct mlx4_conf *conf = out; unsigned long tmp; errno = 0; tmp = strtoul(val, NULL, 0); if (errno) { + rte_errno = errno; WARN("%s: \"%s\" is not a valid integer", key, val); - return -errno; + return -rte_errno; } if (strcmp(MLX4_PMD_PORT_KVARG, key) == 0) { - if (tmp >= MLX4_PMD_MAX_PHYS_PORTS) { - ERROR("invalid port index %lu (max: %u)", - tmp, MLX4_PMD_MAX_PHYS_PORTS - 1); + uint32_t ports = rte_log2_u32(conf->ports.present); + + if (tmp >= ports) { + ERROR("port index %lu outside range [0,%" PRIu32 ")", + tmp, ports); return -EINVAL; } - conf->active_ports |= 1 << tmp; + if (!(conf->ports.present & (1 << tmp))) { + rte_errno = EINVAL; + ERROR("invalid port index %lu", tmp); + return -rte_errno; + } + conf->ports.enabled |= 1 << tmp; } else { + rte_errno = EINVAL; WARN("%s: unknown parameter", key); - return -EINVAL; + return -rte_errno; } return 0; } @@ -6000,7 +364,7 @@ mlx4_arg_parse(const char *key, const char *val, void *out) * Device arguments structure. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, negative errno value otherwise and rte_errno is set. */ static int mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf) @@ -6014,15 +378,21 @@ mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf) return 0; kvlist = rte_kvargs_parse(devargs->args, pmd_mlx4_init_params); if (kvlist == NULL) { + rte_errno = EINVAL; ERROR("failed to parse kvargs"); - return -EINVAL; + return -rte_errno; } /* Process parameters. */ for (i = 0; pmd_mlx4_init_params[i]; ++i) { arg_count = rte_kvargs_count(kvlist, MLX4_PMD_PORT_KVARG); while (arg_count-- > 0) { - ret = rte_kvargs_process(kvlist, MLX4_PMD_PORT_KVARG, - mlx4_arg_parse, conf); + ret = rte_kvargs_process(kvlist, + MLX4_PMD_PORT_KVARG, + (int (*)(const char *, + const char *, + void *)) + mlx4_arg_parse, + conf); if (ret != 0) goto free_kvlist; } @@ -6046,7 +416,7 @@ static struct rte_pci_driver mlx4_driver; * PCI device information. * * @return - * 0 on success, negative errno value on failure. + * 0 on success, negative errno value otherwise and rte_errno is set. */ static int mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) @@ -6057,30 +427,20 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) struct ibv_context *attr_ctx = NULL; struct ibv_device_attr device_attr; struct mlx4_conf conf = { - .active_ports = 0, + .ports.present = 0, }; unsigned int vf; - int idx; int i; (void)pci_drv; assert(pci_drv == &mlx4_driver); - /* Get mlx4_dev[] index. */ - idx = mlx4_dev_idx(&pci_dev->addr); - if (idx == -1) { - ERROR("this driver cannot support any more adapters"); - return -ENOMEM; - } - DEBUG("using driver device index %d", idx); - - /* Save PCI address. */ - mlx4_dev[idx].pci_addr = pci_dev->addr; list = ibv_get_device_list(&i); if (list == NULL) { - assert(errno); - if (errno == ENOSYS) + rte_errno = errno; + assert(rte_errno); + if (rte_errno == ENOSYS) ERROR("cannot list devices, is ib_uverbs loaded?"); - return -errno; + return -rte_errno; } assert(i >= 0); /* @@ -6111,190 +471,112 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) ibv_free_device_list(list); switch (err) { case 0: + rte_errno = ENODEV; ERROR("cannot access device, is mlx4_ib loaded?"); - return -ENODEV; + return -rte_errno; case EINVAL: + rte_errno = EINVAL; ERROR("cannot use device, are drivers up to date?"); - return -EINVAL; + return -rte_errno; } assert(err > 0); - return -err; + rte_errno = err; + return -rte_errno; } ibv_dev = list[i]; - DEBUG("device opened"); if (ibv_query_device(attr_ctx, &device_attr)) { - err = ENODEV; + rte_errno = ENODEV; goto error; } INFO("%u port(s) detected", device_attr.phys_port_cnt); - + conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1; if (mlx4_args(pci_dev->device.devargs, &conf)) { ERROR("failed to process device arguments"); - err = EINVAL; + rte_errno = EINVAL; goto error; } /* Use all ports when none are defined */ - if (conf.active_ports == 0) { - for (i = 0; i < MLX4_PMD_MAX_PHYS_PORTS; i++) - conf.active_ports |= 1 << i; - } + if (!conf.ports.enabled) + conf.ports.enabled = conf.ports.present; for (i = 0; i < device_attr.phys_port_cnt; i++) { uint32_t port = i + 1; /* ports are indexed from one */ - uint32_t test = (1 << i); struct ibv_context *ctx = NULL; struct ibv_port_attr port_attr; struct ibv_pd *pd = NULL; struct priv *priv = NULL; struct rte_eth_dev *eth_dev = NULL; -#ifdef HAVE_EXP_QUERY_DEVICE - struct ibv_exp_device_attr exp_device_attr; -#endif /* HAVE_EXP_QUERY_DEVICE */ struct ether_addr mac; - /* If port is not active, skip. */ - if (!(conf.active_ports & (1 << i))) + /* If port is not enabled, skip. */ + if (!(conf.ports.enabled & (1 << i))) continue; -#ifdef HAVE_EXP_QUERY_DEVICE - exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS; -#ifdef RSS_SUPPORT - exp_device_attr.comp_mask |= IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ; -#endif /* RSS_SUPPORT */ -#endif /* HAVE_EXP_QUERY_DEVICE */ - - DEBUG("using port %u (%08" PRIx32 ")", port, test); - + DEBUG("using port %u", port); ctx = ibv_open_device(ibv_dev); if (ctx == NULL) { - err = ENODEV; + rte_errno = ENODEV; goto port_error; } - /* Check port status. */ err = ibv_query_port(ctx, port, &port_attr); if (err) { - ERROR("port query failed: %s", strerror(err)); - err = ENODEV; + rte_errno = err; + ERROR("port query failed: %s", strerror(rte_errno)); goto port_error; } - if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + rte_errno = ENOTSUP; ERROR("port %d is not configured in Ethernet mode", port); - err = EINVAL; goto port_error; } - if (port_attr.state != IBV_PORT_ACTIVE) DEBUG("port %d is not active: \"%s\" (%d)", port, ibv_port_state_str(port_attr.state), port_attr.state); - + /* Make asynchronous FD non-blocking to handle interrupts. */ + if (mlx4_fd_set_non_blocking(ctx->async_fd) < 0) { + ERROR("cannot make asynchronous FD non-blocking: %s", + strerror(rte_errno)); + goto port_error; + } /* Allocate protection domain. */ pd = ibv_alloc_pd(ctx); if (pd == NULL) { + rte_errno = ENOMEM; ERROR("PD allocation failure"); - err = ENOMEM; goto port_error; } - - mlx4_dev[idx].ports |= test; - /* from rte_ethdev.c */ priv = rte_zmalloc("ethdev private structure", sizeof(*priv), RTE_CACHE_LINE_SIZE); if (priv == NULL) { + rte_errno = ENOMEM; ERROR("priv allocation failure"); - err = ENOMEM; goto port_error; } - priv->ctx = ctx; priv->device_attr = device_attr; priv->port = port; priv->pd = pd; priv->mtu = ETHER_MTU; -#ifdef HAVE_EXP_QUERY_DEVICE - if (ibv_exp_query_device(ctx, &exp_device_attr)) { - ERROR("ibv_exp_query_device() failed"); - err = ENODEV; - goto port_error; - } -#ifdef RSS_SUPPORT - if ((exp_device_attr.exp_device_cap_flags & - IBV_EXP_DEVICE_QPG) && - (exp_device_attr.exp_device_cap_flags & - IBV_EXP_DEVICE_UD_RSS) && - (exp_device_attr.comp_mask & - IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ) && - (exp_device_attr.max_rss_tbl_sz > 0)) { - priv->hw_qpg = 1; - priv->hw_rss = 1; - priv->max_rss_tbl_sz = exp_device_attr.max_rss_tbl_sz; - } else { - priv->hw_qpg = 0; - priv->hw_rss = 0; - priv->max_rss_tbl_sz = 0; - } - priv->hw_tss = !!(exp_device_attr.exp_device_cap_flags & - IBV_EXP_DEVICE_UD_TSS); - DEBUG("device flags: %s%s%s", - (priv->hw_qpg ? "IBV_DEVICE_QPG " : ""), - (priv->hw_tss ? "IBV_DEVICE_TSS " : ""), - (priv->hw_rss ? "IBV_DEVICE_RSS " : "")); - if (priv->hw_rss) - DEBUG("maximum RSS indirection table size: %u", - exp_device_attr.max_rss_tbl_sz); -#endif /* RSS_SUPPORT */ - - priv->hw_csum = - ((exp_device_attr.exp_device_cap_flags & - IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT) && - (exp_device_attr.exp_device_cap_flags & - IBV_EXP_DEVICE_RX_CSUM_IP_PKT)); + priv->vf = vf; + priv->hw_csum = !!(device_attr.device_cap_flags & + IBV_DEVICE_RAW_IP_CSUM); DEBUG("checksum offloading is %ssupported", (priv->hw_csum ? "" : "not ")); - - priv->hw_csum_l2tun = !!(exp_device_attr.exp_device_cap_flags & - IBV_EXP_DEVICE_VXLAN_SUPPORT); + /* Only ConnectX-3 Pro supports tunneling. */ + priv->hw_csum_l2tun = + priv->hw_csum && + (device_attr.vendor_part_id == + PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO); DEBUG("L2 tunnel checksum offloads are %ssupported", (priv->hw_csum_l2tun ? "" : "not ")); - -#ifdef INLINE_RECV - priv->inl_recv_size = mlx4_getenv_int("MLX4_INLINE_RECV_SIZE"); - - if (priv->inl_recv_size) { - exp_device_attr.comp_mask = - IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ; - if (ibv_exp_query_device(ctx, &exp_device_attr)) { - INFO("Couldn't query device for inline-receive" - " capabilities."); - priv->inl_recv_size = 0; - } else { - if ((unsigned)exp_device_attr.inline_recv_sz < - priv->inl_recv_size) { - INFO("Max inline-receive (%d) <" - " requested inline-receive (%u)", - exp_device_attr.inline_recv_sz, - priv->inl_recv_size); - priv->inl_recv_size = - exp_device_attr.inline_recv_sz; - } - } - INFO("Set inline receive size to %u", - priv->inl_recv_size); - } -#endif /* INLINE_RECV */ -#endif /* HAVE_EXP_QUERY_DEVICE */ - - (void)mlx4_getenv_int; - priv->vf = vf; /* Configure the first MAC address by default. */ - if (priv_get_mac(priv, &mac.addr_bytes)) { + if (mlx4_get_mac(priv, &mac.addr_bytes)) { ERROR("cannot get MAC address, is mlx4_en loaded?" - " (errno: %s)", strerror(errno)); - err = ENODEV; + " (rte_errno: %s)", strerror(rte_errno)); goto port_error; } INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", @@ -6302,18 +584,13 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) mac.addr_bytes[0], mac.addr_bytes[1], mac.addr_bytes[2], mac.addr_bytes[3], mac.addr_bytes[4], mac.addr_bytes[5]); - /* Register MAC and broadcast addresses. */ - claim_zero(priv_mac_addr_add(priv, 0, - (const uint8_t (*)[ETHER_ADDR_LEN]) - mac.addr_bytes)); - claim_zero(priv_mac_addr_add(priv, (elemof(priv->mac) - 1), - &(const uint8_t [ETHER_ADDR_LEN]) - { "\xff\xff\xff\xff\xff\xff" })); + /* Register MAC address. */ + priv->mac[0] = mac; #ifndef NDEBUG { char ifname[IF_NAMESIZE]; - if (priv_get_ifname(priv, &ifname) == 0) + if (mlx4_get_ifname(priv, &ifname) == 0) DEBUG("port %u ifname is \"%s\"", priv->port, ifname); else @@ -6321,9 +598,8 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) } #endif /* Get actual MTU if possible. */ - priv_get_mtu(priv, &priv->mtu); + mlx4_mtu_get(priv, &priv->mtu); DEBUG("port %u MTU is %u", priv->port, priv->mtu); - /* from rte_ethdev.c */ { char name[RTE_ETH_NAME_MAX_LEN]; @@ -6334,67 +610,41 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) } if (eth_dev == NULL) { ERROR("can not allocate rte ethdev"); - err = ENOMEM; + rte_errno = ENOMEM; goto port_error; } - - /* Secondary processes have to use local storage for their - * private data as well as a copy of eth_dev->data, but this - * pointer must not be modified before burst functions are - * actually called. */ - if (mlx4_is_secondary()) { - struct mlx4_secondary_data *sd = - &mlx4_secondary_data[eth_dev->data->port_id]; - - sd->primary_priv = eth_dev->data->dev_private; - if (sd->primary_priv == NULL) { - ERROR("no private data for port %u", - eth_dev->data->port_id); - err = EINVAL; - goto port_error; - } - sd->shared_dev_data = eth_dev->data; - rte_spinlock_init(&sd->lock); - memcpy(sd->data.name, sd->shared_dev_data->name, - sizeof(sd->data.name)); - sd->data.dev_private = priv; - sd->data.rx_mbuf_alloc_failed = 0; - sd->data.mtu = ETHER_MTU; - sd->data.port_id = sd->shared_dev_data->port_id; - sd->data.mac_addrs = priv->mac; - eth_dev->tx_pkt_burst = mlx4_tx_burst_secondary_setup; - eth_dev->rx_pkt_burst = mlx4_rx_burst_secondary_setup; - } else { - eth_dev->data->dev_private = priv; - eth_dev->data->mac_addrs = priv->mac; - } + eth_dev->data->dev_private = priv; + eth_dev->data->mac_addrs = priv->mac; eth_dev->device = &pci_dev->device; - rte_eth_copy_pci_info(eth_dev, pci_dev); - eth_dev->device->driver = &mlx4_driver.driver; - + /* Initialize local interrupt handle for current port. */ + priv->intr_handle = (struct rte_intr_handle){ + .fd = -1, + .type = RTE_INTR_HANDLE_EXT, + }; /* - * Copy and override interrupt handle to prevent it from - * being shared between all ethdev instances of a given PCI - * device. This is required to properly handle Rx interrupts - * on all ports. + * Override ethdev interrupt handle pointer with private + * handle instead of that of the parent PCI device used by + * default. This prevents it from being shared between all + * ports of the same PCI device since each of them is + * associated its own Verbs context. + * + * Rx interrupts in particular require this as the PMD has + * no control over the registration of queue interrupts + * besides setting up eth_dev->intr_handle, the rest is + * handled by rte_intr_rx_ctl(). */ - priv->intr_handle_dev = *eth_dev->intr_handle; - eth_dev->intr_handle = &priv->intr_handle_dev; - + eth_dev->intr_handle = &priv->intr_handle; priv->dev = eth_dev; eth_dev->dev_ops = &mlx4_dev_ops; - eth_dev->data->dev_flags |= RTE_ETH_DEV_DETACHABLE; - /* Bring Ethernet device up. */ DEBUG("forcing Ethernet interface up"); - priv_set_flags(priv, ~IFF_UP, IFF_UP); + mlx4_dev_set_link_up(priv->dev); /* Update link status once if waiting for LSC. */ if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) mlx4_link_update(eth_dev, 0); continue; - port_error: rte_free(priv); if (pd) @@ -6405,27 +655,21 @@ port_error: rte_eth_dev_release_port(eth_dev); break; } - + if (i == device_attr.phys_port_cnt) + return 0; /* * XXX if something went wrong in the loop above, there is a resource * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as * long as the dpdk does not provide a way to deallocate a ethdev and a * way to enumerate the registered ethdevs to free the previous ones. */ - - /* no port found, complain */ - if (!mlx4_dev[idx].ports) { - err = ENODEV; - goto error; - } - error: if (attr_ctx) claim_zero(ibv_close_device(attr_ctx)); if (list) ibv_free_device_list(list); - assert(err >= 0); - return -err; + assert(rte_errno >= 0); + return -rte_errno; } static const struct rte_pci_id mlx4_pci_id_map[] = { @@ -6463,7 +707,6 @@ RTE_INIT(rte_mlx4_pmd_init); static void rte_mlx4_pmd_init(void) { - RTE_BUILD_BUG_ON(sizeof(wr_id_t) != sizeof(uint64_t)); /* * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use * huge pages. Calling ibv_fork_init() during init allows diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index c0ade4f1..3aeef87e 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -1,8 +1,8 @@ /*- * BSD LICENSE * - * Copyright 2012-2017 6WIND S.A. - * Copyright 2012-2017 Mellanox. + * Copyright 2012 6WIND S.A. + * Copyright 2012 Mellanox * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -34,29 +34,11 @@ #ifndef RTE_PMD_MLX4_H_ #define RTE_PMD_MLX4_H_ -#include <stddef.h> +#include <net/if.h> #include <stdint.h> -#include <limits.h> +#include <sys/queue.h> -/* - * Runtime logging through RTE_LOG() is enabled when not in debugging mode. - * Intermediate LOG_*() macros add the required end-of-line characters. - */ -#ifndef NDEBUG -#define INFO(...) DEBUG(__VA_ARGS__) -#define WARN(...) DEBUG(__VA_ARGS__) -#define ERROR(...) DEBUG(__VA_ARGS__) -#else -#define LOG__(level, m, ...) \ - RTE_LOG(level, PMD, MLX4_DRIVER_NAME ": " m "%c", __VA_ARGS__) -#define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n') -#define INFO(...) LOG_(INFO, __VA_ARGS__) -#define WARN(...) LOG_(WARNING, __VA_ARGS__) -#define ERROR(...) LOG_(ERR, __VA_ARGS__) -#endif - -/* Verbs header. */ -/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +/* Verbs headers do not support -pedantic. */ #ifdef PEDANTIC #pragma GCC diagnostic ignored "-Wpedantic" #endif @@ -65,36 +47,25 @@ #pragma GCC diagnostic error "-Wpedantic" #endif -/* - * Maximum number of simultaneous MAC addresses supported. - * - * According to ConnectX's Programmer Reference Manual: - * The L2 Address Match is implemented by comparing a MAC/VLAN combination - * of 128 MAC addresses and 127 VLAN values, comprising 128x127 possible - * L2 addresses. - */ -#define MLX4_MAX_MAC_ADDRESSES 128 +#include <rte_ethdev.h> +#include <rte_ether.h> +#include <rte_interrupts.h> +#include <rte_mempool.h> +#include <rte_spinlock.h> -/* Maximum number of simultaneous VLAN filters supported. See above. */ -#define MLX4_MAX_VLAN_IDS 127 +/** Maximum number of simultaneous MAC addresses. This value is arbitrary. */ +#define MLX4_MAX_MAC_ADDRESSES 128 -/* Request send completion once in every 64 sends, might be less. */ +/** Request send completion once in every 64 sends, might be less. */ #define MLX4_PMD_TX_PER_COMP_REQ 64 -/* Maximum number of physical ports. */ -#define MLX4_PMD_MAX_PHYS_PORTS 2 - -/* Maximum number of Scatter/Gather Elements per Work Request. */ -#ifndef MLX4_PMD_SGE_WR_N -#define MLX4_PMD_SGE_WR_N 4 -#endif - -/* Maximum size for inline data. */ -#ifndef MLX4_PMD_MAX_INLINE +/** Maximum size for inline data. */ #define MLX4_PMD_MAX_INLINE 0 -#endif -/* +/** Fixed RSS hash key size in bytes. Cannot be modified. */ +#define MLX4_RSS_HASH_KEY_SIZE 40 + +/** * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP * from which buffers are to be transmitted will have to be mapped by this * driver to their own Memory Region (MR). This is a slow operation. @@ -105,18 +76,10 @@ #define MLX4_PMD_TX_MP_CACHE 8 #endif -/* - * If defined, only use software counters. The PMD will never ask the hardware - * for these, and many of them won't be available. - */ -#ifndef MLX4_PMD_SOFT_COUNTERS -#define MLX4_PMD_SOFT_COUNTERS 1 -#endif - -/* Alarm timeout. */ -#define MLX4_ALARM_TIMEOUT_US 100000 +/** Interrupt alarm timeout value in microseconds. */ +#define MLX4_INTR_ALARM_TIMEOUT 100000 -/* Port parameter. */ +/** Port parameter. */ #define MLX4_PMD_PORT_KVARG "port" enum { @@ -129,258 +92,92 @@ enum { PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007, }; +/** Driver name reported to lower layers and used in log output. */ #define MLX4_DRIVER_NAME "net_mlx4" -/* Bit-field manipulation. */ -#define BITFIELD_DECLARE(bf, type, size) \ - type bf[(((size_t)(size) / (sizeof(type) * CHAR_BIT)) + \ - !!((size_t)(size) % (sizeof(type) * CHAR_BIT)))] -#define BITFIELD_DEFINE(bf, type, size) \ - BITFIELD_DECLARE((bf), type, (size)) = { 0 } -#define BITFIELD_SET(bf, b) \ - (assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)), \ - (void)((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] |= \ - ((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT))))) -#define BITFIELD_RESET(bf, b) \ - (assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)), \ - (void)((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] &= \ - ~((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT))))) -#define BITFIELD_ISSET(bf, b) \ - (assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)), \ - !!(((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] & \ - ((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT)))))) - -/* Number of elements in array. */ -#define elemof(a) (sizeof(a) / sizeof((a)[0])) - -/* Cast pointer p to structure member m to its parent structure of type t. */ -#define containerof(p, t, m) ((t *)((uint8_t *)(p) - offsetof(t, m))) - -/* Branch prediction helpers. */ -#ifndef likely -#define likely(c) __builtin_expect(!!(c), 1) -#endif -#ifndef unlikely -#define unlikely(c) __builtin_expect(!!(c), 0) -#endif - -/* Debugging */ -#ifndef NDEBUG -#include <stdio.h> -#define DEBUG__(m, ...) \ - (fprintf(stderr, "%s:%d: %s(): " m "%c", \ - __FILE__, __LINE__, __func__, __VA_ARGS__), \ - fflush(stderr), \ - (void)0) -/* - * Save/restore errno around DEBUG__(). - * XXX somewhat undefined behavior, but works. - */ -#define DEBUG_(...) \ - (errno = ((int []){ \ - *(volatile int *)&errno, \ - (DEBUG__(__VA_ARGS__), 0) \ - })[0]) -#define DEBUG(...) DEBUG_(__VA_ARGS__, '\n') -#ifndef MLX4_PMD_DEBUG_BROKEN_VERBS -#define claim_zero(...) assert((__VA_ARGS__) == 0) -#else /* MLX4_PMD_DEBUG_BROKEN_VERBS */ -#define claim_zero(...) \ - (void)(((__VA_ARGS__) == 0) || \ - DEBUG("Assertion `(" # __VA_ARGS__ ") == 0' failed (IGNORED).")) -#endif /* MLX4_PMD_DEBUG_BROKEN_VERBS */ -#define claim_nonzero(...) assert((__VA_ARGS__) != 0) -#define claim_positive(...) assert((__VA_ARGS__) >= 0) -#else /* NDEBUG */ -/* No-ops. */ -#define DEBUG(...) (void)0 -#define claim_zero(...) (__VA_ARGS__) -#define claim_nonzero(...) (__VA_ARGS__) -#define claim_positive(...) (__VA_ARGS__) -#endif /* NDEBUG */ - -struct mlx4_rxq_stats { - unsigned int idx; /**< Mapping index. */ -#ifdef MLX4_PMD_SOFT_COUNTERS - uint64_t ipackets; /**< Total of successfully received packets. */ - uint64_t ibytes; /**< Total of successfully received bytes. */ -#endif - uint64_t idropped; /**< Total of packets dropped when RX ring full. */ - uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */ -}; - -/* RX element (scattered packets). */ -struct rxq_elt_sp { - struct ibv_recv_wr wr; /* Work Request. */ - struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */ - struct rte_mbuf *bufs[MLX4_PMD_SGE_WR_N]; /* SGEs buffers. */ -}; - -/* RX element. */ -struct rxq_elt { - struct ibv_recv_wr wr; /* Work Request. */ - struct ibv_sge sge; /* Scatter/Gather Element. */ - /* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */ -}; - -/* RX queue descriptor. */ -struct rxq { - LIST_ENTRY(rxq) next; /* Used by parent queue only */ - struct priv *priv; /* Back pointer to private data. */ - struct rte_mempool *mp; /* Memory Pool for allocations. */ - struct ibv_mr *mr; /* Memory Region (for mp). */ - struct ibv_cq *cq; /* Completion Queue. */ - struct ibv_qp *qp; /* Queue Pair. */ - struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ - struct ibv_exp_cq_family *if_cq; /* CQ interface. */ - struct ibv_comp_channel *channel; - /* - * Each VLAN ID requires a separate flow steering rule. - */ - BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES); - struct ibv_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES][MLX4_MAX_VLAN_IDS]; - struct ibv_flow *promisc_flow; /* Promiscuous flow. */ - struct ibv_flow *allmulti_flow; /* Multicast flow. */ - unsigned int port_id; /* Port ID for incoming packets. */ - unsigned int elts_n; /* (*elts)[] length. */ - unsigned int elts_head; /* Current index in (*elts)[]. */ - union { - struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */ - struct rxq_elt (*no_sp)[]; /* RX elements. */ - } elts; - unsigned int sp:1; /* Use scattered RX elements. */ - unsigned int csum:1; /* Enable checksum offloading. */ - unsigned int csum_l2tun:1; /* Same for L2 tunnels. */ - struct mlx4_rxq_stats stats; /* RX queue counters. */ - unsigned int socket; /* CPU socket ID for allocations. */ - struct ibv_exp_res_domain *rd; /* Resource Domain. */ - struct { - uint16_t queues_n; - uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; - } rss; -}; - -/* TX element. */ -struct txq_elt { - struct rte_mbuf *buf; -}; - -struct mlx4_txq_stats { - unsigned int idx; /**< Mapping index. */ -#ifdef MLX4_PMD_SOFT_COUNTERS - uint64_t opackets; /**< Total of successfully sent packets. */ - uint64_t obytes; /**< Total of successfully sent bytes. */ -#endif - uint64_t odropped; /**< Total of packets not sent when TX ring full. */ -}; - -/* - * Linear buffer type. It is used when transmitting buffers with too many - * segments that do not fit the hardware queue (see max_send_sge). - * Extra segments are copied (linearized) in such buffers, replacing the - * last SGE during TX. - * The size is arbitrary but large enough to hold a jumbo frame with - * 8 segments considering mbuf.buf_len is about 2048 bytes. - */ -typedef uint8_t linear_t[16384]; +struct mlx4_drop; +struct mlx4_rss; +struct rxq; +struct txq; +struct rte_flow; -/* TX queue descriptor. */ -struct txq { - struct priv *priv; /* Back pointer to private data. */ - struct { - const struct rte_mempool *mp; /* Cached Memory Pool. */ - struct ibv_mr *mr; /* Memory Region (for mp). */ - uint32_t lkey; /* mr->lkey */ - } mp2mr[MLX4_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ - struct ibv_cq *cq; /* Completion Queue. */ - struct ibv_qp *qp; /* Queue Pair. */ - struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ - struct ibv_exp_cq_family *if_cq; /* CQ interface. */ -#if MLX4_PMD_MAX_INLINE > 0 - uint32_t max_inline; /* Max inline send size <= MLX4_PMD_MAX_INLINE. */ -#endif - unsigned int elts_n; /* (*elts)[] length. */ - struct txq_elt (*elts)[]; /* TX elements. */ - unsigned int elts_head; /* Current index in (*elts)[]. */ - unsigned int elts_tail; /* First element awaiting completion. */ - unsigned int elts_comp; /* Number of completion requests. */ - unsigned int elts_comp_cd; /* Countdown for next completion request. */ - unsigned int elts_comp_cd_init; /* Initial value for countdown. */ - struct mlx4_txq_stats stats; /* TX queue counters. */ - linear_t (*elts_linear)[]; /* Linearized buffers. */ - struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */ - unsigned int socket; /* CPU socket ID for allocations. */ - struct ibv_exp_res_domain *rd; /* Resource Domain. */ +/** Memory region descriptor. */ +struct mlx4_mr { + LIST_ENTRY(mlx4_mr) next; /**< Next entry in list. */ + uintptr_t start; /**< Base address for memory region. */ + uintptr_t end; /**< End address for memory region. */ + uint32_t lkey; /**< L_Key extracted from @p mr. */ + uint32_t refcnt; /**< Reference count for this object. */ + struct priv *priv; /**< Back pointer to private data. */ + struct ibv_mr *mr; /**< Memory region associated with @p mp. */ + struct rte_mempool *mp; /**< Target memory pool (mempool). */ }; -struct rte_flow; - +/** Private data structure. */ struct priv { - struct rte_eth_dev *dev; /* Ethernet device. */ - struct ibv_context *ctx; /* Verbs context. */ - struct ibv_device_attr device_attr; /* Device properties. */ - struct ibv_pd *pd; /* Protection Domain. */ - /* - * MAC addresses array and configuration bit-field. - * An extra entry that cannot be modified by the DPDK is reserved - * for broadcast frames (destination MAC address ff:ff:ff:ff:ff:ff). - */ - struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES]; - BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES); - /* VLAN filters. */ - struct { - unsigned int enabled:1; /* If enabled. */ - unsigned int id:12; /* VLAN ID (0-4095). */ - } vlan_filter[MLX4_MAX_VLAN_IDS]; /* VLAN filters table. */ + struct rte_eth_dev *dev; /**< Ethernet device. */ + struct ibv_context *ctx; /**< Verbs context. */ + struct ibv_device_attr device_attr; /**< Device properties. */ + struct ibv_pd *pd; /**< Protection Domain. */ /* Device properties. */ - uint16_t mtu; /* Configured MTU. */ - uint8_t port; /* Physical port number. */ - unsigned int started:1; /* Device started, flows enabled. */ - unsigned int promisc:1; /* Device in promiscuous mode. */ - unsigned int allmulti:1; /* Device receives all multicast packets. */ - unsigned int hw_qpg:1; /* QP groups are supported. */ - unsigned int hw_tss:1; /* TSS is supported. */ - unsigned int hw_rss:1; /* RSS is supported. */ - unsigned int hw_csum:1; /* Checksum offload is supported. */ - unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */ - unsigned int rss:1; /* RSS is enabled. */ - unsigned int vf:1; /* This is a VF device. */ - unsigned int pending_alarm:1; /* An alarm is pending. */ - unsigned int isolated:1; /* Toggle isolated mode. */ -#ifdef INLINE_RECV - unsigned int inl_recv_size; /* Inline recv size */ -#endif - unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */ - /* RX/TX queues. */ - unsigned int rxqs_n; /* RX queues array size. */ - unsigned int txqs_n; /* TX queues array size. */ - struct rxq *(*rxqs)[]; /* RX queues. */ - struct txq *(*txqs)[]; /* TX queues. */ - struct rte_intr_handle intr_handle_dev; /* Device interrupt handler. */ - struct rte_intr_handle intr_handle; /* Interrupt handler. */ - struct rte_flow_drop *flow_drop_queue; /* Flow drop queue. */ - LIST_HEAD(mlx4_flows, rte_flow) flows; - struct rte_intr_conf intr_conf; /* Active interrupt configuration. */ - LIST_HEAD(mlx4_parents, rxq) parents; - rte_spinlock_t lock; /* Lock for control functions. */ + uint16_t mtu; /**< Configured MTU. */ + uint8_t port; /**< Physical port number. */ + uint32_t started:1; /**< Device started, flows enabled. */ + uint32_t vf:1; /**< This is a VF device. */ + uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */ + uint32_t isolated:1; /**< Toggle isolated mode. */ + uint32_t hw_csum:1; /* Checksum offload is supported. */ + uint32_t hw_csum_l2tun:1; /* Checksum support for L2 tunnels. */ + struct rte_intr_handle intr_handle; /**< Port interrupt handle. */ + struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */ + LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */ + LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */ + LIST_HEAD(, mlx4_mr) mr; /**< Registered memory regions. */ + rte_spinlock_t mr_lock; /**< Lock for @p mr access. */ + struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES]; + /**< Configured MAC addresses. Unused entries are zeroed. */ }; -void priv_lock(struct priv *priv); -void priv_unlock(struct priv *priv); - -int -rxq_create_qp(struct rxq *rxq, - uint16_t desc, - int inactive, - int children_n, - struct rxq *rxq_parent); - -void -rxq_parent_cleanup(struct rxq *parent); - -struct rxq * -priv_parent_create(struct priv *priv, - uint16_t queues[], - uint16_t children_n); +/* mlx4_ethdev.c */ + +int mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]); +int mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]); +int mlx4_mtu_get(struct priv *priv, uint16_t *mtu); +int mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu); +int mlx4_dev_set_link_down(struct rte_eth_dev *dev); +int mlx4_dev_set_link_up(struct rte_eth_dev *dev); +void mlx4_promiscuous_enable(struct rte_eth_dev *dev); +void mlx4_promiscuous_disable(struct rte_eth_dev *dev); +void mlx4_allmulticast_enable(struct rte_eth_dev *dev); +void mlx4_allmulticast_disable(struct rte_eth_dev *dev); +void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index); +int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, + uint32_t index, uint32_t vmdq); +void mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr); +int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on); +int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats); +void mlx4_stats_reset(struct rte_eth_dev *dev); +void mlx4_dev_infos_get(struct rte_eth_dev *dev, + struct rte_eth_dev_info *info); +int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete); +int mlx4_flow_ctrl_get(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +int mlx4_flow_ctrl_set(struct rte_eth_dev *dev, + struct rte_eth_fc_conf *fc_conf); +const uint32_t *mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev); + +/* mlx4_intr.c */ + +int mlx4_intr_uninstall(struct priv *priv); +int mlx4_intr_install(struct priv *priv); +int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx); +int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx); + +/* mlx4_mr.c */ + +struct mlx4_mr *mlx4_mr_get(struct priv *priv, struct rte_mempool *mp); +void mlx4_mr_put(struct mlx4_mr *mr); +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, + uint32_t i); #endif /* RTE_PMD_MLX4_H_ */ diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c new file mode 100644 index 00000000..c2ea4db1 --- /dev/null +++ b/drivers/net/mlx4/mlx4_ethdev.c @@ -0,0 +1,1047 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Miscellaneous control operations for mlx4 driver. + */ + +#include <assert.h> +#include <dirent.h> +#include <errno.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> +#include <net/if.h> +#include <netinet/ip.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <unistd.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_bus_pci.h> +#include <rte_errno.h> +#include <rte_ethdev.h> +#include <rte_ether.h> +#include <rte_flow.h> +#include <rte_pci.h> + +#include "mlx4.h" +#include "mlx4_flow.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** + * Get interface name from private structure. + * + * @param[in] priv + * Pointer to private structure. + * @param[out] ifname + * Interface name output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]) +{ + DIR *dir; + struct dirent *dent; + unsigned int dev_type = 0; + unsigned int dev_port_prev = ~0u; + char match[IF_NAMESIZE] = ""; + + { + MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path); + + dir = opendir(path); + if (dir == NULL) { + rte_errno = errno; + return -rte_errno; + } + } + while ((dent = readdir(dir)) != NULL) { + char *name = dent->d_name; + FILE *file; + unsigned int dev_port; + int r; + + if ((name[0] == '.') && + ((name[1] == '\0') || + ((name[1] == '.') && (name[2] == '\0')))) + continue; + + MKSTR(path, "%s/device/net/%s/%s", + priv->ctx->device->ibdev_path, name, + (dev_type ? "dev_id" : "dev_port")); + + file = fopen(path, "rb"); + if (file == NULL) { + if (errno != ENOENT) + continue; + /* + * Switch to dev_id when dev_port does not exist as + * is the case with Linux kernel versions < 3.15. + */ +try_dev_id: + match[0] = '\0'; + if (dev_type) + break; + dev_type = 1; + dev_port_prev = ~0u; + rewinddir(dir); + continue; + } + r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port); + fclose(file); + if (r != 1) + continue; + /* + * Switch to dev_id when dev_port returns the same value for + * all ports. May happen when using a MOFED release older than + * 3.0 with a Linux kernel >= 3.15. + */ + if (dev_port == dev_port_prev) + goto try_dev_id; + dev_port_prev = dev_port; + if (dev_port == (priv->port - 1u)) + snprintf(match, sizeof(match), "%s", name); + } + closedir(dir); + if (match[0] == '\0') { + rte_errno = ENODEV; + return -rte_errno; + } + strncpy(*ifname, match, sizeof(*ifname)); + return 0; +} + +/** + * Read from sysfs entry. + * + * @param[in] priv + * Pointer to private structure. + * @param[in] entry + * Entry name relative to sysfs path. + * @param[out] buf + * Data output buffer. + * @param size + * Buffer size. + * + * @return + * Number of bytes read on success, negative errno value otherwise and + * rte_errno is set. + */ +static int +mlx4_sysfs_read(const struct priv *priv, const char *entry, + char *buf, size_t size) +{ + char ifname[IF_NAMESIZE]; + FILE *file; + int ret; + + ret = mlx4_get_ifname(priv, &ifname); + if (ret) + return ret; + + MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, + ifname, entry); + + file = fopen(path, "rb"); + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } + ret = fread(buf, 1, size, file); + if ((size_t)ret < size && ferror(file)) { + rte_errno = EIO; + ret = -rte_errno; + } else { + ret = size; + } + fclose(file); + return ret; +} + +/** + * Write to sysfs entry. + * + * @param[in] priv + * Pointer to private structure. + * @param[in] entry + * Entry name relative to sysfs path. + * @param[in] buf + * Data buffer. + * @param size + * Buffer size. + * + * @return + * Number of bytes written on success, negative errno value otherwise and + * rte_errno is set. + */ +static int +mlx4_sysfs_write(const struct priv *priv, const char *entry, + char *buf, size_t size) +{ + char ifname[IF_NAMESIZE]; + FILE *file; + int ret; + + ret = mlx4_get_ifname(priv, &ifname); + if (ret) + return ret; + + MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path, + ifname, entry); + + file = fopen(path, "wb"); + if (file == NULL) { + rte_errno = errno; + return -rte_errno; + } + ret = fwrite(buf, 1, size, file); + if ((size_t)ret < size || ferror(file)) { + rte_errno = EIO; + ret = -rte_errno; + } else { + ret = size; + } + fclose(file); + return ret; +} + +/** + * Get unsigned long sysfs property. + * + * @param priv + * Pointer to private structure. + * @param[in] name + * Entry name relative to sysfs path. + * @param[out] value + * Value output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value) +{ + int ret; + unsigned long value_ret; + char value_str[32]; + + ret = mlx4_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1)); + if (ret < 0) { + DEBUG("cannot read %s value from sysfs: %s", + name, strerror(rte_errno)); + return ret; + } + value_str[ret] = '\0'; + errno = 0; + value_ret = strtoul(value_str, NULL, 0); + if (errno) { + rte_errno = errno; + DEBUG("invalid %s value `%s': %s", name, value_str, + strerror(rte_errno)); + return -rte_errno; + } + *value = value_ret; + return 0; +} + +/** + * Set unsigned long sysfs property. + * + * @param priv + * Pointer to private structure. + * @param[in] name + * Entry name relative to sysfs path. + * @param value + * Value to set. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value) +{ + int ret; + MKSTR(value_str, "%lu", value); + + ret = mlx4_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1)); + if (ret < 0) { + DEBUG("cannot write %s `%s' (%lu) to sysfs: %s", + name, value_str, value, strerror(rte_errno)); + return ret; + } + return 0; +} + +/** + * Perform ifreq ioctl() on associated Ethernet device. + * + * @param[in] priv + * Pointer to private structure. + * @param req + * Request number to pass to ioctl(). + * @param[out] ifr + * Interface request structure output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_ifreq(const struct priv *priv, int req, struct ifreq *ifr) +{ + int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + int ret; + + if (sock == -1) { + rte_errno = errno; + return -rte_errno; + } + ret = mlx4_get_ifname(priv, &ifr->ifr_name); + if (!ret && ioctl(sock, req, ifr) == -1) { + rte_errno = errno; + ret = -rte_errno; + } + close(sock); + return ret; +} + +/** + * Get MAC address by querying netdevice. + * + * @param[in] priv + * Pointer to private structure. + * @param[out] mac + * MAC address output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]) +{ + struct ifreq request; + int ret = mlx4_ifreq(priv, SIOCGIFHWADDR, &request); + + if (ret) + return ret; + memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); + return 0; +} + +/** + * Get device MTU. + * + * @param priv + * Pointer to private structure. + * @param[out] mtu + * MTU value output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mtu_get(struct priv *priv, uint16_t *mtu) +{ + unsigned long ulong_mtu = 0; + int ret = mlx4_get_sysfs_ulong(priv, "mtu", &ulong_mtu); + + if (ret) + return ret; + *mtu = ulong_mtu; + return 0; +} + +/** + * DPDK callback to change the MTU. + * + * @param priv + * Pointer to Ethernet device structure. + * @param mtu + * MTU value to set. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu) +{ + struct priv *priv = dev->data->dev_private; + uint16_t new_mtu; + int ret = mlx4_set_sysfs_ulong(priv, "mtu", mtu); + + if (ret) + return ret; + ret = mlx4_mtu_get(priv, &new_mtu); + if (ret) + return ret; + if (new_mtu == mtu) { + priv->mtu = mtu; + return 0; + } + rte_errno = EINVAL; + return -rte_errno; +} + +/** + * Set device flags. + * + * @param priv + * Pointer to private structure. + * @param keep + * Bitmask for flags that must remain untouched. + * @param flags + * Bitmask for flags to modify. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_set_flags(struct priv *priv, unsigned int keep, unsigned int flags) +{ + unsigned long tmp = 0; + int ret = mlx4_get_sysfs_ulong(priv, "flags", &tmp); + + if (ret) + return ret; + tmp &= keep; + tmp |= (flags & (~keep)); + return mlx4_set_sysfs_ulong(priv, "flags", tmp); +} + +/** + * Change the link state (UP / DOWN). + * + * @param priv + * Pointer to Ethernet device private data. + * @param up + * Nonzero for link up, otherwise link down. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_dev_set_link(struct priv *priv, int up) +{ + int err; + + if (up) { + err = mlx4_set_flags(priv, ~IFF_UP, IFF_UP); + if (err) + return err; + } else { + err = mlx4_set_flags(priv, ~IFF_UP, ~IFF_UP); + if (err) + return err; + } + return 0; +} + +/** + * DPDK callback to bring the link DOWN. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_dev_set_link_down(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + + return mlx4_dev_set_link(priv, 0); +} + +/** + * DPDK callback to bring the link UP. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_dev_set_link_up(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + + return mlx4_dev_set_link(priv, 1); +} + +/** + * Supported Rx mode toggles. + * + * Even and odd values respectively stand for off and on. + */ +enum rxmode_toggle { + RXMODE_TOGGLE_PROMISC_OFF, + RXMODE_TOGGLE_PROMISC_ON, + RXMODE_TOGGLE_ALLMULTI_OFF, + RXMODE_TOGGLE_ALLMULTI_ON, +}; + +/** + * Helper function to toggle promiscuous and all multicast modes. + * + * @param dev + * Pointer to Ethernet device structure. + * @param toggle + * Toggle to set. + */ +static void +mlx4_rxmode_toggle(struct rte_eth_dev *dev, enum rxmode_toggle toggle) +{ + struct priv *priv = dev->data->dev_private; + const char *mode; + struct rte_flow_error error; + + switch (toggle) { + case RXMODE_TOGGLE_PROMISC_OFF: + case RXMODE_TOGGLE_PROMISC_ON: + mode = "promiscuous"; + dev->data->promiscuous = toggle & 1; + break; + case RXMODE_TOGGLE_ALLMULTI_OFF: + case RXMODE_TOGGLE_ALLMULTI_ON: + mode = "all multicast"; + dev->data->all_multicast = toggle & 1; + break; + } + if (!mlx4_flow_sync(priv, &error)) + return; + ERROR("cannot toggle %s mode (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + mode, rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); +} + +/** + * DPDK callback to enable promiscuous mode. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_promiscuous_enable(struct rte_eth_dev *dev) +{ + mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_ON); +} + +/** + * DPDK callback to disable promiscuous mode. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_promiscuous_disable(struct rte_eth_dev *dev) +{ + mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_OFF); +} + +/** + * DPDK callback to enable all multicast mode. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_allmulticast_enable(struct rte_eth_dev *dev) +{ + mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_ON); +} + +/** + * DPDK callback to disable all multicast mode. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_allmulticast_disable(struct rte_eth_dev *dev) +{ + mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_OFF); +} + +/** + * DPDK callback to remove a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param index + * MAC address index. + */ +void +mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) +{ + struct priv *priv = dev->data->dev_private; + struct rte_flow_error error; + + if (index >= RTE_DIM(priv->mac)) { + rte_errno = EINVAL; + return; + } + memset(&priv->mac[index], 0, sizeof(priv->mac[index])); + if (!mlx4_flow_sync(priv, &error)) + return; + ERROR("failed to synchronize flow rules after removing MAC address" + " at index %d (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + index, rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); +} + +/** + * DPDK callback to add a MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + * @param index + * MAC address index. + * @param vmdq + * VMDq pool index to associate address with (ignored). + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, + uint32_t index, uint32_t vmdq) +{ + struct priv *priv = dev->data->dev_private; + struct rte_flow_error error; + int ret; + + (void)vmdq; + if (index >= RTE_DIM(priv->mac)) { + rte_errno = EINVAL; + return -rte_errno; + } + memcpy(&priv->mac[index], mac_addr, sizeof(priv->mac[index])); + ret = mlx4_flow_sync(priv, &error); + if (!ret) + return 0; + ERROR("failed to synchronize flow rules after adding MAC address" + " at index %d (code %d, \"%s\")," + " flow error type %d, cause %p, message: %s", + index, rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); + return ret; +} + +/** + * DPDK callback to configure a VLAN filter. + * + * @param dev + * Pointer to Ethernet device structure. + * @param vlan_id + * VLAN ID to filter. + * @param on + * Toggle filter. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) +{ + struct priv *priv = dev->data->dev_private; + struct rte_flow_error error; + unsigned int vidx = vlan_id / 64; + unsigned int vbit = vlan_id % 64; + uint64_t *v; + int ret; + + if (vidx >= RTE_DIM(dev->data->vlan_filter_conf.ids)) { + rte_errno = EINVAL; + return -rte_errno; + } + v = &dev->data->vlan_filter_conf.ids[vidx]; + *v &= ~(UINT64_C(1) << vbit); + *v |= (uint64_t)!!on << vbit; + ret = mlx4_flow_sync(priv, &error); + if (!ret) + return 0; + ERROR("failed to synchronize flow rules after %s VLAN filter on ID %u" + " (code %d, \"%s\"), " + " flow error type %d, cause %p, message: %s", + on ? "enabling" : "disabling", vlan_id, + rte_errno, strerror(rte_errno), error.type, error.cause, + error.message ? error.message : "(unspecified)"); + return ret; +} + +/** + * DPDK callback to set the primary MAC address. + * + * @param dev + * Pointer to Ethernet device structure. + * @param mac_addr + * MAC address to register. + */ +void +mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr) +{ + mlx4_mac_addr_add(dev, mac_addr, 0, 0); +} + +/** + * DPDK callback to get information about the device. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] info + * Info structure output buffer. + */ +void +mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) +{ + struct priv *priv = dev->data->dev_private; + unsigned int max; + char ifname[IF_NAMESIZE]; + + info->pci_dev = RTE_ETH_DEV_TO_PCI(dev); + /* FIXME: we should ask the device for these values. */ + info->min_rx_bufsize = 32; + info->max_rx_pktlen = 65536; + /* + * Since we need one CQ per QP, the limit is the minimum number + * between the two values. + */ + max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ? + priv->device_attr.max_qp : priv->device_attr.max_cq); + /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */ + if (max >= 65535) + max = 65535; + info->max_rx_queues = max; + info->max_tx_queues = max; + info->max_mac_addrs = RTE_DIM(priv->mac); + info->rx_offload_capa = 0; + info->tx_offload_capa = 0; + if (priv->hw_csum) { + info->tx_offload_capa |= (DEV_TX_OFFLOAD_IPV4_CKSUM | + DEV_TX_OFFLOAD_UDP_CKSUM | + DEV_TX_OFFLOAD_TCP_CKSUM); + info->rx_offload_capa |= (DEV_RX_OFFLOAD_IPV4_CKSUM | + DEV_RX_OFFLOAD_UDP_CKSUM | + DEV_RX_OFFLOAD_TCP_CKSUM); + } + if (priv->hw_csum_l2tun) + info->tx_offload_capa |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; + if (mlx4_get_ifname(priv, &ifname) == 0) + info->if_index = if_nametoindex(ifname); + info->hash_key_size = MLX4_RSS_HASH_KEY_SIZE; + info->speed_capa = + ETH_LINK_SPEED_1G | + ETH_LINK_SPEED_10G | + ETH_LINK_SPEED_20G | + ETH_LINK_SPEED_40G | + ETH_LINK_SPEED_56G; +} + +/** + * DPDK callback to get device statistics. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] stats + * Stats structure output buffer. + */ +int +mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) +{ + struct rte_eth_stats tmp; + unsigned int i; + unsigned int idx; + + memset(&tmp, 0, sizeof(tmp)); + /* Add software counters. */ + for (i = 0; i != dev->data->nb_rx_queues; ++i) { + struct rxq *rxq = dev->data->rx_queues[i]; + + if (rxq == NULL) + continue; + idx = rxq->stats.idx; + if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { + tmp.q_ipackets[idx] += rxq->stats.ipackets; + tmp.q_ibytes[idx] += rxq->stats.ibytes; + tmp.q_errors[idx] += (rxq->stats.idropped + + rxq->stats.rx_nombuf); + } + tmp.ipackets += rxq->stats.ipackets; + tmp.ibytes += rxq->stats.ibytes; + tmp.ierrors += rxq->stats.idropped; + tmp.rx_nombuf += rxq->stats.rx_nombuf; + } + for (i = 0; i != dev->data->nb_tx_queues; ++i) { + struct txq *txq = dev->data->tx_queues[i]; + + if (txq == NULL) + continue; + idx = txq->stats.idx; + if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) { + tmp.q_opackets[idx] += txq->stats.opackets; + tmp.q_obytes[idx] += txq->stats.obytes; + tmp.q_errors[idx] += txq->stats.odropped; + } + tmp.opackets += txq->stats.opackets; + tmp.obytes += txq->stats.obytes; + tmp.oerrors += txq->stats.odropped; + } + *stats = tmp; + return 0; +} + +/** + * DPDK callback to clear device statistics. + * + * @param dev + * Pointer to Ethernet device structure. + */ +void +mlx4_stats_reset(struct rte_eth_dev *dev) +{ + unsigned int i; + + for (i = 0; i != dev->data->nb_rx_queues; ++i) { + struct rxq *rxq = dev->data->rx_queues[i]; + + if (rxq) + rxq->stats = (struct mlx4_rxq_stats){ + .idx = rxq->stats.idx, + }; + } + for (i = 0; i != dev->data->nb_tx_queues; ++i) { + struct txq *txq = dev->data->tx_queues[i]; + + if (txq) + txq->stats = (struct mlx4_txq_stats){ + .idx = txq->stats.idx, + }; + } +} + +/** + * DPDK callback to retrieve physical link information. + * + * @param dev + * Pointer to Ethernet device structure. + * @param wait_to_complete + * Wait for request completion (ignored). + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete) +{ + const struct priv *priv = dev->data->dev_private; + struct ethtool_cmd edata = { + .cmd = ETHTOOL_GSET, + }; + struct ifreq ifr; + struct rte_eth_link dev_link; + int link_speed = 0; + + if (priv == NULL) { + rte_errno = EINVAL; + return -rte_errno; + } + (void)wait_to_complete; + if (mlx4_ifreq(priv, SIOCGIFFLAGS, &ifr)) { + WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(rte_errno)); + return -rte_errno; + } + memset(&dev_link, 0, sizeof(dev_link)); + dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && + (ifr.ifr_flags & IFF_RUNNING)); + ifr.ifr_data = (void *)&edata; + if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) { + WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", + strerror(rte_errno)); + return -rte_errno; + } + link_speed = ethtool_cmd_speed(&edata); + if (link_speed == -1) + dev_link.link_speed = 0; + else + dev_link.link_speed = link_speed; + dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ? + ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); + dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & + ETH_LINK_SPEED_FIXED); + dev->data->dev_link = dev_link; + return 0; +} + +/** + * DPDK callback to get flow control status. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[out] fc_conf + * Flow control output buffer. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_flow_ctrl_get(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) +{ + struct priv *priv = dev->data->dev_private; + struct ifreq ifr; + struct ethtool_pauseparam ethpause = { + .cmd = ETHTOOL_GPAUSEPARAM, + }; + int ret; + + ifr.ifr_data = (void *)ðpause; + if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) { + ret = rte_errno; + WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)" + " failed: %s", + strerror(rte_errno)); + goto out; + } + fc_conf->autoneg = ethpause.autoneg; + if (ethpause.rx_pause && ethpause.tx_pause) + fc_conf->mode = RTE_FC_FULL; + else if (ethpause.rx_pause) + fc_conf->mode = RTE_FC_RX_PAUSE; + else if (ethpause.tx_pause) + fc_conf->mode = RTE_FC_TX_PAUSE; + else + fc_conf->mode = RTE_FC_NONE; + ret = 0; +out: + assert(ret >= 0); + return -ret; +} + +/** + * DPDK callback to modify flow control parameters. + * + * @param dev + * Pointer to Ethernet device structure. + * @param[in] fc_conf + * Flow control parameters. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) +{ + struct priv *priv = dev->data->dev_private; + struct ifreq ifr; + struct ethtool_pauseparam ethpause = { + .cmd = ETHTOOL_SPAUSEPARAM, + }; + int ret; + + ifr.ifr_data = (void *)ðpause; + ethpause.autoneg = fc_conf->autoneg; + if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || + (fc_conf->mode & RTE_FC_RX_PAUSE)) + ethpause.rx_pause = 1; + else + ethpause.rx_pause = 0; + if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) || + (fc_conf->mode & RTE_FC_TX_PAUSE)) + ethpause.tx_pause = 1; + else + ethpause.tx_pause = 0; + if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) { + ret = rte_errno; + WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" + " failed: %s", + strerror(rte_errno)); + goto out; + } + ret = 0; +out: + assert(ret >= 0); + return -ret; +} + +/** + * DPDK callback to retrieve the received packet types that are recognized + * by the device. + * + * @param dev + * Pointer to Ethernet device structure. + * + * @return + * Pointer to an array of recognized packet types if in Rx burst mode, + * NULL otherwise. + */ +const uint32_t * +mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev) +{ + static const uint32_t ptypes[] = { + /* refers to rxq_cq_to_pkt_type() */ + RTE_PTYPE_L2_ETHER, + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_L4_FRAG, + RTE_PTYPE_L4_TCP, + RTE_PTYPE_L4_UDP, + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + RTE_PTYPE_UNKNOWN + }; + + if (dev->rx_pkt_burst == mlx4_rx_burst) + return ptypes; + return NULL; +} diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c index 925c89c5..8b87b298 100644 --- a/drivers/net/mlx4/mlx4_flow.c +++ b/drivers/net/mlx4/mlx4_flow.c @@ -2,7 +2,7 @@ * BSD LICENSE * * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,197 +31,328 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/** + * @file + * Flow API operations for mlx4 driver. + */ + +#include <arpa/inet.h> #include <assert.h> +#include <errno.h> +#include <stdalign.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <sys/queue.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif +#include <rte_byteorder.h> +#include <rte_errno.h> +#include <rte_eth_ctrl.h> +#include <rte_ethdev.h> +#include <rte_ether.h> #include <rte_flow.h> #include <rte_flow_driver.h> #include <rte_malloc.h> -/* Generated configuration header. */ -#include "mlx4_autoconf.h" - /* PMD headers. */ #include "mlx4.h" #include "mlx4_flow.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" -/** Static initializer for items. */ -#define ITEMS(...) \ +/** Static initializer for a list of subsequent item types. */ +#define NEXT_ITEM(...) \ (const enum rte_flow_item_type []){ \ __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \ } -/** Structure to generate a simple graph of layers supported by the NIC. */ -struct mlx4_flow_items { - /** List of possible actions for these items. */ - const enum rte_flow_action_type *const actions; - /** Bit-masks corresponding to the possibilities for the item. */ - const void *mask; - /** - * Default bit-masks to use when item->mask is not provided. When - * \default_mask is also NULL, the full supported bit-mask (\mask) is - * used instead. - */ - const void *default_mask; - /** Bit-masks size in bytes. */ +/** Processor structure associated with a flow item. */ +struct mlx4_flow_proc_item { + /** Bit-mask for fields supported by this PMD. */ + const void *mask_support; + /** Bit-mask to use when @p item->mask is not provided. */ + const void *mask_default; + /** Size in bytes for @p mask_support and @p mask_default. */ const unsigned int mask_sz; - /** - * Check support for a given item. - * - * @param item[in] - * Item specification. - * @param mask[in] - * Bit-masks covering supported fields to compare with spec, - * last and mask in - * \item. - * @param size - * Bit-Mask size in bytes. - * - * @return - * 0 on success, negative value otherwise. - */ - int (*validate)(const struct rte_flow_item *item, - const uint8_t *mask, unsigned int size); - /** - * Conversion function from rte_flow to NIC specific flow. - * - * @param item - * rte_flow item to convert. - * @param default_mask - * Default bit-masks to use when item->mask is not provided. - * @param data - * Internal structure to store the conversion. - * - * @return - * 0 on success, negative value otherwise. - */ - int (*convert)(const struct rte_flow_item *item, - const void *default_mask, - void *data); + /** Merge a pattern item into a flow rule handle. */ + int (*merge)(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error); /** Size in bytes of the destination structure. */ const unsigned int dst_sz; - /** List of possible following items. */ - const enum rte_flow_item_type *const items; + /** List of possible subsequent items. */ + const enum rte_flow_item_type *const next_item; }; -struct rte_flow_drop { - struct ibv_qp *qp; /**< Verbs queue pair. */ - struct ibv_cq *cq; /**< Verbs completion queue. */ +/** Shared resources for drop flow rules. */ +struct mlx4_drop { + struct ibv_qp *qp; /**< QP target. */ + struct ibv_cq *cq; /**< CQ associated with above QP. */ + struct priv *priv; /**< Back pointer to private data. */ + uint32_t refcnt; /**< Reference count. */ }; -/** Valid action for this PMD. */ -static const enum rte_flow_action_type valid_actions[] = { - RTE_FLOW_ACTION_TYPE_DROP, - RTE_FLOW_ACTION_TYPE_QUEUE, - RTE_FLOW_ACTION_TYPE_RSS, - RTE_FLOW_ACTION_TYPE_END, -}; +/** + * Convert DPDK RSS hash fields to their Verbs equivalent. + * + * @param rss_hf + * Hash fields in DPDK format (see struct rte_eth_rss_conf). + * + * @return + * A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1 + * otherwise and rte_errno is set. + */ +static uint64_t +mlx4_conv_rss_hf(uint64_t rss_hf) +{ + enum { IPV4, IPV6, TCP, UDP, }; + const uint64_t in[] = { + [IPV4] = (ETH_RSS_IPV4 | + ETH_RSS_FRAG_IPV4 | + ETH_RSS_NONFRAG_IPV4_TCP | + ETH_RSS_NONFRAG_IPV4_UDP | + ETH_RSS_NONFRAG_IPV4_OTHER), + [IPV6] = (ETH_RSS_IPV6 | + ETH_RSS_FRAG_IPV6 | + ETH_RSS_NONFRAG_IPV6_TCP | + ETH_RSS_NONFRAG_IPV6_UDP | + ETH_RSS_NONFRAG_IPV6_OTHER | + ETH_RSS_IPV6_EX | + ETH_RSS_IPV6_TCP_EX | + ETH_RSS_IPV6_UDP_EX), + [TCP] = (ETH_RSS_NONFRAG_IPV4_TCP | + ETH_RSS_NONFRAG_IPV6_TCP | + ETH_RSS_IPV6_TCP_EX), + /* + * UDP support is temporarily disabled due to an + * implementation issue in the kernel. + */ + [UDP] = 0, + }; + const uint64_t out[RTE_DIM(in)] = { + [IPV4] = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4, + [IPV6] = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6, + [TCP] = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP, + [UDP] = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP, + }; + uint64_t seen = 0; + uint64_t conv = 0; + unsigned int i; + + for (i = 0; i != RTE_DIM(in); ++i) + if (rss_hf & in[i]) { + seen |= rss_hf & in[i]; + conv |= out[i]; + } + if (!(rss_hf & ~seen)) + return conv; + rte_errno = ENOTSUP; + return (uint64_t)-1; +} /** - * Convert Ethernet item to Verbs specification. + * Merge Ethernet pattern item into flow rule handle. * - * @param item[in] - * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * Additional mlx4-specific constraints on supported fields: + * + * - No support for partial masks, except in the specific case of matching + * all multicast traffic (@p spec->dst and @p mask->dst equal to + * 01:00:00:00:00:00). + * - Not providing @p item->spec or providing an empty @p mask->dst is + * *only* supported if the rule doesn't specify additional matching + * criteria (i.e. rule is promiscuous-like). + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx4_flow_create_eth(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx4_flow_merge_eth(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) { const struct rte_flow_item_eth *spec = item->spec; - const struct rte_flow_item_eth *mask = item->mask; - struct mlx4_flow *flow = (struct mlx4_flow *)data; + const struct rte_flow_item_eth *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; struct ibv_flow_spec_eth *eth; - const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth); + const char *msg; unsigned int i; + if (!mask) { + flow->promisc = 1; + } else { + uint32_t sum_dst = 0; + uint32_t sum_src = 0; + + for (i = 0; i != sizeof(mask->dst.addr_bytes); ++i) { + sum_dst += mask->dst.addr_bytes[i]; + sum_src += mask->src.addr_bytes[i]; + } + if (sum_src) { + msg = "mlx4 does not support source MAC matching"; + goto error; + } else if (!sum_dst) { + flow->promisc = 1; + } else if (sum_dst == 1 && mask->dst.addr_bytes[0] == 1) { + if (!(spec->dst.addr_bytes[0] & 1)) { + msg = "mlx4 does not support the explicit" + " exclusion of all multicast traffic"; + goto error; + } + flow->allmulti = 1; + } else if (sum_dst != (UINT8_C(0xff) * ETHER_ADDR_LEN)) { + msg = "mlx4 does not support matching partial" + " Ethernet fields"; + goto error; + } + } + if (!flow->ibv_attr) + return 0; + if (flow->promisc) { + flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT; + return 0; + } + if (flow->allmulti) { + flow->ibv_attr->type = IBV_FLOW_ATTR_MC_DEFAULT; + return 0; + } ++flow->ibv_attr->num_of_specs; - flow->ibv_attr->priority = 2; - eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size); *eth = (struct ibv_flow_spec_eth) { .type = IBV_FLOW_SPEC_ETH, - .size = eth_size, + .size = sizeof(*eth), }; - if (!spec) { - flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT; - return 0; - } - if (!mask) - mask = default_mask; memcpy(eth->val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN); - memcpy(eth->val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN); memcpy(eth->mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN); - memcpy(eth->mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN); /* Remove unwanted bits from values. */ for (i = 0; i < ETHER_ADDR_LEN; ++i) { eth->val.dst_mac[i] &= eth->mask.dst_mac[i]; - eth->val.src_mac[i] &= eth->mask.src_mac[i]; } return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); } /** - * Convert VLAN item to Verbs specification. + * Merge VLAN pattern item into flow rule handle. * - * @param item[in] - * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * Additional mlx4-specific constraints on supported fields: + * + * - Matching *all* VLAN traffic by omitting @p item->spec or providing an + * empty @p item->mask would also include non-VLAN traffic. Doing so is + * therefore unsupported. + * - No support for partial masks. + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx4_flow_create_vlan(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx4_flow_merge_vlan(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) { const struct rte_flow_item_vlan *spec = item->spec; - const struct rte_flow_item_vlan *mask = item->mask; - struct mlx4_flow *flow = (struct mlx4_flow *)data; + const struct rte_flow_item_vlan *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; struct ibv_flow_spec_eth *eth; - const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth); + const char *msg; - eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset - eth_size); - if (!spec) + if (!mask || !mask->tci) { + msg = "mlx4 cannot match all VLAN traffic while excluding" + " non-VLAN traffic, TCI VID must be specified"; + goto error; + } + if (mask->tci != RTE_BE16(0x0fff)) { + msg = "mlx4 does not support partial TCI VID matching"; + goto error; + } + if (!flow->ibv_attr) return 0; - if (!mask) - mask = default_mask; + eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size - + sizeof(*eth)); eth->val.vlan_tag = spec->tci; eth->mask.vlan_tag = mask->tci; eth->val.vlan_tag &= eth->mask.vlan_tag; return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); } /** - * Convert IPv4 item to Verbs specification. + * Merge IPv4 pattern item into flow rule handle. * - * @param item[in] - * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * Additional mlx4-specific constraints on supported fields: + * + * - No support for partial masks. + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx4_flow_create_ipv4(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx4_flow_merge_ipv4(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) { const struct rte_flow_item_ipv4 *spec = item->spec; - const struct rte_flow_item_ipv4 *mask = item->mask; - struct mlx4_flow *flow = (struct mlx4_flow *)data; + const struct rte_flow_item_ipv4 *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; struct ibv_flow_spec_ipv4 *ipv4; - unsigned int ipv4_size = sizeof(struct ibv_flow_spec_ipv4); + const char *msg; + if (mask && + ((uint32_t)(mask->hdr.src_addr + 1) > UINT32_C(1) || + (uint32_t)(mask->hdr.dst_addr + 1) > UINT32_C(1))) { + msg = "mlx4 does not support matching partial IPv4 fields"; + goto error; + } + if (!flow->ibv_attr) + return 0; ++flow->ibv_attr->num_of_specs; - flow->ibv_attr->priority = 1; - ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size); *ipv4 = (struct ibv_flow_spec_ipv4) { .type = IBV_FLOW_SPEC_IPV4, - .size = ipv4_size, + .size = sizeof(*ipv4), }; if (!spec) return 0; @@ -229,8 +360,6 @@ mlx4_flow_create_ipv4(const struct rte_flow_item *item, .src_ip = spec->hdr.src_addr, .dst_ip = spec->hdr.dst_addr, }; - if (!mask) - mask = default_mask; ipv4->mask = (struct ibv_flow_ipv4_filter) { .src_ip = mask->hdr.src_addr, .dst_ip = mask->hdr.dst_addr, @@ -239,528 +368,504 @@ mlx4_flow_create_ipv4(const struct rte_flow_item *item, ipv4->val.src_ip &= ipv4->mask.src_ip; ipv4->val.dst_ip &= ipv4->mask.dst_ip; return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); } /** - * Convert UDP item to Verbs specification. + * Merge UDP pattern item into flow rule handle. * - * @param item[in] - * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * Additional mlx4-specific constraints on supported fields: + * + * - No support for partial masks. + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx4_flow_create_udp(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx4_flow_merge_udp(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) { const struct rte_flow_item_udp *spec = item->spec; - const struct rte_flow_item_udp *mask = item->mask; - struct mlx4_flow *flow = (struct mlx4_flow *)data; + const struct rte_flow_item_udp *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; struct ibv_flow_spec_tcp_udp *udp; - unsigned int udp_size = sizeof(struct ibv_flow_spec_tcp_udp); + const char *msg; + if (mask && + ((uint16_t)(mask->hdr.src_port + 1) > UINT16_C(1) || + (uint16_t)(mask->hdr.dst_port + 1) > UINT16_C(1))) { + msg = "mlx4 does not support matching partial UDP fields"; + goto error; + } + if (!flow->ibv_attr) + return 0; ++flow->ibv_attr->num_of_specs; - flow->ibv_attr->priority = 0; - udp = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + udp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size); *udp = (struct ibv_flow_spec_tcp_udp) { .type = IBV_FLOW_SPEC_UDP, - .size = udp_size, + .size = sizeof(*udp), }; if (!spec) return 0; udp->val.dst_port = spec->hdr.dst_port; udp->val.src_port = spec->hdr.src_port; - if (!mask) - mask = default_mask; udp->mask.dst_port = mask->hdr.dst_port; udp->mask.src_port = mask->hdr.src_port; /* Remove unwanted bits from values. */ udp->val.src_port &= udp->mask.src_port; udp->val.dst_port &= udp->mask.dst_port; return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); } /** - * Convert TCP item to Verbs specification. + * Merge TCP pattern item into flow rule handle. * - * @param item[in] - * Item specification. - * @param default_mask[in] - * Default bit-masks to use when item->mask is not provided. - * @param data[in, out] - * User structure. + * Additional mlx4-specific constraints on supported fields: + * + * - No support for partial masks. + * + * @param[in, out] flow + * Flow rule handle to update. + * @param[in] item + * Pattern item to merge. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx4_flow_create_tcp(const struct rte_flow_item *item, - const void *default_mask, - void *data) +mlx4_flow_merge_tcp(struct rte_flow *flow, + const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) { const struct rte_flow_item_tcp *spec = item->spec; - const struct rte_flow_item_tcp *mask = item->mask; - struct mlx4_flow *flow = (struct mlx4_flow *)data; + const struct rte_flow_item_tcp *mask = + spec ? (item->mask ? item->mask : proc->mask_default) : NULL; struct ibv_flow_spec_tcp_udp *tcp; - unsigned int tcp_size = sizeof(struct ibv_flow_spec_tcp_udp); + const char *msg; + if (mask && + ((uint16_t)(mask->hdr.src_port + 1) > UINT16_C(1) || + (uint16_t)(mask->hdr.dst_port + 1) > UINT16_C(1))) { + msg = "mlx4 does not support matching partial TCP fields"; + goto error; + } + if (!flow->ibv_attr) + return 0; ++flow->ibv_attr->num_of_specs; - flow->ibv_attr->priority = 0; - tcp = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + tcp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size); *tcp = (struct ibv_flow_spec_tcp_udp) { .type = IBV_FLOW_SPEC_TCP, - .size = tcp_size, + .size = sizeof(*tcp), }; if (!spec) return 0; tcp->val.dst_port = spec->hdr.dst_port; tcp->val.src_port = spec->hdr.src_port; - if (!mask) - mask = default_mask; tcp->mask.dst_port = mask->hdr.dst_port; tcp->mask.src_port = mask->hdr.src_port; /* Remove unwanted bits from values. */ tcp->val.src_port &= tcp->mask.src_port; tcp->val.dst_port &= tcp->mask.dst_port; return 0; +error: + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg); } /** - * Check support for a given item. + * Perform basic sanity checks on a pattern item. * - * @param item[in] + * @param[in] item * Item specification. - * @param mask[in] - * Bit-masks covering supported fields to compare with spec, last and mask in - * \item. - * @param size - * Bit-Mask size in bytes. + * @param[in] proc + * Associated item-processing object. + * @param[out] error + * Perform verbose error reporting if not NULL. * * @return - * 0 on success, negative value otherwise. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx4_flow_item_validate(const struct rte_flow_item *item, - const uint8_t *mask, unsigned int size) +mlx4_flow_item_check(const struct rte_flow_item *item, + const struct mlx4_flow_proc_item *proc, + struct rte_flow_error *error) { - int ret = 0; + const uint8_t *mask; + unsigned int i; + /* item->last and item->mask cannot exist without item->spec. */ if (!item->spec && (item->mask || item->last)) - return -1; - if (item->spec && !item->mask) { - unsigned int i; - const uint8_t *spec = item->spec; - - for (i = 0; i < size; ++i) - if ((spec[i] | mask[i]) != mask[i]) - return -1; - } - if (item->last && !item->mask) { - unsigned int i; - const uint8_t *spec = item->last; - - for (i = 0; i < size; ++i) - if ((spec[i] | mask[i]) != mask[i]) - return -1; - } - if (item->spec && item->last) { - uint8_t spec[size]; - uint8_t last[size]; - const uint8_t *apply = mask; - unsigned int i; - - if (item->mask) - apply = item->mask; - for (i = 0; i < size; ++i) { - spec[i] = ((const uint8_t *)item->spec)[i] & apply[i]; - last[i] = ((const uint8_t *)item->last)[i] & apply[i]; - } - ret = memcmp(spec, last, size); - } - return ret; -} - -static int -mlx4_flow_validate_eth(const struct rte_flow_item *item, - const uint8_t *mask, unsigned int size) -{ - if (item->mask) { - const struct rte_flow_item_eth *mask = item->mask; - - if (mask->dst.addr_bytes[0] != 0xff || - mask->dst.addr_bytes[1] != 0xff || - mask->dst.addr_bytes[2] != 0xff || - mask->dst.addr_bytes[3] != 0xff || - mask->dst.addr_bytes[4] != 0xff || - mask->dst.addr_bytes[5] != 0xff) - return -1; - } - return mlx4_flow_item_validate(item, mask, size); -} - -static int -mlx4_flow_validate_vlan(const struct rte_flow_item *item, - const uint8_t *mask, unsigned int size) -{ - if (item->mask) { - const struct rte_flow_item_vlan *mask = item->mask; - - if (mask->tci != 0 && - ntohs(mask->tci) != 0x0fff) - return -1; - } - return mlx4_flow_item_validate(item, mask, size); -} - -static int -mlx4_flow_validate_ipv4(const struct rte_flow_item *item, - const uint8_t *mask, unsigned int size) -{ - if (item->mask) { - const struct rte_flow_item_ipv4 *mask = item->mask; - - if (mask->hdr.src_addr != 0 && - mask->hdr.src_addr != 0xffffffff) - return -1; - if (mask->hdr.dst_addr != 0 && - mask->hdr.dst_addr != 0xffffffff) - return -1; - } - return mlx4_flow_item_validate(item, mask, size); -} - -static int -mlx4_flow_validate_udp(const struct rte_flow_item *item, - const uint8_t *mask, unsigned int size) -{ - if (item->mask) { - const struct rte_flow_item_udp *mask = item->mask; - - if (mask->hdr.src_port != 0 && - mask->hdr.src_port != 0xffff) - return -1; - if (mask->hdr.dst_port != 0 && - mask->hdr.dst_port != 0xffff) - return -1; - } - return mlx4_flow_item_validate(item, mask, size); -} - -static int -mlx4_flow_validate_tcp(const struct rte_flow_item *item, - const uint8_t *mask, unsigned int size) -{ - if (item->mask) { - const struct rte_flow_item_tcp *mask = item->mask; - - if (mask->hdr.src_port != 0 && - mask->hdr.src_port != 0xffff) - return -1; - if (mask->hdr.dst_port != 0 && - mask->hdr.dst_port != 0xffff) - return -1; + return rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, + "\"mask\" or \"last\" field provided without a" + " corresponding \"spec\""); + /* No spec, no mask, no problem. */ + if (!item->spec) + return 0; + mask = item->mask ? + (const uint8_t *)item->mask : + (const uint8_t *)proc->mask_default; + assert(mask); + /* + * Single-pass check to make sure that: + * - Mask is supported, no bits are set outside proc->mask_support. + * - Both item->spec and item->last are included in mask. + */ + for (i = 0; i != proc->mask_sz; ++i) { + if (!mask[i]) + continue; + if ((mask[i] | ((const uint8_t *)proc->mask_support)[i]) != + ((const uint8_t *)proc->mask_support)[i]) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, "unsupported field found in \"mask\""); + if (item->last && + (((const uint8_t *)item->spec)[i] & mask[i]) != + (((const uint8_t *)item->last)[i] & mask[i])) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, + "range between \"spec\" and \"last\"" + " is larger than \"mask\""); } - return mlx4_flow_item_validate(item, mask, size); + return 0; } /** Graph of supported items and associated actions. */ -static const struct mlx4_flow_items mlx4_flow_items[] = { +static const struct mlx4_flow_proc_item mlx4_flow_proc_item_list[] = { [RTE_FLOW_ITEM_TYPE_END] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH), + .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_ETH), }, [RTE_FLOW_ITEM_TYPE_ETH] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_VLAN, - RTE_FLOW_ITEM_TYPE_IPV4), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_eth){ + .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_VLAN, + RTE_FLOW_ITEM_TYPE_IPV4), + .mask_support = &(const struct rte_flow_item_eth){ + /* Only destination MAC can be matched. */ .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", - .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", }, - .default_mask = &rte_flow_item_eth_mask, + .mask_default = &rte_flow_item_eth_mask, .mask_sz = sizeof(struct rte_flow_item_eth), - .validate = mlx4_flow_validate_eth, - .convert = mlx4_flow_create_eth, + .merge = mlx4_flow_merge_eth, .dst_sz = sizeof(struct ibv_flow_spec_eth), }, [RTE_FLOW_ITEM_TYPE_VLAN] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_vlan){ - /* rte_flow_item_vlan_mask is invalid for mlx4. */ -#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN - .tci = 0x0fff, -#else - .tci = 0xff0f, -#endif + .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_IPV4), + .mask_support = &(const struct rte_flow_item_vlan){ + /* Only TCI VID matching is supported. */ + .tci = RTE_BE16(0x0fff), }, + .mask_default = &rte_flow_item_vlan_mask, .mask_sz = sizeof(struct rte_flow_item_vlan), - .validate = mlx4_flow_validate_vlan, - .convert = mlx4_flow_create_vlan, + .merge = mlx4_flow_merge_vlan, .dst_sz = 0, }, [RTE_FLOW_ITEM_TYPE_IPV4] = { - .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, - RTE_FLOW_ITEM_TYPE_TCP), - .actions = valid_actions, - .mask = &(const struct rte_flow_item_ipv4){ + .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_UDP, + RTE_FLOW_ITEM_TYPE_TCP), + .mask_support = &(const struct rte_flow_item_ipv4){ .hdr = { - .src_addr = -1, - .dst_addr = -1, + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), }, }, - .default_mask = &rte_flow_item_ipv4_mask, + .mask_default = &rte_flow_item_ipv4_mask, .mask_sz = sizeof(struct rte_flow_item_ipv4), - .validate = mlx4_flow_validate_ipv4, - .convert = mlx4_flow_create_ipv4, + .merge = mlx4_flow_merge_ipv4, .dst_sz = sizeof(struct ibv_flow_spec_ipv4), }, [RTE_FLOW_ITEM_TYPE_UDP] = { - .actions = valid_actions, - .mask = &(const struct rte_flow_item_udp){ + .mask_support = &(const struct rte_flow_item_udp){ .hdr = { - .src_port = -1, - .dst_port = -1, + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), }, }, - .default_mask = &rte_flow_item_udp_mask, + .mask_default = &rte_flow_item_udp_mask, .mask_sz = sizeof(struct rte_flow_item_udp), - .validate = mlx4_flow_validate_udp, - .convert = mlx4_flow_create_udp, + .merge = mlx4_flow_merge_udp, .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), }, [RTE_FLOW_ITEM_TYPE_TCP] = { - .actions = valid_actions, - .mask = &(const struct rte_flow_item_tcp){ + .mask_support = &(const struct rte_flow_item_tcp){ .hdr = { - .src_port = -1, - .dst_port = -1, + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), }, }, - .default_mask = &rte_flow_item_tcp_mask, + .mask_default = &rte_flow_item_tcp_mask, .mask_sz = sizeof(struct rte_flow_item_tcp), - .validate = mlx4_flow_validate_tcp, - .convert = mlx4_flow_create_tcp, + .merge = mlx4_flow_merge_tcp, .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), }, }; /** - * Validate a flow supported by the NIC. + * Make sure a flow rule is supported and initialize associated structure. * * @param priv * Pointer to private structure. * @param[in] attr * Flow rule attributes. - * @param[in] items + * @param[in] pattern * Pattern specification (list terminated by the END pattern item). * @param[in] actions * Associated actions (list terminated by the END action). * @param[out] error * Perform verbose error reporting if not NULL. - * @param[in, out] flow - * Flow structure to update. + * @param[in, out] addr + * Buffer where the resulting flow rule handle pointer must be stored. + * If NULL, stop processing after validation stage. * * @return * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -priv_flow_validate(struct priv *priv, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error, - struct mlx4_flow *flow) +mlx4_flow_prepare(struct priv *priv, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error, + struct rte_flow **addr) { - const struct mlx4_flow_items *cur_item = mlx4_flow_items; - struct mlx4_flow_action action = { - .queue = 0, - .drop = 0, - }; - - (void)priv; - if (attr->group) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_GROUP, - NULL, - "groups are not supported"); - return -rte_errno; - } - if (attr->priority) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, - NULL, - "priorities are not supported"); - return -rte_errno; - } - if (attr->egress) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, - NULL, - "egress is not supported"); - return -rte_errno; - } - if (!attr->ingress) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, - NULL, - "only ingress is supported"); - return -rte_errno; - } - /* Go over items list. */ - for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) { - const struct mlx4_flow_items *token = NULL; + const struct rte_flow_item *item; + const struct rte_flow_action *action; + const struct mlx4_flow_proc_item *proc; + struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) }; + struct rte_flow *flow = &temp; + const char *msg = NULL; + + if (attr->group) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, "groups are not supported"); + if (attr->priority > MLX4_FLOW_PRIORITY_LAST) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "maximum priority level is " + MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST)); + if (attr->egress) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, "egress is not supported"); + if (!attr->ingress) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, "only ingress is supported"); +fill: + proc = mlx4_flow_proc_item_list; + /* Go over pattern. */ + for (item = pattern; item->type; ++item) { + const struct mlx4_flow_proc_item *next = NULL; unsigned int i; int err; - if (items->type == RTE_FLOW_ITEM_TYPE_VOID) + if (item->type == RTE_FLOW_ITEM_TYPE_VOID) + continue; + if (item->type == MLX4_FLOW_ITEM_TYPE_INTERNAL) { + flow->internal = 1; continue; - /* - * The nic can support patterns with NULL eth spec only - * if eth is a single item in a rule. - */ - if (!items->spec && - items->type == RTE_FLOW_ITEM_TYPE_ETH) { - const struct rte_flow_item *next = items + 1; - - if (next->type != RTE_FLOW_ITEM_TYPE_END) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - items, - "the rule requires" - " an Ethernet spec"); - return -rte_errno; - } } - for (i = 0; - cur_item->items && - cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END; - ++i) { - if (cur_item->items[i] == items->type) { - token = &mlx4_flow_items[items->type]; + if (flow->promisc || flow->allmulti) { + msg = "mlx4 does not support additional matching" + " criteria combined with indiscriminate" + " matching on Ethernet headers"; + goto exit_item_not_supported; + } + for (i = 0; proc->next_item && proc->next_item[i]; ++i) { + if (proc->next_item[i] == item->type) { + next = &mlx4_flow_proc_item_list[item->type]; break; } } - if (!token) - goto exit_item_not_supported; - cur_item = token; - err = cur_item->validate(items, - (const uint8_t *)cur_item->mask, - cur_item->mask_sz); - if (err) + if (!next) goto exit_item_not_supported; - if (flow->ibv_attr && cur_item->convert) { - err = cur_item->convert(items, - (cur_item->default_mask ? - cur_item->default_mask : - cur_item->mask), - flow); + proc = next; + /* + * Perform basic sanity checks only once, while handle is + * not allocated. + */ + if (flow == &temp) { + err = mlx4_flow_item_check(item, proc, error); if (err) - goto exit_item_not_supported; + return err; } - flow->offset += cur_item->dst_sz; + if (proc->merge) { + err = proc->merge(flow, item, proc, error); + if (err) + return err; + } + flow->ibv_attr_size += proc->dst_sz; } - /* Go over actions list */ - for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { - if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { - continue; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { - action.drop = 1; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { - const struct rte_flow_action_queue *queue = - (const struct rte_flow_action_queue *) - actions->conf; + /* Go over actions list. */ + for (action = actions; action->type; ++action) { + switch (action->type) { + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; + const struct rte_eth_rss_conf *rss_conf; + unsigned int i; - if (!queue || (queue->index > (priv->rxqs_n - 1))) + case RTE_FLOW_ACTION_TYPE_VOID: + continue; + case RTE_FLOW_ACTION_TYPE_DROP: + flow->drop = 1; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + if (flow->rss) + break; + queue = action->conf; + if (queue->index >= priv->dev->data->nb_rx_queues) { + msg = "queue target index beyond number of" + " configured Rx queues"; goto exit_action_not_supported; - action.queue = 1; - action.queues_n = 1; - action.queues[0] = queue->index; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) { - int i; - int ierr; - const struct rte_flow_action_rss *rss = - (const struct rte_flow_action_rss *) - actions->conf; - - if (!priv->hw_rss) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "RSS cannot be used with " - "the current configuration"); - return -rte_errno; } - if (!priv->isolated) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "RSS cannot be used without " - "isolated mode"); - return -rte_errno; + flow->rss = mlx4_rss_get + (priv, 0, mlx4_rss_hash_key_default, 1, + &queue->index); + if (!flow->rss) { + msg = "not enough resources for additional" + " single-queue RSS context"; + goto exit_action_not_supported; + } + break; + case RTE_FLOW_ACTION_TYPE_RSS: + if (flow->rss) + break; + rss = action->conf; + /* Default RSS configuration if none is provided. */ + rss_conf = + rss->rss_conf ? + rss->rss_conf : + &(struct rte_eth_rss_conf){ + .rss_key = mlx4_rss_hash_key_default, + .rss_key_len = MLX4_RSS_HASH_KEY_SIZE, + .rss_hf = (ETH_RSS_IPV4 | + ETH_RSS_NONFRAG_IPV4_TCP | + ETH_RSS_IPV6 | + ETH_RSS_NONFRAG_IPV6_TCP), + }; + /* Sanity checks. */ + for (i = 0; i < rss->num; ++i) + if (rss->queue[i] >= + priv->dev->data->nb_rx_queues) + break; + if (i != rss->num) { + msg = "queue index target beyond number of" + " configured Rx queues"; + goto exit_action_not_supported; } if (!rte_is_power_of_2(rss->num)) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "the number of queues " - "should be power of two"); - return -rte_errno; + msg = "for RSS, mlx4 requires the number of" + " queues to be a power of two"; + goto exit_action_not_supported; } - if (priv->max_rss_tbl_sz < rss->num) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "the number of queues " - "is too large"); - return -rte_errno; + if (rss_conf->rss_key_len != + sizeof(flow->rss->key)) { + msg = "mlx4 supports exactly one RSS hash key" + " length: " + MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE); + goto exit_action_not_supported; } - /* checking indexes array */ - ierr = 0; - for (i = 0; i < rss->num; ++i) { - int j; - if (rss->queue[i] >= priv->rxqs_n) - ierr = 1; - /* - * Prevent the user from specifying - * the same queue twice in the RSS array. - */ - for (j = i + 1; j < rss->num && !ierr; ++j) - if (rss->queue[j] == rss->queue[i]) - ierr = 1; - if (ierr) { - rte_flow_error_set( - error, - ENOTSUP, - RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, - "RSS action only supports " - "unique queue indices " - "in a list"); - return -rte_errno; - } + for (i = 1; i < rss->num; ++i) + if (rss->queue[i] - rss->queue[i - 1] != 1) + break; + if (i != rss->num) { + msg = "mlx4 requires RSS contexts to use" + " consecutive queue indices only"; + goto exit_action_not_supported; } - action.queue = 1; - action.queues_n = rss->num; - for (i = 0; i < rss->num; ++i) - action.queues[i] = rss->queue[i]; - } else { + if (rss->queue[0] % rss->num) { + msg = "mlx4 requires the first queue of a RSS" + " context to be aligned on a multiple" + " of the context size"; + goto exit_action_not_supported; + } + flow->rss = mlx4_rss_get + (priv, mlx4_conv_rss_hf(rss_conf->rss_hf), + rss_conf->rss_key, rss->num, rss->queue); + if (!flow->rss) { + msg = "either invalid parameters or not enough" + " resources for additional multi-queue" + " RSS context"; + goto exit_action_not_supported; + } + break; + default: goto exit_action_not_supported; } } - if (!action.queue && !action.drop) { - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "no valid action"); - return -rte_errno; + if (!flow->rss && !flow->drop) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "no valid action"); + /* Validation ends here. */ + if (!addr) { + if (flow->rss) + mlx4_rss_put(flow->rss); + return 0; } + if (flow == &temp) { + /* Allocate proper handle based on collected data. */ + const struct mlx4_malloc_vec vec[] = { + { + .align = alignof(struct rte_flow), + .size = sizeof(*flow), + .addr = (void **)&flow, + }, + { + .align = alignof(struct ibv_flow_attr), + .size = temp.ibv_attr_size, + .addr = (void **)&temp.ibv_attr, + }, + }; + + if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec))) + return rte_flow_error_set + (error, -rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "flow rule handle allocation failure"); + /* Most fields will be updated by second pass. */ + *flow = (struct rte_flow){ + .ibv_attr = temp.ibv_attr, + .ibv_attr_size = sizeof(*flow->ibv_attr), + .rss = temp.rss, + }; + *flow->ibv_attr = (struct ibv_flow_attr){ + .type = IBV_FLOW_ATTR_NORMAL, + .size = sizeof(*flow->ibv_attr), + .priority = attr->priority, + .port = priv->port, + }; + goto fill; + } + *addr = flow; return 0; exit_item_not_supported: - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, - items, "item not supported"); - return -rte_errno; + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, msg ? msg : "item not supported"); exit_action_not_supported: - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, - actions, "action not supported"); - return -rte_errno; + return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + action, msg ? msg : "action not supported"); } /** @@ -769,552 +874,691 @@ exit_action_not_supported: * @see rte_flow_validate() * @see rte_flow_ops */ -int +static int mlx4_flow_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, - const struct rte_flow_item items[], + const struct rte_flow_item pattern[], const struct rte_flow_action actions[], struct rte_flow_error *error) { struct priv *priv = dev->data->dev_private; - int ret; - struct mlx4_flow flow = { .offset = sizeof(struct ibv_flow_attr) }; - - priv_lock(priv); - ret = priv_flow_validate(priv, attr, items, actions, error, &flow); - priv_unlock(priv); - return ret; -} - -/** - * Destroy a drop queue. - * - * @param priv - * Pointer to private structure. - */ -static void -mlx4_flow_destroy_drop_queue(struct priv *priv) -{ - if (priv->flow_drop_queue) { - struct rte_flow_drop *fdq = priv->flow_drop_queue; - priv->flow_drop_queue = NULL; - claim_zero(ibv_destroy_qp(fdq->qp)); - claim_zero(ibv_destroy_cq(fdq->cq)); - rte_free(fdq); - } + return mlx4_flow_prepare(priv, attr, pattern, actions, error, NULL); } /** - * Create a single drop queue for all drop flows. + * Get a drop flow rule resources instance. * * @param priv * Pointer to private structure. * * @return - * 0 on success, negative value otherwise. + * Pointer to drop flow resources on success, NULL otherwise and rte_errno + * is set. */ -static int -mlx4_flow_create_drop_queue(struct priv *priv) +static struct mlx4_drop * +mlx4_drop_get(struct priv *priv) { - struct ibv_qp *qp; - struct ibv_cq *cq; - struct rte_flow_drop *fdq; + struct mlx4_drop *drop = priv->drop; - fdq = rte_calloc(__func__, 1, sizeof(*fdq), 0); - if (!fdq) { - ERROR("Cannot allocate memory for drop struct"); - goto err; - } - cq = ibv_exp_create_cq(priv->ctx, 1, NULL, NULL, 0, - &(struct ibv_exp_cq_init_attr){ - .comp_mask = 0, - }); - if (!cq) { - ERROR("Cannot create drop CQ"); - goto err_create_cq; - } - qp = ibv_exp_create_qp(priv->ctx, - &(struct ibv_exp_qp_init_attr){ - .send_cq = cq, - .recv_cq = cq, - .cap = { - .max_recv_wr = 1, - .max_recv_sge = 1, - }, - .qp_type = IBV_QPT_RAW_PACKET, - .comp_mask = - IBV_EXP_QP_INIT_ATTR_PD | - IBV_EXP_QP_INIT_ATTR_PORT, - .pd = priv->pd, - .port_num = priv->port, - }); - if (!qp) { - ERROR("Cannot create drop QP"); - goto err_create_qp; + if (drop) { + assert(drop->refcnt); + assert(drop->priv == priv); + ++drop->refcnt; + return drop; } - *fdq = (struct rte_flow_drop){ - .qp = qp, - .cq = cq, + drop = rte_malloc(__func__, sizeof(*drop), 0); + if (!drop) + goto error; + *drop = (struct mlx4_drop){ + .priv = priv, + .refcnt = 1, }; - priv->flow_drop_queue = fdq; - return 0; -err_create_qp: - claim_zero(ibv_destroy_cq(cq)); -err_create_cq: - rte_free(fdq); -err: - return -1; + drop->cq = ibv_create_cq(priv->ctx, 1, NULL, NULL, 0); + if (!drop->cq) + goto error; + drop->qp = ibv_create_qp(priv->pd, + &(struct ibv_qp_init_attr){ + .send_cq = drop->cq, + .recv_cq = drop->cq, + .qp_type = IBV_QPT_RAW_PACKET, + }); + if (!drop->qp) + goto error; + priv->drop = drop; + return drop; +error: + if (drop->qp) + claim_zero(ibv_destroy_qp(drop->qp)); + if (drop->cq) + claim_zero(ibv_destroy_cq(drop->cq)); + if (drop) + rte_free(drop); + rte_errno = ENOMEM; + return NULL; } /** - * Get RSS parent rxq structure for given queues. + * Give back a drop flow rule resources instance. * - * Creates a new or returns an existed one. - * - * @param priv - * Pointer to private structure. - * @param queues - * queues indices array, NULL in default RSS case. - * @param children_n - * the size of queues array. - * - * @return - * Pointer to a parent rxq structure, NULL on failure. + * @param drop + * Pointer to drop flow rule resources. */ -static struct rxq * -priv_parent_get(struct priv *priv, - uint16_t queues[], - uint16_t children_n, - struct rte_flow_error *error) +static void +mlx4_drop_put(struct mlx4_drop *drop) { - unsigned int i; - struct rxq *parent; - - for (parent = LIST_FIRST(&priv->parents); - parent; - parent = LIST_NEXT(parent, next)) { - unsigned int same = 0; - unsigned int overlap = 0; - - /* - * Find out whether an appropriate parent queue already exists - * and can be reused, otherwise make sure there are no overlaps. - */ - for (i = 0; i < children_n; ++i) { - unsigned int j; - - for (j = 0; j < parent->rss.queues_n; ++j) { - if (parent->rss.queues[j] != queues[i]) - continue; - ++overlap; - if (i == j) - ++same; - } - } - if (same == children_n && - children_n == parent->rss.queues_n) - return parent; - else if (overlap) - goto error; - } - /* Exclude the cases when some QPs were created without RSS */ - for (i = 0; i < children_n; ++i) { - struct rxq *rxq = (*priv->rxqs)[queues[i]]; - if (rxq->qp) - goto error; - } - parent = priv_parent_create(priv, queues, children_n); - if (!parent) { - rte_flow_error_set(error, - ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "flow rule creation failure"); - return NULL; - } - return parent; - -error: - rte_flow_error_set(error, - EEXIST, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "sharing a queue between several" - " RSS groups is not supported"); - return NULL; + assert(drop->refcnt); + if (--drop->refcnt) + return; + drop->priv->drop = NULL; + claim_zero(ibv_destroy_qp(drop->qp)); + claim_zero(ibv_destroy_cq(drop->cq)); + rte_free(drop); } /** - * Complete flow rule creation. + * Toggle a configured flow rule. * * @param priv * Pointer to private structure. - * @param ibv_attr - * Verbs flow attributes. - * @param action - * Target action structure. + * @param flow + * Flow rule handle to toggle. + * @param enable + * Whether associated Verbs flow must be created or removed. * @param[out] error * Perform verbose error reporting if not NULL. * * @return - * A flow if the rule could be created. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static struct rte_flow * -priv_flow_create_action_queue(struct priv *priv, - struct ibv_flow_attr *ibv_attr, - struct mlx4_flow_action *action, - struct rte_flow_error *error) +static int +mlx4_flow_toggle(struct priv *priv, + struct rte_flow *flow, + int enable, + struct rte_flow_error *error) { - struct ibv_qp *qp; - struct rte_flow *rte_flow; - struct rxq *rxq_parent = NULL; + struct ibv_qp *qp = NULL; + const char *msg; + int err; - assert(priv->pd); - assert(priv->ctx); - rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow), 0); - if (!rte_flow) { - rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "cannot allocate flow memory"); - return NULL; + if (!enable) { + if (!flow->ibv_flow) + return 0; + claim_zero(ibv_destroy_flow(flow->ibv_flow)); + flow->ibv_flow = NULL; + if (flow->drop) + mlx4_drop_put(priv->drop); + else if (flow->rss) + mlx4_rss_detach(flow->rss); + return 0; } - if (action->drop) { - qp = priv->flow_drop_queue ? priv->flow_drop_queue->qp : NULL; - } else { - int ret; + assert(flow->ibv_attr); + if (!flow->internal && + !priv->isolated && + flow->ibv_attr->priority == MLX4_FLOW_PRIORITY_LAST) { + if (flow->ibv_flow) { + claim_zero(ibv_destroy_flow(flow->ibv_flow)); + flow->ibv_flow = NULL; + if (flow->drop) + mlx4_drop_put(priv->drop); + else if (flow->rss) + mlx4_rss_detach(flow->rss); + } + err = EACCES; + msg = ("priority level " + MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST) + " is reserved when not in isolated mode"); + goto error; + } + if (flow->rss) { + struct mlx4_rss *rss = flow->rss; + int missing = 0; unsigned int i; - struct rxq *rxq = NULL; - if (action->queues_n > 1) { - rxq_parent = priv_parent_get(priv, action->queues, - action->queues_n, error); - if (!rxq_parent) - goto error; + /* Stop at the first nonexistent target queue. */ + for (i = 0; i != rss->queues; ++i) + if (rss->queue_id[i] >= + priv->dev->data->nb_rx_queues || + !priv->dev->data->rx_queues[rss->queue_id[i]]) { + missing = 1; + break; + } + if (flow->ibv_flow) { + if (missing ^ !flow->drop) + return 0; + /* Verbs flow needs updating. */ + claim_zero(ibv_destroy_flow(flow->ibv_flow)); + flow->ibv_flow = NULL; + if (flow->drop) + mlx4_drop_put(priv->drop); + else + mlx4_rss_detach(rss); } - for (i = 0; i < action->queues_n; ++i) { - rxq = (*priv->rxqs)[action->queues[i]]; - /* - * In case of isolated mode we postpone - * ibv receive queue creation till the first - * rte_flow rule will be applied on that queue. - */ - if (!rxq->qp) { - assert(priv->isolated); - ret = rxq_create_qp(rxq, rxq->elts_n, - 0, 0, rxq_parent); - if (ret) { - rte_flow_error_set( - error, - ENOMEM, - RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, - "flow rule creation failure"); - goto error; - } + if (!missing) { + err = mlx4_rss_attach(rss); + if (err) { + err = -err; + msg = "cannot create indirection table or hash" + " QP to associate flow rule with"; + goto error; } + qp = rss->qp; } - qp = action->queues_n > 1 ? rxq_parent->qp : rxq->qp; - rte_flow->qp = qp; + /* A missing target queue drops traffic implicitly. */ + flow->drop = missing; } - rte_flow->ibv_attr = ibv_attr; - if (!priv->started) - return rte_flow; - rte_flow->ibv_flow = ibv_create_flow(qp, rte_flow->ibv_attr); - if (!rte_flow->ibv_flow) { - rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "flow rule creation failure"); - goto error; + if (flow->drop) { + mlx4_drop_get(priv); + if (!priv->drop) { + err = rte_errno; + msg = "resources for drop flow rule cannot be created"; + goto error; + } + qp = priv->drop->qp; } - return rte_flow; - + assert(qp); + if (flow->ibv_flow) + return 0; + flow->ibv_flow = ibv_create_flow(qp, flow->ibv_attr); + if (flow->ibv_flow) + return 0; + if (flow->drop) + mlx4_drop_put(priv->drop); + else if (flow->rss) + mlx4_rss_detach(flow->rss); + err = errno; + msg = "flow rule rejected by device"; error: - if (rxq_parent) - rxq_parent_cleanup(rxq_parent); - rte_free(rte_flow); - return NULL; + return rte_flow_error_set + (error, err, RTE_FLOW_ERROR_TYPE_HANDLE, flow, msg); } /** - * Convert a flow. - * - * @param priv - * Pointer to private structure. - * @param[in] attr - * Flow rule attributes. - * @param[in] items - * Pattern specification (list terminated by the END pattern item). - * @param[in] actions - * Associated actions (list terminated by the END action). - * @param[out] error - * Perform verbose error reporting if not NULL. + * Create a flow. * - * @return - * A flow on success, NULL otherwise. + * @see rte_flow_create() + * @see rte_flow_ops */ static struct rte_flow * -priv_flow_create(struct priv *priv, +mlx4_flow_create(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, - const struct rte_flow_item items[], + const struct rte_flow_item pattern[], const struct rte_flow_action actions[], struct rte_flow_error *error) { - struct rte_flow *rte_flow; - struct mlx4_flow_action action; - struct mlx4_flow flow = { .offset = sizeof(struct ibv_flow_attr), }; + struct priv *priv = dev->data->dev_private; + struct rte_flow *flow; int err; - err = priv_flow_validate(priv, attr, items, actions, error, &flow); + err = mlx4_flow_prepare(priv, attr, pattern, actions, error, &flow); if (err) return NULL; - flow.ibv_attr = rte_malloc(__func__, flow.offset, 0); - if (!flow.ibv_attr) { - rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, - NULL, "cannot allocate ibv_attr memory"); - return NULL; - } - flow.offset = sizeof(struct ibv_flow_attr); - *flow.ibv_attr = (struct ibv_flow_attr){ - .comp_mask = 0, - .type = IBV_FLOW_ATTR_NORMAL, - .size = sizeof(struct ibv_flow_attr), - .priority = attr->priority, - .num_of_specs = 0, - .port = priv->port, - .flags = 0, - }; - claim_zero(priv_flow_validate(priv, attr, items, actions, - error, &flow)); - action = (struct mlx4_flow_action){ - .queue = 0, - .drop = 0, - }; - for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { - if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { - continue; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { - action.queue = 1; - action.queues_n = 1; - action.queues[0] = - ((const struct rte_flow_action_queue *) - actions->conf)->index; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { - action.drop = 1; - } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) { - unsigned int i; - const struct rte_flow_action_rss *rss = - (const struct rte_flow_action_rss *) - actions->conf; + err = mlx4_flow_toggle(priv, flow, priv->started, error); + if (!err) { + struct rte_flow *curr = LIST_FIRST(&priv->flows); - action.queue = 1; - action.queues_n = rss->num; - for (i = 0; i < rss->num; ++i) - action.queues[i] = rss->queue[i]; + /* New rules are inserted after internal ones. */ + if (!curr || !curr->internal) { + LIST_INSERT_HEAD(&priv->flows, flow, next); } else { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, "unsupported action"); - goto exit; + while (LIST_NEXT(curr, next) && + LIST_NEXT(curr, next)->internal) + curr = LIST_NEXT(curr, next); + LIST_INSERT_AFTER(curr, flow, next); } + return flow; } - rte_flow = priv_flow_create_action_queue(priv, flow.ibv_attr, - &action, error); - if (rte_flow) - return rte_flow; -exit: - rte_free(flow.ibv_attr); + if (flow->rss) + mlx4_rss_put(flow->rss); + rte_flow_error_set(error, -err, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + error->message); + rte_free(flow); return NULL; } /** - * Create a flow. + * Configure isolated mode. * - * @see rte_flow_create() - * @see rte_flow_ops - */ -struct rte_flow * -mlx4_flow_create(struct rte_eth_dev *dev, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error) -{ - struct priv *priv = dev->data->dev_private; - struct rte_flow *flow; - - priv_lock(priv); - flow = priv_flow_create(priv, attr, items, actions, error); - if (flow) { - LIST_INSERT_HEAD(&priv->flows, flow, next); - DEBUG("Flow created %p", (void *)flow); - } - priv_unlock(priv); - return flow; -} - -/** * @see rte_flow_isolate() - * - * Must be done before calling dev_configure(). - * - * @param dev - * Pointer to the ethernet device structure. - * @param enable - * Nonzero to enter isolated mode, attempt to leave it otherwise. - * @param[out] error - * Perform verbose error reporting if not NULL. PMDs initialize this - * structure in case of error only. - * - * @return - * 0 on success, a negative value on error. + * @see rte_flow_ops */ -int +static int mlx4_flow_isolate(struct rte_eth_dev *dev, int enable, struct rte_flow_error *error) { struct priv *priv = dev->data->dev_private; - priv_lock(priv); - if (priv->rxqs) { - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "isolated mode must be set" - " before configuring the device"); - priv_unlock(priv); + if (!!enable == !!priv->isolated) + return 0; + priv->isolated = !!enable; + if (mlx4_flow_sync(priv, error)) { + priv->isolated = !enable; return -rte_errno; } - priv->isolated = !!enable; - priv_unlock(priv); return 0; } /** - * Destroy a flow. + * Destroy a flow rule. * - * @param priv - * Pointer to private structure. - * @param[in] flow - * Flow to destroy. + * @see rte_flow_destroy() + * @see rte_flow_ops */ -static void -priv_flow_destroy(struct priv *priv, struct rte_flow *flow) +static int +mlx4_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error) { - (void)priv; + struct priv *priv = dev->data->dev_private; + int err = mlx4_flow_toggle(priv, flow, 0, error); + + if (err) + return err; LIST_REMOVE(flow, next); - if (flow->ibv_flow) - claim_zero(ibv_destroy_flow(flow->ibv_flow)); - rte_free(flow->ibv_attr); - DEBUG("Flow destroyed %p", (void *)flow); + if (flow->rss) + mlx4_rss_put(flow->rss); rte_free(flow); + return 0; } /** - * Destroy a flow. + * Destroy user-configured flow rules. * - * @see rte_flow_destroy() + * This function skips internal flows rules. + * + * @see rte_flow_flush() * @see rte_flow_ops */ -int -mlx4_flow_destroy(struct rte_eth_dev *dev, - struct rte_flow *flow, - struct rte_flow_error *error) +static int +mlx4_flow_flush(struct rte_eth_dev *dev, + struct rte_flow_error *error) { struct priv *priv = dev->data->dev_private; + struct rte_flow *flow = LIST_FIRST(&priv->flows); + + while (flow) { + struct rte_flow *next = LIST_NEXT(flow, next); - (void)error; - priv_lock(priv); - priv_flow_destroy(priv, flow); - priv_unlock(priv); + if (!flow->internal) + mlx4_flow_destroy(dev, flow, error); + flow = next; + } return 0; } /** - * Destroy all flows. + * Helper function to determine the next configured VLAN filter. * * @param priv * Pointer to private structure. + * @param vlan + * VLAN ID to use as a starting point. + * + * @return + * Next configured VLAN ID or a high value (>= 4096) if there is none. */ -static void -priv_flow_flush(struct priv *priv) +static uint16_t +mlx4_flow_internal_next_vlan(struct priv *priv, uint16_t vlan) +{ + while (vlan < 4096) { + if (priv->dev->data->vlan_filter_conf.ids[vlan / 64] & + (UINT64_C(1) << (vlan % 64))) + return vlan; + ++vlan; + } + return vlan; +} + +/** + * Generate internal flow rules. + * + * Various flow rules are created depending on the mode the device is in: + * + * 1. Promiscuous: port MAC + catch-all (VLAN filtering is ignored). + * 2. All multicast: port MAC/VLAN + catch-all multicast. + * 3. Otherwise: port MAC/VLAN + broadcast MAC/VLAN. + * + * About MAC flow rules: + * + * - MAC flow rules are generated from @p dev->data->mac_addrs + * (@p priv->mac array). + * - An additional flow rule for Ethernet broadcasts is also generated. + * - All these are per-VLAN if @p dev->data->dev_conf.rxmode.hw_vlan_filter + * is enabled and VLAN filters are configured. + * + * @param priv + * Pointer to private structure. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error) { - while (!LIST_EMPTY(&priv->flows)) { - struct rte_flow *flow; + struct rte_flow_attr attr = { + .priority = MLX4_FLOW_PRIORITY_LAST, + .ingress = 1, + }; + struct rte_flow_item_eth eth_spec; + const struct rte_flow_item_eth eth_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }; + const struct rte_flow_item_eth eth_allmulti = { + .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00", + }; + struct rte_flow_item_vlan vlan_spec; + const struct rte_flow_item_vlan vlan_mask = { + .tci = RTE_BE16(0x0fff), + }; + struct rte_flow_item pattern[] = { + { + .type = MLX4_FLOW_ITEM_TYPE_INTERNAL, + }, + { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .spec = ð_spec, + .mask = ð_mask, + }, + { + /* Replaced with VLAN if filtering is enabled. */ + .type = RTE_FLOW_ITEM_TYPE_END, + }, + { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + }; + /* + * Round number of queues down to their previous power of 2 to + * comply with RSS context limitations. Extra queues silently do not + * get RSS by default. + */ + uint32_t queues = + rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1; + alignas(struct rte_flow_action_rss) uint8_t rss_conf_data + [offsetof(struct rte_flow_action_rss, queue) + + sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues]; + struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data; + struct rte_flow_action actions[] = { + { + .type = RTE_FLOW_ACTION_TYPE_RSS, + .conf = rss_conf, + }, + { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + struct ether_addr *rule_mac = ð_spec.dst; + rte_be16_t *rule_vlan = + priv->dev->data->dev_conf.rxmode.hw_vlan_filter && + !priv->dev->data->promiscuous ? + &vlan_spec.tci : + NULL; + int broadcast = + !priv->dev->data->promiscuous && + !priv->dev->data->all_multicast; + uint16_t vlan = 0; + struct rte_flow *flow; + unsigned int i; + int err = 0; - flow = LIST_FIRST(&priv->flows); - priv_flow_destroy(priv, flow); + /* Nothing to be done if there are no Rx queues. */ + if (!queues) + goto error; + /* Prepare default RSS configuration. */ + *rss_conf = (struct rte_flow_action_rss){ + .rss_conf = NULL, /* Rely on default fallback settings. */ + .num = queues, + }; + for (i = 0; i != queues; ++i) + rss_conf->queue[i] = i; + /* + * Set up VLAN item if filtering is enabled and at least one VLAN + * filter is configured. + */ + if (rule_vlan) { + vlan = mlx4_flow_internal_next_vlan(priv, 0); + if (vlan < 4096) { + pattern[2] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_VLAN, + .spec = &vlan_spec, + .mask = &vlan_mask, + }; +next_vlan: + *rule_vlan = rte_cpu_to_be_16(vlan); + } else { + rule_vlan = NULL; + } } + for (i = 0; i != RTE_DIM(priv->mac) + broadcast; ++i) { + const struct ether_addr *mac; + + /* Broadcasts are handled by an extra iteration. */ + if (i < RTE_DIM(priv->mac)) + mac = &priv->mac[i]; + else + mac = ð_mask.dst; + if (is_zero_ether_addr(mac)) + continue; + /* Check if MAC flow rule is already present. */ + for (flow = LIST_FIRST(&priv->flows); + flow && flow->internal; + flow = LIST_NEXT(flow, next)) { + const struct ibv_flow_spec_eth *eth = + (const void *)((uintptr_t)flow->ibv_attr + + sizeof(*flow->ibv_attr)); + unsigned int j; + + if (!flow->mac) + continue; + assert(flow->ibv_attr->type == IBV_FLOW_ATTR_NORMAL); + assert(flow->ibv_attr->num_of_specs == 1); + assert(eth->type == IBV_FLOW_SPEC_ETH); + assert(flow->rss); + if (rule_vlan && + (eth->val.vlan_tag != *rule_vlan || + eth->mask.vlan_tag != RTE_BE16(0x0fff))) + continue; + if (!rule_vlan && eth->mask.vlan_tag) + continue; + for (j = 0; j != sizeof(mac->addr_bytes); ++j) + if (eth->val.dst_mac[j] != mac->addr_bytes[j] || + eth->mask.dst_mac[j] != UINT8_C(0xff) || + eth->val.src_mac[j] != UINT8_C(0x00) || + eth->mask.src_mac[j] != UINT8_C(0x00)) + break; + if (j != sizeof(mac->addr_bytes)) + continue; + if (flow->rss->queues != queues || + memcmp(flow->rss->queue_id, rss_conf->queue, + queues * sizeof(flow->rss->queue_id[0]))) + continue; + break; + } + if (!flow || !flow->internal) { + /* Not found, create a new flow rule. */ + memcpy(rule_mac, mac, sizeof(*mac)); + flow = mlx4_flow_create(priv->dev, &attr, pattern, + actions, error); + if (!flow) { + err = -rte_errno; + goto error; + } + } + flow->select = 1; + flow->mac = 1; + } + if (rule_vlan) { + vlan = mlx4_flow_internal_next_vlan(priv, vlan + 1); + if (vlan < 4096) + goto next_vlan; + } + /* Take care of promiscuous and all multicast flow rules. */ + if (!broadcast) { + for (flow = LIST_FIRST(&priv->flows); + flow && flow->internal; + flow = LIST_NEXT(flow, next)) { + if (priv->dev->data->promiscuous) { + if (flow->promisc) + break; + } else { + assert(priv->dev->data->all_multicast); + if (flow->allmulti) + break; + } + } + if (flow && flow->internal) { + assert(flow->rss); + if (flow->rss->queues != queues || + memcmp(flow->rss->queue_id, rss_conf->queue, + queues * sizeof(flow->rss->queue_id[0]))) + flow = NULL; + } + if (!flow || !flow->internal) { + /* Not found, create a new flow rule. */ + if (priv->dev->data->promiscuous) { + pattern[1].spec = NULL; + pattern[1].mask = NULL; + } else { + assert(priv->dev->data->all_multicast); + pattern[1].spec = ð_allmulti; + pattern[1].mask = ð_allmulti; + } + pattern[2] = pattern[3]; + flow = mlx4_flow_create(priv->dev, &attr, pattern, + actions, error); + if (!flow) { + err = -rte_errno; + goto error; + } + } + assert(flow->promisc || flow->allmulti); + flow->select = 1; + } +error: + /* Clear selection and clean up stale internal flow rules. */ + flow = LIST_FIRST(&priv->flows); + while (flow && flow->internal) { + struct rte_flow *next = LIST_NEXT(flow, next); + + if (!flow->select) + claim_zero(mlx4_flow_destroy(priv->dev, flow, error)); + else + flow->select = 0; + flow = next; + } + return err; } /** - * Destroy all flows. + * Synchronize flow rules. * - * @see rte_flow_flush() - * @see rte_flow_ops + * This function synchronizes flow rules with the state of the device by + * taking into account isolated mode and whether target queues are + * configured. + * + * @param priv + * Pointer to private structure. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -mlx4_flow_flush(struct rte_eth_dev *dev, - struct rte_flow_error *error) +mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; + struct rte_flow *flow; + int ret; - (void)error; - priv_lock(priv); - priv_flow_flush(priv); - priv_unlock(priv); + /* Internal flow rules are guaranteed to come first in the list. */ + if (priv->isolated) { + /* + * Get rid of them in isolated mode, stop at the first + * non-internal rule found. + */ + for (flow = LIST_FIRST(&priv->flows); + flow && flow->internal; + flow = LIST_FIRST(&priv->flows)) + claim_zero(mlx4_flow_destroy(priv->dev, flow, error)); + } else { + /* Refresh internal rules. */ + ret = mlx4_flow_internal(priv, error); + if (ret) + return ret; + } + /* Toggle the remaining flow rules . */ + LIST_FOREACH(flow, &priv->flows, next) { + ret = mlx4_flow_toggle(priv, flow, priv->started, error); + if (ret) + return ret; + } + if (!priv->started) + assert(!priv->drop); return 0; } /** - * Remove all flows. + * Clean up all flow rules. * - * Called by dev_stop() to remove all flows. + * Unlike mlx4_flow_flush(), this function takes care of all remaining flow + * rules regardless of whether they are internal or user-configured. * * @param priv * Pointer to private structure. */ void -mlx4_priv_flow_stop(struct priv *priv) +mlx4_flow_clean(struct priv *priv) { struct rte_flow *flow; - for (flow = LIST_FIRST(&priv->flows); - flow; - flow = LIST_NEXT(flow, next)) { - claim_zero(ibv_destroy_flow(flow->ibv_flow)); - flow->ibv_flow = NULL; - DEBUG("Flow %p removed", (void *)flow); - } - mlx4_flow_destroy_drop_queue(priv); + while ((flow = LIST_FIRST(&priv->flows))) + mlx4_flow_destroy(priv->dev, flow, NULL); + assert(LIST_EMPTY(&priv->rss)); } +static const struct rte_flow_ops mlx4_flow_ops = { + .validate = mlx4_flow_validate, + .create = mlx4_flow_create, + .destroy = mlx4_flow_destroy, + .flush = mlx4_flow_flush, + .isolate = mlx4_flow_isolate, +}; + /** - * Add all flows. + * Manage filter operations. * - * @param priv - * Pointer to private structure. + * @param dev + * Pointer to Ethernet device structure. + * @param filter_type + * Filter type. + * @param filter_op + * Operation to perform. + * @param arg + * Pointer to operation-specific structure. * * @return - * 0 on success, a errno value otherwise and rte_errno is set. + * 0 on success, negative errno value otherwise and rte_errno is set. */ int -mlx4_priv_flow_start(struct priv *priv) +mlx4_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg) { - int ret; - struct ibv_qp *qp; - struct rte_flow *flow; - - ret = mlx4_flow_create_drop_queue(priv); - if (ret) - return -1; - for (flow = LIST_FIRST(&priv->flows); - flow; - flow = LIST_NEXT(flow, next)) { - qp = flow->qp ? flow->qp : priv->flow_drop_queue->qp; - flow->ibv_flow = ibv_create_flow(qp, flow->ibv_attr); - if (!flow->ibv_flow) { - DEBUG("Flow %p cannot be applied", (void *)flow); - rte_errno = EINVAL; - return rte_errno; - } - DEBUG("Flow %p applied", (void *)flow); + switch (filter_type) { + case RTE_ETH_FILTER_GENERIC: + if (filter_op != RTE_ETH_FILTER_GET) + break; + *(const void **)arg = &mlx4_flow_ops; + return 0; + default: + ERROR("%p: filter type (%d) not supported", + (void *)dev, filter_type); + break; } - return 0; + rte_errno = ENOTSUP; + return -rte_errno; } diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h index beabcf2d..651fd37b 100644 --- a/drivers/net/mlx4/mlx4_flow.h +++ b/drivers/net/mlx4/mlx4_flow.h @@ -2,7 +2,7 @@ * BSD LICENSE * * Copyright 2017 6WIND S.A. - * Copyright 2017 Mellanox. + * Copyright 2017 Mellanox * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -34,12 +34,10 @@ #ifndef RTE_PMD_MLX4_FLOW_H_ #define RTE_PMD_MLX4_FLOW_H_ -#include <stddef.h> #include <stdint.h> #include <sys/queue.h> -/* Verbs header. */ -/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +/* Verbs headers do not support -pedantic. */ #ifdef PEDANTIC #pragma GCC diagnostic ignored "-Wpedantic" #endif @@ -48,61 +46,40 @@ #pragma GCC diagnostic error "-Wpedantic" #endif +#include <rte_eth_ctrl.h> +#include <rte_ethdev.h> #include <rte_flow.h> #include <rte_flow_driver.h> #include <rte_byteorder.h> -#include "mlx4.h" +/** Last and lowest priority level for a flow rule. */ +#define MLX4_FLOW_PRIORITY_LAST UINT32_C(0xfff) +/** Meta pattern item used to distinguish internal rules. */ +#define MLX4_FLOW_ITEM_TYPE_INTERNAL ((enum rte_flow_item_type)-1) + +/** PMD-specific (mlx4) definition of a flow rule handle. */ struct rte_flow { LIST_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */ struct ibv_flow *ibv_flow; /**< Verbs flow. */ struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */ - struct ibv_qp *qp; /**< Verbs queue pair. */ + uint32_t ibv_attr_size; /**< Size of Verbs attributes. */ + uint32_t select:1; /**< Used by operations on the linked list. */ + uint32_t internal:1; /**< Internal flow rule outside isolated mode. */ + uint32_t mac:1; /**< Rule associated with a configured MAC address. */ + uint32_t promisc:1; /**< This rule matches everything. */ + uint32_t allmulti:1; /**< This rule matches all multicast traffic. */ + uint32_t drop:1; /**< This rule drops packets. */ + struct mlx4_rss *rss; /**< Rx target. */ }; -int -mlx4_flow_validate(struct rte_eth_dev *dev, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error); - -struct rte_flow * -mlx4_flow_create(struct rte_eth_dev *dev, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error); - -int -mlx4_flow_destroy(struct rte_eth_dev *dev, - struct rte_flow *flow, - struct rte_flow_error *error); - -int -mlx4_flow_flush(struct rte_eth_dev *dev, - struct rte_flow_error *error); - -/** Structure to pass to the conversion function. */ -struct mlx4_flow { - struct ibv_flow_attr *ibv_attr; /**< Verbs attribute. */ - unsigned int offset; /**< Offset in bytes in the ibv_attr buffer. */ -}; - -int -mlx4_flow_isolate(struct rte_eth_dev *dev, - int enable, - struct rte_flow_error *error); - -struct mlx4_flow_action { - uint32_t drop:1; /**< Target is a drop queue. */ - uint32_t queue:1; /**< Target is a receive queue. */ - uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queue indices to use. */ - uint16_t queues_n; /**< Number of entries in queue[] */ -}; +/* mlx4_flow.c */ -int mlx4_priv_flow_start(struct priv *priv); -void mlx4_priv_flow_stop(struct priv *priv); +int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error); +void mlx4_flow_clean(struct priv *priv); +int mlx4_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg); #endif /* RTE_PMD_MLX4_FLOW_H_ */ diff --git a/drivers/net/mlx4/mlx4_intr.c b/drivers/net/mlx4/mlx4_intr.c new file mode 100644 index 00000000..b17d109a --- /dev/null +++ b/drivers/net/mlx4/mlx4_intr.c @@ -0,0 +1,397 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Interrupts handling for mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_alarm.h> +#include <rte_errno.h> +#include <rte_ethdev.h> +#include <rte_io.h> +#include <rte_interrupts.h> + +#include "mlx4.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +static int mlx4_link_status_check(struct priv *priv); + +/** + * Clean up Rx interrupts handler. + * + * @param priv + * Pointer to private structure. + */ +static void +mlx4_rx_intr_vec_disable(struct priv *priv) +{ + struct rte_intr_handle *intr_handle = &priv->intr_handle; + + rte_intr_free_epoll_fd(intr_handle); + free(intr_handle->intr_vec); + intr_handle->nb_efd = 0; + intr_handle->intr_vec = NULL; +} + +/** + * Allocate queue vector and fill epoll fd list for Rx interrupts. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +static int +mlx4_rx_intr_vec_enable(struct priv *priv) +{ + unsigned int i; + unsigned int rxqs_n = priv->dev->data->nb_rx_queues; + unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + unsigned int count = 0; + struct rte_intr_handle *intr_handle = &priv->intr_handle; + + mlx4_rx_intr_vec_disable(priv); + intr_handle->intr_vec = malloc(sizeof(intr_handle->intr_vec[rxqs_n])); + if (intr_handle->intr_vec == NULL) { + rte_errno = ENOMEM; + ERROR("failed to allocate memory for interrupt vector," + " Rx interrupts will not be supported"); + return -rte_errno; + } + for (i = 0; i != n; ++i) { + struct rxq *rxq = priv->dev->data->rx_queues[i]; + + /* Skip queues that cannot request interrupts. */ + if (!rxq || !rxq->channel) { + /* Use invalid intr_vec[] index to disable entry. */ + intr_handle->intr_vec[i] = + RTE_INTR_VEC_RXTX_OFFSET + + RTE_MAX_RXTX_INTR_VEC_ID; + continue; + } + if (count >= RTE_MAX_RXTX_INTR_VEC_ID) { + rte_errno = E2BIG; + ERROR("too many Rx queues for interrupt vector size" + " (%d), Rx interrupts cannot be enabled", + RTE_MAX_RXTX_INTR_VEC_ID); + mlx4_rx_intr_vec_disable(priv); + return -rte_errno; + } + intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count; + intr_handle->efds[count] = rxq->channel->fd; + count++; + } + if (!count) + mlx4_rx_intr_vec_disable(priv); + else + intr_handle->nb_efd = count; + return 0; +} + +/** + * Process scheduled link status check. + * + * If LSC interrupts are requested, process related callback. + * + * @param priv + * Pointer to private structure. + */ +static void +mlx4_link_status_alarm(struct priv *priv) +{ + const struct rte_intr_conf *const intr_conf = + &priv->dev->data->dev_conf.intr_conf; + + assert(priv->intr_alarm == 1); + priv->intr_alarm = 0; + if (intr_conf->lsc && !mlx4_link_status_check(priv)) + _rte_eth_dev_callback_process(priv->dev, + RTE_ETH_EVENT_INTR_LSC, + NULL, NULL); +} + +/** + * Check link status. + * + * In case of inconsistency, another check is scheduled. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success (link status is consistent), negative errno value + * otherwise and rte_errno is set. + */ +static int +mlx4_link_status_check(struct priv *priv) +{ + struct rte_eth_link *link = &priv->dev->data->dev_link; + int ret = mlx4_link_update(priv->dev, 0); + + if (ret) + return ret; + if ((!link->link_speed && link->link_status) || + (link->link_speed && !link->link_status)) { + if (!priv->intr_alarm) { + /* Inconsistent status, check again later. */ + ret = rte_eal_alarm_set(MLX4_INTR_ALARM_TIMEOUT, + (void (*)(void *)) + mlx4_link_status_alarm, + priv); + if (ret) + return ret; + priv->intr_alarm = 1; + } + rte_errno = EINPROGRESS; + return -rte_errno; + } + return 0; +} + +/** + * Handle interrupts from the NIC. + * + * @param priv + * Pointer to private structure. + */ +static void +mlx4_interrupt_handler(struct priv *priv) +{ + enum { LSC, RMV, }; + static const enum rte_eth_event_type type[] = { + [LSC] = RTE_ETH_EVENT_INTR_LSC, + [RMV] = RTE_ETH_EVENT_INTR_RMV, + }; + uint32_t caught[RTE_DIM(type)] = { 0 }; + struct ibv_async_event event; + const struct rte_intr_conf *const intr_conf = + &priv->dev->data->dev_conf.intr_conf; + unsigned int i; + + /* Read all message and acknowledge them. */ + while (!ibv_get_async_event(priv->ctx, &event)) { + switch (event.event_type) { + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_PORT_ERR: + if (intr_conf->lsc && !mlx4_link_status_check(priv)) + ++caught[LSC]; + break; + case IBV_EVENT_DEVICE_FATAL: + if (intr_conf->rmv) + ++caught[RMV]; + break; + default: + DEBUG("event type %d on physical port %d not handled", + event.event_type, event.element.port_num); + } + ibv_ack_async_event(&event); + } + for (i = 0; i != RTE_DIM(caught); ++i) + if (caught[i]) + _rte_eth_dev_callback_process(priv->dev, type[i], + NULL, NULL); +} + +/** + * MLX4 CQ notification . + * + * @param rxq + * Pointer to receive queue structure. + * @param solicited + * Is request solicited or not. + */ +static void +mlx4_arm_cq(struct rxq *rxq, int solicited) +{ + struct mlx4_cq *cq = &rxq->mcq; + uint64_t doorbell; + uint32_t sn = cq->arm_sn & MLX4_CQ_DB_GEQ_N_MASK; + uint32_t ci = cq->cons_index & MLX4_CQ_DB_CI_MASK; + uint32_t cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT; + + *cq->arm_db = rte_cpu_to_be_32(sn << 28 | cmd | ci); + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + rte_wmb(); + doorbell = sn << 28 | cmd | cq->cqn; + doorbell <<= 32; + doorbell |= ci; + rte_write64(rte_cpu_to_be_64(doorbell), cq->cq_db_reg); +} + +/** + * Uninstall interrupt handler. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_intr_uninstall(struct priv *priv) +{ + int err = rte_errno; /* Make sure rte_errno remains unchanged. */ + + if (priv->intr_handle.fd != -1) { + rte_intr_callback_unregister(&priv->intr_handle, + (void (*)(void *)) + mlx4_interrupt_handler, + priv); + priv->intr_handle.fd = -1; + } + rte_eal_alarm_cancel((void (*)(void *))mlx4_link_status_alarm, priv); + priv->intr_alarm = 0; + mlx4_rx_intr_vec_disable(priv); + rte_errno = err; + return 0; +} + +/** + * Install interrupt handler. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_intr_install(struct priv *priv) +{ + const struct rte_intr_conf *const intr_conf = + &priv->dev->data->dev_conf.intr_conf; + int rc; + + mlx4_intr_uninstall(priv); + if (intr_conf->rxq && mlx4_rx_intr_vec_enable(priv) < 0) + goto error; + if (intr_conf->lsc | intr_conf->rmv) { + priv->intr_handle.fd = priv->ctx->async_fd; + rc = rte_intr_callback_register(&priv->intr_handle, + (void (*)(void *)) + mlx4_interrupt_handler, + priv); + if (rc < 0) { + rte_errno = -rc; + goto error; + } + } + return 0; +error: + mlx4_intr_uninstall(priv); + return -rte_errno; +} + +/** + * DPDK callback for Rx queue interrupt disable. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Rx queue index. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx) +{ + struct rxq *rxq = dev->data->rx_queues[idx]; + struct ibv_cq *ev_cq; + void *ev_ctx; + int ret; + + if (!rxq || !rxq->channel) { + ret = EINVAL; + } else { + ret = ibv_get_cq_event(rxq->cq->channel, &ev_cq, &ev_ctx); + if (ret || ev_cq != rxq->cq) + ret = EINVAL; + } + if (ret) { + rte_errno = ret; + WARN("unable to disable interrupt on rx queue %d", + idx); + } else { + rxq->mcq.arm_sn++; + ibv_ack_cq_events(rxq->cq, 1); + } + return -ret; +} + +/** + * DPDK callback for Rx queue interrupt enable. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Rx queue index. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx) +{ + struct rxq *rxq = dev->data->rx_queues[idx]; + int ret = 0; + + if (!rxq || !rxq->channel) { + ret = EINVAL; + rte_errno = ret; + WARN("unable to arm interrupt on rx queue %d", idx); + } else { + mlx4_arm_cq(rxq, 0); + } + return -ret; +} diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c new file mode 100644 index 00000000..2a3e2695 --- /dev/null +++ b/drivers/net/mlx4/mlx4_mr.c @@ -0,0 +1,293 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Memory management functions for mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <inttypes.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_branch_prediction.h> +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_malloc.h> +#include <rte_memory.h> +#include <rte_mempool.h> +#include <rte_spinlock.h> + +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +struct mlx4_check_mempool_data { + int ret; + char *start; + char *end; +}; + +/** + * Called by mlx4_check_mempool() when iterating the memory chunks. + * + * @param[in] mp + * Pointer to memory pool (unused). + * @param[in, out] data + * Pointer to shared buffer with mlx4_check_mempool(). + * @param[in] memhdr + * Pointer to mempool chunk header. + * @param mem_idx + * Mempool element index (unused). + */ +static void +mlx4_check_mempool_cb(struct rte_mempool *mp, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned int mem_idx) +{ + struct mlx4_check_mempool_data *data = opaque; + + (void)mp; + (void)mem_idx; + /* It already failed, skip the next chunks. */ + if (data->ret != 0) + return; + /* It is the first chunk. */ + if (data->start == NULL && data->end == NULL) { + data->start = memhdr->addr; + data->end = data->start + memhdr->len; + return; + } + if (data->end == memhdr->addr) { + data->end += memhdr->len; + return; + } + if (data->start == (char *)memhdr->addr + memhdr->len) { + data->start -= memhdr->len; + return; + } + /* Error, mempool is not virtually contiguous. */ + data->ret = -1; +} + +/** + * Check if a mempool can be used: it must be virtually contiguous. + * + * @param[in] mp + * Pointer to memory pool. + * @param[out] start + * Pointer to the start address of the mempool virtual memory area. + * @param[out] end + * Pointer to the end address of the mempool virtual memory area. + * + * @return + * 0 on success (mempool is virtually contiguous), -1 on error. + */ +static int +mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, uintptr_t *end) +{ + struct mlx4_check_mempool_data data; + + memset(&data, 0, sizeof(data)); + rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &data); + *start = (uintptr_t)data.start; + *end = (uintptr_t)data.end; + return data.ret; +} + +/** + * Obtain a memory region from a memory pool. + * + * If a matching memory region already exists, it is returned with its + * reference count incremented, otherwise a new one is registered. + * + * @param priv + * Pointer to private structure. + * @param mp + * Pointer to memory pool. + * + * @return + * Memory region pointer, NULL in case of error and rte_errno is set. + */ +struct mlx4_mr * +mlx4_mr_get(struct priv *priv, struct rte_mempool *mp) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + uintptr_t start; + uintptr_t end; + unsigned int i; + struct mlx4_mr *mr; + + if (mlx4_check_mempool(mp, &start, &end) != 0) { + rte_errno = EINVAL; + ERROR("mempool %p: not virtually contiguous", + (void *)mp); + return NULL; + } + DEBUG("mempool %p area start=%p end=%p size=%zu", + (void *)mp, (void *)start, (void *)end, + (size_t)(end - start)); + /* Round start and end to page boundary if found in memory segments. */ + for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) { + uintptr_t addr = (uintptr_t)ms[i].addr; + size_t len = ms[i].len; + unsigned int align = ms[i].hugepage_sz; + + if ((start > addr) && (start < addr + len)) + start = RTE_ALIGN_FLOOR(start, align); + if ((end > addr) && (end < addr + len)) + end = RTE_ALIGN_CEIL(end, align); + } + DEBUG("mempool %p using start=%p end=%p size=%zu for MR", + (void *)mp, (void *)start, (void *)end, + (size_t)(end - start)); + rte_spinlock_lock(&priv->mr_lock); + LIST_FOREACH(mr, &priv->mr, next) + if (mp == mr->mp && start >= mr->start && end <= mr->end) + break; + if (mr) { + ++mr->refcnt; + goto release; + } + mr = rte_malloc(__func__, sizeof(*mr), 0); + if (!mr) { + rte_errno = ENOMEM; + goto release; + } + *mr = (struct mlx4_mr){ + .start = start, + .end = end, + .refcnt = 1, + .priv = priv, + .mr = ibv_reg_mr(priv->pd, (void *)start, end - start, + IBV_ACCESS_LOCAL_WRITE), + .mp = mp, + }; + if (mr->mr) { + mr->lkey = mr->mr->lkey; + LIST_INSERT_HEAD(&priv->mr, mr, next); + } else { + rte_free(mr); + mr = NULL; + rte_errno = errno ? errno : EINVAL; + } +release: + rte_spinlock_unlock(&priv->mr_lock); + return mr; +} + +/** + * Release a memory region. + * + * This function decrements its reference count and destroys it after + * reaching 0. + * + * Note to avoid race conditions given this function may be used from the + * data plane, it's extremely important that each user holds its own + * reference. + * + * @param mr + * Memory region to release. + */ +void +mlx4_mr_put(struct mlx4_mr *mr) +{ + struct priv *priv = mr->priv; + + rte_spinlock_lock(&priv->mr_lock); + assert(mr->refcnt); + if (--mr->refcnt) + goto release; + LIST_REMOVE(mr, next); + claim_zero(ibv_dereg_mr(mr->mr)); + rte_free(mr); +release: + rte_spinlock_unlock(&priv->mr_lock); +} + +/** + * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[]. + * If mp2mr[] is full, remove an entry first. + * + * @param txq + * Pointer to Tx queue structure. + * @param[in] mp + * Memory pool for which a memory region lkey must be added. + * @param[in] i + * Index in memory pool (MP) where to add memory region (MR). + * + * @return + * Added mr->lkey on success, (uint32_t)-1 on failure. + */ +uint32_t +mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i) +{ + struct mlx4_mr *mr; + + /* Add a new entry, register MR first. */ + DEBUG("%p: discovered new memory pool \"%s\" (%p)", + (void *)txq, mp->name, (void *)mp); + mr = mlx4_mr_get(txq->priv, mp); + if (unlikely(mr == NULL)) { + DEBUG("%p: unable to configure MR, mlx4_mr_get() failed", + (void *)txq); + return (uint32_t)-1; + } + if (unlikely(i == RTE_DIM(txq->mp2mr))) { + /* Table is full, remove oldest entry. */ + DEBUG("%p: MR <-> MP table full, dropping oldest entry.", + (void *)txq); + --i; + mlx4_mr_put(txq->mp2mr[0].mr); + memmove(&txq->mp2mr[0], &txq->mp2mr[1], + (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0]))); + } + /* Store the new entry. */ + txq->mp2mr[i].mp = mp; + txq->mp2mr[i].mr = mr; + txq->mp2mr[i].lkey = mr->lkey; + DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, + (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey); + return txq->mp2mr[i].lkey; +} diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h new file mode 100644 index 00000000..fcc7c129 --- /dev/null +++ b/drivers/net/mlx4/mlx4_prm.h @@ -0,0 +1,177 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MLX4_PRM_H_ +#define MLX4_PRM_H_ + +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_byteorder.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx4dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +/* ConnectX-3 Tx queue basic block. */ +#define MLX4_TXBB_SHIFT 6 +#define MLX4_TXBB_SIZE (1 << MLX4_TXBB_SHIFT) + +/* Typical TSO descriptor with 16 gather entries is 352 bytes. */ +#define MLX4_MAX_WQE_SIZE 512 +#define MLX4_MAX_WQE_TXBBS (MLX4_MAX_WQE_SIZE / MLX4_TXBB_SIZE) + +/* Send queue stamping/invalidating information. */ +#define MLX4_SQ_STAMP_STRIDE 64 +#define MLX4_SQ_STAMP_DWORDS (MLX4_SQ_STAMP_STRIDE / 4) +#define MLX4_SQ_STAMP_SHIFT 31 +#define MLX4_SQ_STAMP_VAL 0x7fffffff + +/* Work queue element (WQE) flags. */ +#define MLX4_BIT_WQE_OWN 0x80000000 +#define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28) +#define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27) + +#define MLX4_SIZE_TO_TXBBS(size) \ + (RTE_ALIGN((size), (MLX4_TXBB_SIZE)) >> (MLX4_TXBB_SHIFT)) + +/* CQE checksum flags. */ +enum { + MLX4_CQE_L2_TUNNEL_IPV4 = (int)(1u << 25), + MLX4_CQE_L2_TUNNEL_L4_CSUM = (int)(1u << 26), + MLX4_CQE_L2_TUNNEL = (int)(1u << 27), + MLX4_CQE_L2_VLAN_MASK = (int)(3u << 29), + MLX4_CQE_L2_TUNNEL_IPOK = (int)(1u << 31), +}; + +/* CQE status flags. */ +#define MLX4_CQE_STATUS_IPV4 (1 << 22) +#define MLX4_CQE_STATUS_IPV4F (1 << 23) +#define MLX4_CQE_STATUS_IPV6 (1 << 24) +#define MLX4_CQE_STATUS_IPV4OPT (1 << 25) +#define MLX4_CQE_STATUS_TCP (1 << 26) +#define MLX4_CQE_STATUS_UDP (1 << 27) +#define MLX4_CQE_STATUS_PTYPE_MASK \ + (MLX4_CQE_STATUS_IPV4 | \ + MLX4_CQE_STATUS_IPV4F | \ + MLX4_CQE_STATUS_IPV6 | \ + MLX4_CQE_STATUS_IPV4OPT | \ + MLX4_CQE_STATUS_TCP | \ + MLX4_CQE_STATUS_UDP) + +/* Send queue information. */ +struct mlx4_sq { + volatile uint8_t *buf; /**< SQ buffer. */ + volatile uint8_t *eob; /**< End of SQ buffer */ + uint32_t head; /**< SQ head counter in units of TXBBS. */ + uint32_t tail; /**< SQ tail counter in units of TXBBS. */ + uint32_t txbb_cnt; /**< Num of WQEBB in the Q (should be ^2). */ + uint32_t txbb_cnt_mask; /**< txbbs_cnt mask (txbb_cnt is ^2). */ + uint32_t headroom_txbbs; /**< Num of txbbs that should be kept free. */ + volatile uint32_t *db; /**< Pointer to the doorbell. */ + uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */ +}; + +#define mlx4_get_send_wqe(sq, n) ((sq)->buf + ((n) * (MLX4_TXBB_SIZE))) + +/* Completion queue events, numbers and masks. */ +#define MLX4_CQ_DB_GEQ_N_MASK 0x3 +#define MLX4_CQ_DOORBELL 0x20 +#define MLX4_CQ_DB_CI_MASK 0xffffff + +/* Completion queue information. */ +struct mlx4_cq { + volatile void *cq_uar; /**< CQ user access region. */ + volatile void *cq_db_reg; /**< CQ doorbell register. */ + volatile uint32_t *set_ci_db; /**< Pointer to the CQ doorbell. */ + volatile uint32_t *arm_db; /**< Arming Rx events doorbell. */ + volatile uint8_t *buf; /**< Pointer to the completion queue buffer. */ + uint32_t cqe_cnt; /**< Number of entries in the queue. */ + uint32_t cqe_64:1; /**< CQ entry size is 64 bytes. */ + uint32_t cons_index; /**< Last queue entry that was handled. */ + uint32_t cqn; /**< CQ number. */ + int arm_sn; /**< Rx event counter. */ +}; + +/** + * Retrieve a CQE entry from a CQ. + * + * cqe = cq->buf + cons_index * cqe_size + cqe_offset + * + * Where cqe_size is 32 or 64 bytes and cqe_offset is 0 or 32 (depending on + * cqe_size). + * + * @param cq + * CQ to retrieve entry from. + * @param index + * Entry index. + * + * @return + * Pointer to CQE entry. + */ +static inline volatile struct mlx4_cqe * +mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index) +{ + return (volatile struct mlx4_cqe *)(cq->buf + + ((index & (cq->cqe_cnt - 1)) << + (5 + cq->cqe_64)) + + (cq->cqe_64 << 5)); +} + +/** + * Transpose a flag in a value. + * + * @param val + * Input value. + * @param from + * Flag to retrieve from input value. + * @param to + * Flag to set in output value. + * + * @return + * Output value with transposed flag enabled if present on input. + */ +static inline uint64_t +mlx4_transpose(uint64_t val, uint64_t from, uint64_t to) +{ + return (from >= to ? + (val & from) / (from / to) : + (val & from) * (to / from)); +} + +#endif /* MLX4_PRM_H_ */ diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c new file mode 100644 index 00000000..8b97a894 --- /dev/null +++ b/drivers/net/mlx4/mlx4_rxq.c @@ -0,0 +1,873 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Rx queues configuration for mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx4dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_byteorder.h> +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_ethdev.h> +#include <rte_flow.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> +#include <rte_mempool.h> + +#include "mlx4.h" +#include "mlx4_flow.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** + * Historical RSS hash key. + * + * This used to be the default for mlx4 in Linux before v3.19 switched to + * generating random hash keys through netdev_rss_key_fill(). + * + * It is used in this PMD for consistency with past DPDK releases but can + * now be overridden through user configuration. + * + * Note: this is not const to work around API quirks. + */ +uint8_t +mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = { + 0x2c, 0xc6, 0x81, 0xd1, + 0x5b, 0xdb, 0xf4, 0xf7, + 0xfc, 0xa2, 0x83, 0x19, + 0xdb, 0x1a, 0x3e, 0x94, + 0x6b, 0x9e, 0x38, 0xd9, + 0x2c, 0x9c, 0x03, 0xd1, + 0xad, 0x99, 0x44, 0xa7, + 0xd9, 0x56, 0x3d, 0x59, + 0x06, 0x3c, 0x25, 0xf3, + 0xfc, 0x1f, 0xdc, 0x2a, +}; + +/** + * Obtain a RSS context with specified properties. + * + * Used when creating a flow rule targeting one or several Rx queues. + * + * If a matching RSS context already exists, it is returned with its + * reference count incremented. + * + * @param priv + * Pointer to private structure. + * @param fields + * Fields for RSS processing (Verbs format). + * @param[in] key + * Hash key to use (whose size is exactly MLX4_RSS_HASH_KEY_SIZE). + * @param queues + * Number of target queues. + * @param[in] queue_id + * Target queues. + * + * @return + * Pointer to RSS context on success, NULL otherwise and rte_errno is set. + */ +struct mlx4_rss * +mlx4_rss_get(struct priv *priv, uint64_t fields, + uint8_t key[MLX4_RSS_HASH_KEY_SIZE], + uint16_t queues, const uint16_t queue_id[]) +{ + struct mlx4_rss *rss; + size_t queue_id_size = sizeof(queue_id[0]) * queues; + + LIST_FOREACH(rss, &priv->rss, next) + if (fields == rss->fields && + queues == rss->queues && + !memcmp(key, rss->key, MLX4_RSS_HASH_KEY_SIZE) && + !memcmp(queue_id, rss->queue_id, queue_id_size)) { + ++rss->refcnt; + return rss; + } + rss = rte_malloc(__func__, offsetof(struct mlx4_rss, queue_id) + + queue_id_size, 0); + if (!rss) + goto error; + *rss = (struct mlx4_rss){ + .priv = priv, + .refcnt = 1, + .usecnt = 0, + .qp = NULL, + .ind = NULL, + .fields = fields, + .queues = queues, + }; + memcpy(rss->key, key, MLX4_RSS_HASH_KEY_SIZE); + memcpy(rss->queue_id, queue_id, queue_id_size); + LIST_INSERT_HEAD(&priv->rss, rss, next); + return rss; +error: + rte_errno = ENOMEM; + return NULL; +} + +/** + * Release a RSS context instance. + * + * Used when destroying a flow rule targeting one or several Rx queues. + * + * This function decrements the reference count of the context and destroys + * it after reaching 0. The context must have no users at this point; all + * prior calls to mlx4_rss_attach() must have been followed by matching + * calls to mlx4_rss_detach(). + * + * @param rss + * RSS context to release. + */ +void +mlx4_rss_put(struct mlx4_rss *rss) +{ + assert(rss->refcnt); + if (--rss->refcnt) + return; + assert(!rss->usecnt); + assert(!rss->qp); + assert(!rss->ind); + LIST_REMOVE(rss, next); + rte_free(rss); +} + +/** + * Attach a user to a RSS context instance. + * + * Used when the RSS QP and indirection table objects must be instantiated, + * that is, when a flow rule must be enabled. + * + * This function increments the usage count of the context. + * + * @param rss + * RSS context to attach to. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rss_attach(struct mlx4_rss *rss) +{ + assert(rss->refcnt); + if (rss->usecnt++) { + assert(rss->qp); + assert(rss->ind); + return 0; + } + + struct ibv_wq *ind_tbl[rss->queues]; + struct priv *priv = rss->priv; + const char *msg; + unsigned int i = 0; + int ret; + + if (!rte_is_power_of_2(RTE_DIM(ind_tbl))) { + ret = EINVAL; + msg = "number of RSS queues must be a power of two"; + goto error; + } + for (i = 0; i != RTE_DIM(ind_tbl); ++i) { + uint16_t id = rss->queue_id[i]; + struct rxq *rxq = NULL; + + if (id < priv->dev->data->nb_rx_queues) + rxq = priv->dev->data->rx_queues[id]; + if (!rxq) { + ret = EINVAL; + msg = "RSS target queue is not configured"; + goto error; + } + ret = mlx4_rxq_attach(rxq); + if (ret) { + ret = -ret; + msg = "unable to attach RSS target queue"; + goto error; + } + ind_tbl[i] = rxq->wq; + } + rss->ind = ibv_create_rwq_ind_table + (priv->ctx, + &(struct ibv_rwq_ind_table_init_attr){ + .log_ind_tbl_size = rte_log2_u32(RTE_DIM(ind_tbl)), + .ind_tbl = ind_tbl, + .comp_mask = 0, + }); + if (!rss->ind) { + ret = errno ? errno : EINVAL; + msg = "RSS indirection table creation failure"; + goto error; + } + rss->qp = ibv_create_qp_ex + (priv->ctx, + &(struct ibv_qp_init_attr_ex){ + .comp_mask = (IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_RX_HASH | + IBV_QP_INIT_ATTR_IND_TABLE), + .qp_type = IBV_QPT_RAW_PACKET, + .pd = priv->pd, + .rwq_ind_tbl = rss->ind, + .rx_hash_conf = { + .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, + .rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE, + .rx_hash_key = rss->key, + .rx_hash_fields_mask = rss->fields, + }, + }); + if (!rss->qp) { + ret = errno ? errno : EINVAL; + msg = "RSS hash QP creation failure"; + goto error; + } + ret = ibv_modify_qp + (rss->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_INIT, + .port_num = priv->port, + }, + IBV_QP_STATE | IBV_QP_PORT); + if (ret) { + msg = "failed to switch RSS hash QP to INIT state"; + goto error; + } + ret = ibv_modify_qp + (rss->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_RTR, + }, + IBV_QP_STATE); + if (ret) { + msg = "failed to switch RSS hash QP to RTR state"; + goto error; + } + return 0; +error: + if (rss->qp) { + claim_zero(ibv_destroy_qp(rss->qp)); + rss->qp = NULL; + } + if (rss->ind) { + claim_zero(ibv_destroy_rwq_ind_table(rss->ind)); + rss->ind = NULL; + } + while (i--) + mlx4_rxq_detach(priv->dev->data->rx_queues[rss->queue_id[i]]); + ERROR("mlx4: %s", msg); + --rss->usecnt; + rte_errno = ret; + return -ret; +} + +/** + * Detach a user from a RSS context instance. + * + * Used when disabling (not destroying) a flow rule. + * + * This function decrements the usage count of the context and destroys + * usage resources after reaching 0. + * + * @param rss + * RSS context to detach from. + */ +void +mlx4_rss_detach(struct mlx4_rss *rss) +{ + struct priv *priv = rss->priv; + unsigned int i; + + assert(rss->refcnt); + assert(rss->qp); + assert(rss->ind); + if (--rss->usecnt) + return; + claim_zero(ibv_destroy_qp(rss->qp)); + rss->qp = NULL; + claim_zero(ibv_destroy_rwq_ind_table(rss->ind)); + rss->ind = NULL; + for (i = 0; i != rss->queues; ++i) + mlx4_rxq_detach(priv->dev->data->rx_queues[rss->queue_id[i]]); +} + +/** + * Initialize common RSS context resources. + * + * Because ConnectX-3 hardware limitations require a fixed order in the + * indirection table, WQs must be allocated sequentially to be part of a + * common RSS context. + * + * Since a newly created WQ cannot be moved to a different context, this + * function allocates them all at once, one for each configured Rx queue, + * as well as all related resources (CQs and mbufs). + * + * This must therefore be done before creating any Rx flow rules relying on + * indirection tables. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rss_init(struct priv *priv) +{ + struct rte_eth_dev *dev = priv->dev; + uint8_t log2_range = rte_log2_u32(dev->data->nb_rx_queues); + uint32_t wq_num_prev = 0; + const char *msg; + unsigned int i; + int ret; + + /* Prepare range for RSS contexts before creating the first WQ. */ + ret = mlx4dv_set_context_attr(priv->ctx, + MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ, + &log2_range); + if (ret) { + ERROR("cannot set up range size for RSS context to %u" + " (for %u Rx queues), error: %s", + 1 << log2_range, dev->data->nb_rx_queues, strerror(ret)); + rte_errno = ret; + return -ret; + } + for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) { + struct rxq *rxq = priv->dev->data->rx_queues[i]; + struct ibv_cq *cq; + struct ibv_wq *wq; + uint32_t wq_num; + + /* Attach the configured Rx queues. */ + if (rxq) { + assert(!rxq->usecnt); + ret = mlx4_rxq_attach(rxq); + if (!ret) { + wq_num = rxq->wq->wq_num; + goto wq_num_check; + } + ret = -ret; + msg = "unable to create Rx queue resources"; + goto error; + } + /* + * WQs are temporarily allocated for unconfigured Rx queues + * to maintain proper index alignment in indirection table + * by skipping unused WQ numbers. + * + * The reason this works at all even though these WQs are + * immediately destroyed is that WQNs are allocated + * sequentially and are guaranteed to never be reused in the + * same context by the underlying implementation. + */ + cq = ibv_create_cq(priv->ctx, 1, NULL, NULL, 0); + if (!cq) { + ret = ENOMEM; + msg = "placeholder CQ creation failure"; + goto error; + } + wq = ibv_create_wq + (priv->ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = 1, + .max_sge = 1, + .pd = priv->pd, + .cq = cq, + }); + if (wq) { + wq_num = wq->wq_num; + claim_zero(ibv_destroy_wq(wq)); + } else { + wq_num = 0; /* Shut up GCC 4.8 warnings. */ + } + claim_zero(ibv_destroy_cq(cq)); + if (!wq) { + ret = ENOMEM; + msg = "placeholder WQ creation failure"; + goto error; + } +wq_num_check: + /* + * While guaranteed by the implementation, make sure WQ + * numbers are really sequential (as the saying goes, + * trust, but verify). + */ + if (i && wq_num - wq_num_prev != 1) { + if (rxq) + mlx4_rxq_detach(rxq); + ret = ERANGE; + msg = "WQ numbers are not sequential"; + goto error; + } + wq_num_prev = wq_num; + } + return 0; +error: + ERROR("cannot initialize common RSS resources (queue %u): %s: %s", + i, msg, strerror(ret)); + while (i--) { + struct rxq *rxq = priv->dev->data->rx_queues[i]; + + if (rxq) + mlx4_rxq_detach(rxq); + } + rte_errno = ret; + return -ret; +} + +/** + * Release common RSS context resources. + * + * As the reverse of mlx4_rss_init(), this must be done after removing all + * flow rules relying on indirection tables. + * + * @param priv + * Pointer to private structure. + */ +void +mlx4_rss_deinit(struct priv *priv) +{ + unsigned int i; + + for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) { + struct rxq *rxq = priv->dev->data->rx_queues[i]; + + if (rxq) { + assert(rxq->usecnt == 1); + mlx4_rxq_detach(rxq); + } + } +} + +/** + * Attach a user to a Rx queue. + * + * Used when the resources of an Rx queue must be instantiated for it to + * become in a usable state. + * + * This function increments the usage count of the Rx queue. + * + * @param rxq + * Pointer to Rx queue structure. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rxq_attach(struct rxq *rxq) +{ + if (rxq->usecnt++) { + assert(rxq->cq); + assert(rxq->wq); + assert(rxq->wqes); + assert(rxq->rq_db); + return 0; + } + + struct priv *priv = rxq->priv; + const uint32_t elts_n = 1 << rxq->elts_n; + const uint32_t sges_n = 1 << rxq->sges_n; + struct rte_mbuf *(*elts)[elts_n] = rxq->elts; + struct mlx4dv_obj mlxdv; + struct mlx4dv_rwq dv_rwq; + struct mlx4dv_cq dv_cq = { .comp_mask = MLX4DV_CQ_MASK_UAR, }; + const char *msg; + struct ibv_cq *cq = NULL; + struct ibv_wq *wq = NULL; + volatile struct mlx4_wqe_data_seg (*wqes)[]; + unsigned int i; + int ret; + + assert(rte_is_power_of_2(elts_n)); + cq = ibv_create_cq(priv->ctx, elts_n / sges_n, NULL, rxq->channel, 0); + if (!cq) { + ret = ENOMEM; + msg = "CQ creation failure"; + goto error; + } + wq = ibv_create_wq + (priv->ctx, + &(struct ibv_wq_init_attr){ + .wq_type = IBV_WQT_RQ, + .max_wr = elts_n / sges_n, + .max_sge = sges_n, + .pd = priv->pd, + .cq = cq, + }); + if (!wq) { + ret = errno ? errno : EINVAL; + msg = "WQ creation failure"; + goto error; + } + ret = ibv_modify_wq + (wq, + &(struct ibv_wq_attr){ + .attr_mask = IBV_WQ_ATTR_STATE, + .wq_state = IBV_WQS_RDY, + }); + if (ret) { + msg = "WQ state change to IBV_WQS_RDY failed"; + goto error; + } + /* Retrieve device queue information. */ + mlxdv.cq.in = cq; + mlxdv.cq.out = &dv_cq; + mlxdv.rwq.in = wq; + mlxdv.rwq.out = &dv_rwq; + ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_RWQ | MLX4DV_OBJ_CQ); + if (ret) { + msg = "failed to obtain device information from WQ/CQ objects"; + goto error; + } + wqes = (volatile struct mlx4_wqe_data_seg (*)[]) + ((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset); + for (i = 0; i != RTE_DIM(*elts); ++i) { + volatile struct mlx4_wqe_data_seg *scat = &(*wqes)[i]; + struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp); + + if (buf == NULL) { + while (i--) { + rte_pktmbuf_free_seg((*elts)[i]); + (*elts)[i] = NULL; + } + ret = ENOMEM; + msg = "cannot allocate mbuf"; + goto error; + } + /* Headroom is reserved by rte_pktmbuf_alloc(). */ + assert(buf->data_off == RTE_PKTMBUF_HEADROOM); + /* Buffer is supposed to be empty. */ + assert(rte_pktmbuf_data_len(buf) == 0); + assert(rte_pktmbuf_pkt_len(buf) == 0); + /* Only the first segment keeps headroom. */ + if (i % sges_n) + buf->data_off = 0; + buf->port = rxq->port_id; + buf->data_len = rte_pktmbuf_tailroom(buf); + buf->pkt_len = rte_pktmbuf_tailroom(buf); + buf->nb_segs = 1; + *scat = (struct mlx4_wqe_data_seg){ + .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, + uintptr_t)), + .byte_count = rte_cpu_to_be_32(buf->data_len), + .lkey = rte_cpu_to_be_32(rxq->mr->lkey), + }; + (*elts)[i] = buf; + } + DEBUG("%p: allocated and configured %u segments (max %u packets)", + (void *)rxq, elts_n, elts_n / sges_n); + rxq->cq = cq; + rxq->wq = wq; + rxq->wqes = wqes; + rxq->rq_db = dv_rwq.rdb; + rxq->mcq.buf = dv_cq.buf.buf; + rxq->mcq.cqe_cnt = dv_cq.cqe_cnt; + rxq->mcq.set_ci_db = dv_cq.set_ci_db; + rxq->mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0; + rxq->mcq.arm_db = dv_cq.arm_db; + rxq->mcq.arm_sn = dv_cq.arm_sn; + rxq->mcq.cqn = dv_cq.cqn; + rxq->mcq.cq_uar = dv_cq.cq_uar; + rxq->mcq.cq_db_reg = (uint8_t *)dv_cq.cq_uar + MLX4_CQ_DOORBELL; + /* Update doorbell counter. */ + rxq->rq_ci = elts_n / sges_n; + rte_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + return 0; +error: + if (wq) + claim_zero(ibv_destroy_wq(wq)); + if (cq) + claim_zero(ibv_destroy_cq(cq)); + rte_errno = ret; + ERROR("error while attaching Rx queue %p: %s: %s", + (void *)rxq, msg, strerror(ret)); + return -ret; +} + +/** + * Detach a user from a Rx queue. + * + * This function decrements the usage count of the Rx queue and destroys + * usage resources after reaching 0. + * + * @param rxq + * Pointer to Rx queue structure. + */ +void +mlx4_rxq_detach(struct rxq *rxq) +{ + unsigned int i; + struct rte_mbuf *(*elts)[1 << rxq->elts_n] = rxq->elts; + + if (--rxq->usecnt) + return; + rxq->rq_ci = 0; + memset(&rxq->mcq, 0, sizeof(rxq->mcq)); + rxq->rq_db = NULL; + rxq->wqes = NULL; + claim_zero(ibv_destroy_wq(rxq->wq)); + rxq->wq = NULL; + claim_zero(ibv_destroy_cq(rxq->cq)); + rxq->cq = NULL; + DEBUG("%p: freeing Rx queue elements", (void *)rxq); + for (i = 0; (i != RTE_DIM(*elts)); ++i) { + if (!(*elts)[i]) + continue; + rte_pktmbuf_free_seg((*elts)[i]); + (*elts)[i] = NULL; + } +} + +/** + * DPDK callback to configure a Rx queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Rx queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * @param mp + * Memory pool for buffer allocations. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_rxconf *conf, + struct rte_mempool *mp) +{ + struct priv *priv = dev->data->dev_private; + uint32_t mb_len = rte_pktmbuf_data_room_size(mp); + struct rte_mbuf *(*elts)[rte_align32pow2(desc)]; + struct rxq *rxq; + struct mlx4_malloc_vec vec[] = { + { + .align = RTE_CACHE_LINE_SIZE, + .size = sizeof(*rxq), + .addr = (void **)&rxq, + }, + { + .align = RTE_CACHE_LINE_SIZE, + .size = sizeof(*elts), + .addr = (void **)&elts, + }, + }; + int ret; + + (void)conf; /* Thresholds configuration (ignored). */ + DEBUG("%p: configuring queue %u for %u descriptors", + (void *)dev, idx, desc); + if (idx >= dev->data->nb_rx_queues) { + rte_errno = EOVERFLOW; + ERROR("%p: queue index out of range (%u >= %u)", + (void *)dev, idx, dev->data->nb_rx_queues); + return -rte_errno; + } + rxq = dev->data->rx_queues[idx]; + if (rxq) { + rte_errno = EEXIST; + ERROR("%p: Rx queue %u already configured, release it first", + (void *)dev, idx); + return -rte_errno; + } + if (!desc) { + rte_errno = EINVAL; + ERROR("%p: invalid number of Rx descriptors", (void *)dev); + return -rte_errno; + } + if (desc != RTE_DIM(*elts)) { + desc = RTE_DIM(*elts); + WARN("%p: increased number of descriptors in Rx queue %u" + " to the next power of two (%u)", + (void *)dev, idx, desc); + } + /* Allocate and initialize Rx queue. */ + mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket); + if (!rxq) { + ERROR("%p: unable to allocate queue index %u", + (void *)dev, idx); + return -rte_errno; + } + *rxq = (struct rxq){ + .priv = priv, + .mp = mp, + .port_id = dev->data->port_id, + .sges_n = 0, + .elts_n = rte_log2_u32(desc), + .elts = elts, + /* Toggle Rx checksum offload if hardware supports it. */ + .csum = (priv->hw_csum && + dev->data->dev_conf.rxmode.hw_ip_checksum), + .csum_l2tun = (priv->hw_csum_l2tun && + dev->data->dev_conf.rxmode.hw_ip_checksum), + .stats = { + .idx = idx, + }, + .socket = socket, + }; + /* Enable scattered packets support for this queue if necessary. */ + assert(mb_len >= RTE_PKTMBUF_HEADROOM); + if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= + (mb_len - RTE_PKTMBUF_HEADROOM)) { + ; + } else if (dev->data->dev_conf.rxmode.enable_scatter) { + uint32_t size = + RTE_PKTMBUF_HEADROOM + + dev->data->dev_conf.rxmode.max_rx_pkt_len; + uint32_t sges_n; + + /* + * Determine the number of SGEs needed for a full packet + * and round it to the next power of two. + */ + sges_n = rte_log2_u32((size / mb_len) + !!(size % mb_len)); + rxq->sges_n = sges_n; + /* Make sure sges_n did not overflow. */ + size = mb_len * (1 << rxq->sges_n); + size -= RTE_PKTMBUF_HEADROOM; + if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) { + rte_errno = EOVERFLOW; + ERROR("%p: too many SGEs (%u) needed to handle" + " requested maximum packet size %u", + (void *)dev, + 1 << sges_n, + dev->data->dev_conf.rxmode.max_rx_pkt_len); + goto error; + } + } else { + WARN("%p: the requested maximum Rx packet size (%u) is" + " larger than a single mbuf (%u) and scattered" + " mode has not been requested", + (void *)dev, + dev->data->dev_conf.rxmode.max_rx_pkt_len, + mb_len - RTE_PKTMBUF_HEADROOM); + } + DEBUG("%p: maximum number of segments per packet: %u", + (void *)dev, 1 << rxq->sges_n); + if (desc % (1 << rxq->sges_n)) { + rte_errno = EINVAL; + ERROR("%p: number of Rx queue descriptors (%u) is not a" + " multiple of maximum segments per packet (%u)", + (void *)dev, + desc, + 1 << rxq->sges_n); + goto error; + } + /* Use the entire Rx mempool as the memory region. */ + rxq->mr = mlx4_mr_get(priv, mp); + if (!rxq->mr) { + ERROR("%p: MR creation failure: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + if (dev->data->dev_conf.intr_conf.rxq) { + rxq->channel = ibv_create_comp_channel(priv->ctx); + if (rxq->channel == NULL) { + rte_errno = ENOMEM; + ERROR("%p: Rx interrupt completion channel creation" + " failure: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + if (mlx4_fd_set_non_blocking(rxq->channel->fd) < 0) { + ERROR("%p: unable to make Rx interrupt completion" + " channel non-blocking: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + } + DEBUG("%p: adding Rx queue %p to list", (void *)dev, (void *)rxq); + dev->data->rx_queues[idx] = rxq; + return 0; +error: + dev->data->rx_queues[idx] = NULL; + ret = rte_errno; + mlx4_rx_queue_release(rxq); + rte_errno = ret; + assert(rte_errno > 0); + return -rte_errno; +} + +/** + * DPDK callback to release a Rx queue. + * + * @param dpdk_rxq + * Generic Rx queue pointer. + */ +void +mlx4_rx_queue_release(void *dpdk_rxq) +{ + struct rxq *rxq = (struct rxq *)dpdk_rxq; + struct priv *priv; + unsigned int i; + + if (rxq == NULL) + return; + priv = rxq->priv; + for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) + if (priv->dev->data->rx_queues[i] == rxq) { + DEBUG("%p: removing Rx queue %p from list", + (void *)priv->dev, (void *)rxq); + priv->dev->data->rx_queues[i] = NULL; + break; + } + assert(!rxq->cq); + assert(!rxq->wq); + assert(!rxq->wqes); + assert(!rxq->rq_db); + if (rxq->channel) + claim_zero(ibv_destroy_comp_channel(rxq->channel)); + if (rxq->mr) + mlx4_mr_put(rxq->mr); + rte_free(rxq); +} diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c new file mode 100644 index 00000000..3985e06d --- /dev/null +++ b/drivers/net/mlx4/mlx4_rxtx.c @@ -0,0 +1,1071 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Data plane functions for mlx4 driver. + */ + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_branch_prediction.h> +#include <rte_common.h> +#include <rte_io.h> +#include <rte_mbuf.h> +#include <rte_mempool.h> +#include <rte_prefetch.h> + +#include "mlx4.h" +#include "mlx4_prm.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +#define WQE_ONE_DATA_SEG_SIZE \ + (sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg)) + +/** + * Pointer-value pair structure used in tx_post_send for saving the first + * DWORD (32 byte) of a TXBB. + */ +struct pv { + volatile struct mlx4_wqe_data_seg *dseg; + uint32_t val; +}; + +/** A table to translate Rx completion flags to packet type. */ +uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = { + /* + * The index to the array should have: + * bit[7] - MLX4_CQE_L2_TUNNEL + * bit[6] - MLX4_CQE_L2_TUNNEL_IPV4 + * bit[5] - MLX4_CQE_STATUS_UDP + * bit[4] - MLX4_CQE_STATUS_TCP + * bit[3] - MLX4_CQE_STATUS_IPV4OPT + * bit[2] - MLX4_CQE_STATUS_IPV6 + * bit[1] - MLX4_CQE_STATUS_IPV4F + * bit[0] - MLX4_CQE_STATUS_IPV4 + * giving a total of up to 256 entries. + */ + [0x00] = RTE_PTYPE_L2_ETHER, + [0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, + [0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, + [0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_FRAG, + [0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + [0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT, + [0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_FRAG, + [0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP, + [0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP, + [0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_TCP, + [0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_TCP, + [0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_TCP, + [0x1a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_TCP, + [0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP, + [0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP, + [0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_L4_UDP, + [0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_UDP, + [0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_UDP, + [0x2a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT | + RTE_PTYPE_L4_UDP, + /* Tunneled - L3 IPV6 */ + [0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN, + [0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, + [0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + [0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT, + [0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG, + /* Tunneled - L3 IPV6, TCP */ + [0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_TCP, + [0x93] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_TCP, + [0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_TCP, + [0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_TCP, + [0x9a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_TCP, + /* Tunneled - L3 IPV6, UDP */ + [0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_UDP, + [0xa3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_UDP, + [0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_UDP, + [0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_UDP, + [0xaa] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_UDP, + /* Tunneled - L3 IPV4 */ + [0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN, + [0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, + [0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG, + [0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, + [0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT, + [0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT, + [0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_FRAG, + /* Tunneled - L3 IPV4, TCP */ + [0xd0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_TCP, + [0xd3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_TCP, + [0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_TCP, + [0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_TCP, + [0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | + RTE_PTYPE_INNER_L4_TCP, + [0xda] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_TCP, + /* Tunneled - L3 IPV4, UDP */ + [0xe0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_UDP, + [0xe3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_UDP, + [0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN | + RTE_PTYPE_INNER_L4_UDP, + [0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP, + [0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP, + [0xea] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | + RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG | + RTE_PTYPE_INNER_L4_UDP, +}; + +/** + * Stamp a WQE so it won't be reused by the HW. + * + * Routine is used when freeing WQE used by the chip or when failing + * building an WQ entry has failed leaving partial information on the queue. + * + * @param sq + * Pointer to the SQ structure. + * @param index + * Index of the freed WQE. + * @param num_txbbs + * Number of blocks to stamp. + * If < 0 the routine will use the size written in the WQ entry. + * @param owner + * The value of the WQE owner bit to use in the stamp. + * + * @return + * The number of Tx basic blocs (TXBB) the WQE contained. + */ +static int +mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner) +{ + uint32_t stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL | + (!!owner << MLX4_SQ_STAMP_SHIFT)); + volatile uint8_t *wqe = mlx4_get_send_wqe(sq, + (index & sq->txbb_cnt_mask)); + volatile uint32_t *ptr = (volatile uint32_t *)wqe; + int i; + int txbbs_size; + int num_txbbs; + + /* Extract the size from the control segment of the WQE. */ + num_txbbs = MLX4_SIZE_TO_TXBBS((((volatile struct mlx4_wqe_ctrl_seg *) + wqe)->fence_size & 0x3f) << 4); + txbbs_size = num_txbbs * MLX4_TXBB_SIZE; + /* Optimize the common case when there is no wrap-around. */ + if (wqe + txbbs_size <= sq->eob) { + /* Stamp the freed descriptor. */ + for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) { + *ptr = stamp; + ptr += MLX4_SQ_STAMP_DWORDS; + } + } else { + /* Stamp the freed descriptor. */ + for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) { + *ptr = stamp; + ptr += MLX4_SQ_STAMP_DWORDS; + if ((volatile uint8_t *)ptr >= sq->eob) { + ptr = (volatile uint32_t *)sq->buf; + stamp ^= RTE_BE32(0x80000000); + } + } + } + return num_txbbs; +} + +/** + * Manage Tx completions. + * + * When sending a burst, mlx4_tx_burst() posts several WRs. + * To improve performance, a completion event is only required once every + * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information + * for other WRs, but this information would not be used anyway. + * + * @param txq + * Pointer to Tx queue structure. + * + * @return + * 0 on success, -1 on failure. + */ +static int +mlx4_txq_complete(struct txq *txq, const unsigned int elts_n, + struct mlx4_sq *sq) +{ + unsigned int elts_comp = txq->elts_comp; + unsigned int elts_tail = txq->elts_tail; + struct mlx4_cq *cq = &txq->mcq; + volatile struct mlx4_cqe *cqe; + uint32_t cons_index = cq->cons_index; + uint16_t new_index; + uint16_t nr_txbbs = 0; + int pkts = 0; + + /* + * Traverse over all CQ entries reported and handle each WQ entry + * reported by them. + */ + do { + cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index); + if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(cons_index & cq->cqe_cnt))) + break; + /* + * Make sure we read the CQE after we read the ownership bit. + */ + rte_io_rmb(); +#ifndef NDEBUG + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR)) { + volatile struct mlx4_err_cqe *cqe_err = + (volatile struct mlx4_err_cqe *)cqe; + ERROR("%p CQE error - vendor syndrome: 0x%x" + " syndrome: 0x%x\n", + (void *)txq, cqe_err->vendor_err, + cqe_err->syndrome); + } +#endif /* NDEBUG */ + /* Get WQE index reported in the CQE. */ + new_index = + rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask; + do { + /* Free next descriptor. */ + nr_txbbs += + mlx4_txq_stamp_freed_wqe(sq, + (sq->tail + nr_txbbs) & sq->txbb_cnt_mask, + !!((sq->tail + nr_txbbs) & sq->txbb_cnt)); + pkts++; + } while (((sq->tail + nr_txbbs) & sq->txbb_cnt_mask) != + new_index); + cons_index++; + } while (1); + if (unlikely(pkts == 0)) + return 0; + /* Update CQ. */ + cq->cons_index = cons_index; + *cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK); + sq->tail = sq->tail + nr_txbbs; + /* Update the list of packets posted for transmission. */ + elts_comp -= pkts; + assert(elts_comp <= txq->elts_comp); + /* + * Assume completion status is successful as nothing can be done about + * it anyway. + */ + elts_tail += pkts; + if (elts_tail >= elts_n) + elts_tail -= elts_n; + txq->elts_tail = elts_tail; + txq->elts_comp = elts_comp; + return 0; +} + +/** + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which + * the cloned mbuf is allocated is returned instead. + * + * @param buf + * Pointer to mbuf. + * + * @return + * Memory pool where data is located for given mbuf. + */ +static struct rte_mempool * +mlx4_txq_mb2mp(struct rte_mbuf *buf) +{ + if (unlikely(RTE_MBUF_INDIRECT(buf))) + return rte_mbuf_from_indirect(buf)->pool; + return buf->pool; +} + +static int +mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq, + volatile struct mlx4_wqe_ctrl_seg **pctrl) +{ + int wqe_real_size; + int nr_txbbs; + struct pv *pv = (struct pv *)txq->bounce_buf; + struct mlx4_sq *sq = &txq->msq; + uint32_t head_idx = sq->head & sq->txbb_cnt_mask; + volatile struct mlx4_wqe_ctrl_seg *ctrl; + volatile struct mlx4_wqe_data_seg *dseg; + struct rte_mbuf *sbuf; + uint32_t lkey; + uintptr_t addr; + uint32_t byte_count; + int pv_counter = 0; + + /* Calculate the needed work queue entry size for this packet. */ + wqe_real_size = sizeof(volatile struct mlx4_wqe_ctrl_seg) + + buf->nb_segs * sizeof(volatile struct mlx4_wqe_data_seg); + nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size); + /* + * Check that there is room for this WQE in the send queue and that + * the WQE size is legal. + */ + if (((sq->head - sq->tail) + nr_txbbs + + sq->headroom_txbbs) >= sq->txbb_cnt || + nr_txbbs > MLX4_MAX_WQE_TXBBS) { + return -1; + } + /* Get the control and data entries of the WQE. */ + ctrl = (volatile struct mlx4_wqe_ctrl_seg *) + mlx4_get_send_wqe(sq, head_idx); + dseg = (volatile struct mlx4_wqe_data_seg *) + ((uintptr_t)ctrl + sizeof(struct mlx4_wqe_ctrl_seg)); + *pctrl = ctrl; + /* Fill the data segments with buffer information. */ + for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) { + addr = rte_pktmbuf_mtod(sbuf, uintptr_t); + rte_prefetch0((volatile void *)addr); + /* Handle WQE wraparound. */ + if (dseg >= (volatile struct mlx4_wqe_data_seg *)sq->eob) + dseg = (volatile struct mlx4_wqe_data_seg *)sq->buf; + dseg->addr = rte_cpu_to_be_64(addr); + /* Memory region key (big endian) for this memory pool. */ + lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf)); + dseg->lkey = rte_cpu_to_be_32(lkey); +#ifndef NDEBUG + /* Calculate the needed work queue entry size for this packet */ + if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) { + /* MR does not exist. */ + DEBUG("%p: unable to get MP <-> MR association", + (void *)txq); + /* + * Restamp entry in case of failure. + * Make sure that size is written correctly + * Note that we give ownership to the SW, not the HW. + */ + wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) + + buf->nb_segs * sizeof(struct mlx4_wqe_data_seg); + ctrl->fence_size = (wqe_real_size >> 4) & 0x3f; + mlx4_txq_stamp_freed_wqe(sq, head_idx, + (sq->head & sq->txbb_cnt) ? 0 : 1); + return -1; + } +#endif /* NDEBUG */ + if (likely(sbuf->data_len)) { + byte_count = rte_cpu_to_be_32(sbuf->data_len); + } else { + /* + * Zero length segment is treated as inline segment + * with zero data. + */ + byte_count = RTE_BE32(0x80000000); + } + /* + * If the data segment is not at the beginning of a + * Tx basic block (TXBB) then write the byte count, + * else postpone the writing to just before updating the + * control segment. + */ + if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) { +#if RTE_CACHE_LINE_SIZE < 64 + /* + * Need a barrier here before writing the byte_count + * fields to make sure that all the data is visible + * before the byte_count field is set. + * Otherwise, if the segment begins a new cacheline, + * the HCA prefetcher could grab the 64-byte chunk and + * get a valid (!= 0xffffffff) byte count but stale + * data, and end up sending the wrong data. + */ + rte_io_wmb(); +#endif /* RTE_CACHE_LINE_SIZE */ + dseg->byte_count = byte_count; + } else { + /* + * This data segment starts at the beginning of a new + * TXBB, so we need to postpone its byte_count writing + * for later. + */ + pv[pv_counter].dseg = dseg; + pv[pv_counter++].val = byte_count; + } + } + /* Write the first DWORD of each TXBB save earlier. */ + if (pv_counter) { + /* Need a barrier here before writing the byte_count. */ + rte_io_wmb(); + for (--pv_counter; pv_counter >= 0; pv_counter--) + pv[pv_counter].dseg->byte_count = pv[pv_counter].val; + } + /* Fill the control parameters for this packet. */ + ctrl->fence_size = (wqe_real_size >> 4) & 0x3f; + return nr_txbbs; +} + +/** + * DPDK callback for Tx. + * + * @param dpdk_txq + * Generic pointer to Tx queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct txq *txq = (struct txq *)dpdk_txq; + unsigned int elts_head = txq->elts_head; + const unsigned int elts_n = txq->elts_n; + unsigned int bytes_sent = 0; + unsigned int i; + unsigned int max; + struct mlx4_sq *sq = &txq->msq; + int nr_txbbs; + + assert(txq->elts_comp_cd != 0); + if (likely(txq->elts_comp != 0)) + mlx4_txq_complete(txq, elts_n, sq); + max = (elts_n - (elts_head - txq->elts_tail)); + if (max > elts_n) + max -= elts_n; + assert(max >= 1); + assert(max <= elts_n); + /* Always leave one free entry in the ring. */ + --max; + if (max > pkts_n) + max = pkts_n; + for (i = 0; (i != max); ++i) { + struct rte_mbuf *buf = pkts[i]; + unsigned int elts_head_next = + (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); + struct txq_elt *elt_next = &(*txq->elts)[elts_head_next]; + struct txq_elt *elt = &(*txq->elts)[elts_head]; + uint32_t owner_opcode = MLX4_OPCODE_SEND; + volatile struct mlx4_wqe_ctrl_seg *ctrl; + volatile struct mlx4_wqe_data_seg *dseg; + union { + uint32_t flags; + uint16_t flags16[2]; + } srcrb; + uint32_t head_idx = sq->head & sq->txbb_cnt_mask; + uint32_t lkey; + uintptr_t addr; + + /* Clean up old buffer. */ + if (likely(elt->buf != NULL)) { + struct rte_mbuf *tmp = elt->buf; + +#ifndef NDEBUG + /* Poisoning. */ + memset(elt, 0x66, sizeof(*elt)); +#endif + /* Faster than rte_pktmbuf_free(). */ + do { + struct rte_mbuf *next = tmp->next; + + rte_pktmbuf_free_seg(tmp); + tmp = next; + } while (tmp != NULL); + } + RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); + if (buf->nb_segs == 1) { + /* + * Check that there is room for this WQE in the send + * queue and that the WQE size is legal + */ + if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs) >= + sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) { + elt->buf = NULL; + break; + } + /* Get the control and data entries of the WQE. */ + ctrl = (volatile struct mlx4_wqe_ctrl_seg *) + mlx4_get_send_wqe(sq, head_idx); + dseg = (volatile struct mlx4_wqe_data_seg *) + ((uintptr_t)ctrl + + sizeof(struct mlx4_wqe_ctrl_seg)); + addr = rte_pktmbuf_mtod(buf, uintptr_t); + rte_prefetch0((volatile void *)addr); + /* Handle WQE wraparound. */ + if (dseg >= + (volatile struct mlx4_wqe_data_seg *)sq->eob) + dseg = (volatile struct mlx4_wqe_data_seg *) + sq->buf; + dseg->addr = rte_cpu_to_be_64(addr); + /* Memory region key (big endian). */ + lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf)); + dseg->lkey = rte_cpu_to_be_32(lkey); +#ifndef NDEBUG + if (unlikely(dseg->lkey == + rte_cpu_to_be_32((uint32_t)-1))) { + /* MR does not exist. */ + DEBUG("%p: unable to get MP <-> MR association", + (void *)txq); + /* + * Restamp entry in case of failure. + * Make sure that size is written correctly + * Note that we give ownership to the SW, + * not the HW. + */ + ctrl->fence_size = + (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f; + mlx4_txq_stamp_freed_wqe(sq, head_idx, + (sq->head & sq->txbb_cnt) ? 0 : 1); + elt->buf = NULL; + break; + } +#endif /* NDEBUG */ + /* Never be TXBB aligned, no need compiler barrier. */ + dseg->byte_count = rte_cpu_to_be_32(buf->data_len); + /* Fill the control parameters for this packet. */ + ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f; + nr_txbbs = 1; + } else { + nr_txbbs = mlx4_tx_burst_segs(buf, txq, &ctrl); + if (nr_txbbs < 0) { + elt->buf = NULL; + break; + } + } + /* + * For raw Ethernet, the SOLICIT flag is used to indicate + * that no ICRC should be calculated. + */ + txq->elts_comp_cd -= nr_txbbs; + if (unlikely(txq->elts_comp_cd <= 0)) { + txq->elts_comp_cd = txq->elts_comp_cd_init; + srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT | + MLX4_WQE_CTRL_CQ_UPDATE); + } else { + srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT); + } + /* Enable HW checksum offload if requested */ + if (txq->csum && + (buf->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) { + const uint64_t is_tunneled = (buf->ol_flags & + (PKT_TX_TUNNEL_GRE | + PKT_TX_TUNNEL_VXLAN)); + + if (is_tunneled && txq->csum_l2tun) { + owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM | + MLX4_WQE_CTRL_IL4_HDR_CSUM; + if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM) + srcrb.flags |= + RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM); + } else { + srcrb.flags |= + RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM); + } + } + if (txq->lb) { + /* + * Copy destination MAC address to the WQE, this allows + * loopback in eSwitch, so that VFs and PF can + * communicate with each other. + */ + srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *)); + ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *, + sizeof(uint16_t))); + } else { + ctrl->imm = 0; + } + ctrl->srcrb_flags = srcrb.flags; + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + rte_io_wmb(); + ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode | + ((sq->head & sq->txbb_cnt) ? + MLX4_BIT_WQE_OWN : 0)); + sq->head += nr_txbbs; + elt->buf = buf; + bytes_sent += buf->pkt_len; + elts_head = elts_head_next; + } + /* Take a shortcut if nothing must be sent. */ + if (unlikely(i == 0)) + return 0; + /* Increment send statistics counters. */ + txq->stats.opackets += i; + txq->stats.obytes += bytes_sent; + /* Make sure that descriptors are written before doorbell record. */ + rte_wmb(); + /* Ring QP doorbell. */ + rte_write32(txq->msq.doorbell_qpn, txq->msq.db); + txq->elts_head = elts_head; + txq->elts_comp += i; + return i; +} + +/** + * Translate Rx completion flags to packet type. + * + * @param[in] cqe + * Pointer to CQE. + * + * @return + * Packet type for struct rte_mbuf. + */ +static inline uint32_t +rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe) +{ + uint8_t idx = 0; + uint32_t pinfo = rte_be_to_cpu_32(cqe->vlan_my_qpn); + uint32_t status = rte_be_to_cpu_32(cqe->status); + + /* + * The index to the array should have: + * bit[7] - MLX4_CQE_L2_TUNNEL + * bit[6] - MLX4_CQE_L2_TUNNEL_IPV4 + */ + if (!(pinfo & MLX4_CQE_L2_VLAN_MASK) && (pinfo & MLX4_CQE_L2_TUNNEL)) + idx |= ((pinfo & MLX4_CQE_L2_TUNNEL) >> 20) | + ((pinfo & MLX4_CQE_L2_TUNNEL_IPV4) >> 19); + /* + * The index to the array should have: + * bit[5] - MLX4_CQE_STATUS_UDP + * bit[4] - MLX4_CQE_STATUS_TCP + * bit[3] - MLX4_CQE_STATUS_IPV4OPT + * bit[2] - MLX4_CQE_STATUS_IPV6 + * bit[1] - MLX4_CQE_STATUS_IPV4F + * bit[0] - MLX4_CQE_STATUS_IPV4 + * giving a total of up to 256 entries. + */ + idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22); + return mlx4_ptype_table[idx]; +} + +/** + * Translate Rx completion flags to offload flags. + * + * @param flags + * Rx completion flags returned by mlx4_cqe_flags(). + * @param csum + * Whether Rx checksums are enabled. + * @param csum_l2tun + * Whether Rx L2 tunnel checksums are enabled. + * + * @return + * Offload flags (ol_flags) in mbuf format. + */ +static inline uint32_t +rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun) +{ + uint32_t ol_flags = 0; + + if (csum) + ol_flags |= + mlx4_transpose(flags, + MLX4_CQE_STATUS_IP_HDR_CSUM_OK, + PKT_RX_IP_CKSUM_GOOD) | + mlx4_transpose(flags, + MLX4_CQE_STATUS_TCP_UDP_CSUM_OK, + PKT_RX_L4_CKSUM_GOOD); + if ((flags & MLX4_CQE_L2_TUNNEL) && csum_l2tun) + ol_flags |= + mlx4_transpose(flags, + MLX4_CQE_L2_TUNNEL_IPOK, + PKT_RX_IP_CKSUM_GOOD) | + mlx4_transpose(flags, + MLX4_CQE_L2_TUNNEL_L4_CSUM, + PKT_RX_L4_CKSUM_GOOD); + return ol_flags; +} + +/** + * Extract checksum information from CQE flags. + * + * @param cqe + * Pointer to CQE structure. + * @param csum + * Whether Rx checksums are enabled. + * @param csum_l2tun + * Whether Rx L2 tunnel checksums are enabled. + * + * @return + * CQE checksum information. + */ +static inline uint32_t +mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun) +{ + uint32_t flags = 0; + + /* + * The relevant bits are in different locations on their + * CQE fields therefore we can join them in one 32bit + * variable. + */ + if (csum) + flags = (rte_be_to_cpu_32(cqe->status) & + MLX4_CQE_STATUS_IPV4_CSUM_OK); + if (csum_l2tun) + flags |= (rte_be_to_cpu_32(cqe->vlan_my_qpn) & + (MLX4_CQE_L2_TUNNEL | + MLX4_CQE_L2_TUNNEL_IPOK | + MLX4_CQE_L2_TUNNEL_L4_CSUM | + MLX4_CQE_L2_TUNNEL_IPV4)); + return flags; +} + +/** + * Poll one CQE from CQ. + * + * @param rxq + * Pointer to the receive queue structure. + * @param[out] out + * Just polled CQE. + * + * @return + * Number of bytes of the CQE, 0 in case there is no completion. + */ +static unsigned int +mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out) +{ + int ret = 0; + volatile struct mlx4_cqe *cqe = NULL; + struct mlx4_cq *cq = &rxq->mcq; + + cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index); + if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(cq->cons_index & cq->cqe_cnt)) + goto out; + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rte_rmb(); + assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)); + assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != + MLX4_CQE_OPCODE_ERROR); + ret = rte_be_to_cpu_32(cqe->byte_cnt); + ++cq->cons_index; +out: + *out = cqe; + return ret; +} + +/** + * DPDK callback for Rx with scattered packets support. + * + * @param dpdk_rxq + * Generic pointer to Rx queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct rxq *rxq = dpdk_rxq; + const uint32_t wr_cnt = (1 << rxq->elts_n) - 1; + const uint16_t sges_n = rxq->sges_n; + struct rte_mbuf *pkt = NULL; + struct rte_mbuf *seg = NULL; + unsigned int i = 0; + uint32_t rq_ci = rxq->rq_ci << sges_n; + int len = 0; + + while (pkts_n) { + volatile struct mlx4_cqe *cqe; + uint32_t idx = rq_ci & wr_cnt; + struct rte_mbuf *rep = (*rxq->elts)[idx]; + volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx]; + + /* Update the 'next' pointer of the previous segment. */ + if (pkt) + seg->next = rep; + seg = rep; + rte_prefetch0(seg); + rte_prefetch0(scat); + rep = rte_mbuf_raw_alloc(rxq->mp); + if (unlikely(rep == NULL)) { + ++rxq->stats.rx_nombuf; + if (!pkt) { + /* + * No buffers before we even started, + * bail out silently. + */ + break; + } + while (pkt != seg) { + assert(pkt != (*rxq->elts)[idx]); + rep = pkt->next; + pkt->next = NULL; + pkt->nb_segs = 1; + rte_mbuf_raw_free(pkt); + pkt = rep; + } + break; + } + if (!pkt) { + /* Looking for the new packet. */ + len = mlx4_cq_poll_one(rxq, &cqe); + if (!len) { + rte_mbuf_raw_free(rep); + break; + } + if (unlikely(len < 0)) { + /* Rx error, packet is likely too large. */ + rte_mbuf_raw_free(rep); + ++rxq->stats.idropped; + goto skip; + } + pkt = seg; + /* Update packet information. */ + pkt->packet_type = rxq_cq_to_pkt_type(cqe); + pkt->ol_flags = 0; + pkt->pkt_len = len; + if (rxq->csum | rxq->csum_l2tun) { + uint32_t flags = + mlx4_cqe_flags(cqe, + rxq->csum, + rxq->csum_l2tun); + + pkt->ol_flags = + rxq_cq_to_ol_flags(flags, + rxq->csum, + rxq->csum_l2tun); + } + } + rep->nb_segs = 1; + rep->port = rxq->port_id; + rep->data_len = seg->data_len; + rep->data_off = seg->data_off; + (*rxq->elts)[idx] = rep; + /* + * Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. + */ + scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); + if (len > seg->data_len) { + len -= seg->data_len; + ++pkt->nb_segs; + ++rq_ci; + continue; + } + /* The last segment. */ + seg->data_len = len; + /* Increment bytes counter. */ + rxq->stats.ibytes += pkt->pkt_len; + /* Return packet. */ + *(pkts++) = pkt; + pkt = NULL; + --pkts_n; + ++i; +skip: + /* Align consumer index to the next stride. */ + rq_ci >>= sges_n; + ++rq_ci; + rq_ci <<= sges_n; + } + if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci)) + return 0; + /* Update the consumer index. */ + rxq->rq_ci = rq_ci >> sges_n; + rte_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + *rxq->mcq.set_ci_db = + rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK); + /* Increment packets counter. */ + rxq->stats.ipackets += i; + return i; +} + +/** + * Dummy DPDK callback for Tx. + * + * This function is used to temporarily replace the real callback during + * unsafe control operations on the queue, or in case of error. + * + * @param dpdk_txq + * Generic pointer to Tx queue structure. + * @param[in] pkts + * Packets to transmit. + * @param pkts_n + * Number of packets in array. + * + * @return + * Number of packets successfully transmitted (<= pkts_n). + */ +uint16_t +mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + (void)dpdk_txq; + (void)pkts; + (void)pkts_n; + return 0; +} + +/** + * Dummy DPDK callback for Rx. + * + * This function is used to temporarily replace the real callback during + * unsafe control operations on the queue, or in case of error. + * + * @param dpdk_rxq + * Generic pointer to Rx queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + (void)dpdk_rxq; + (void)pkts; + (void)pkts_n; + return 0; +} diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h new file mode 100644 index 00000000..4acad801 --- /dev/null +++ b/drivers/net/mlx4/mlx4_rxtx.h @@ -0,0 +1,214 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MLX4_RXTX_H_ +#define MLX4_RXTX_H_ + +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/mlx4dv.h> +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_ethdev.h> +#include <rte_mbuf.h> +#include <rte_mempool.h> + +#include "mlx4.h" +#include "mlx4_prm.h" + +/** Rx queue counters. */ +struct mlx4_rxq_stats { + unsigned int idx; /**< Mapping index. */ + uint64_t ipackets; /**< Total of successfully received packets. */ + uint64_t ibytes; /**< Total of successfully received bytes. */ + uint64_t idropped; /**< Total of packets dropped when Rx ring full. */ + uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */ +}; + +/** Rx queue descriptor. */ +struct rxq { + struct priv *priv; /**< Back pointer to private data. */ + struct rte_mempool *mp; /**< Memory pool for allocations. */ + struct mlx4_mr *mr; /**< Memory region. */ + struct ibv_cq *cq; /**< Completion queue. */ + struct ibv_wq *wq; /**< Work queue. */ + struct ibv_comp_channel *channel; /**< Rx completion channel. */ + uint16_t rq_ci; /**< Saved RQ consumer index. */ + uint16_t port_id; /**< Port ID for incoming packets. */ + uint16_t sges_n; /**< Number of segments per packet (log2 value). */ + uint16_t elts_n; /**< Mbuf queue size (log2 value). */ + struct rte_mbuf *(*elts)[]; /**< Rx elements. */ + volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */ + volatile uint32_t *rq_db; /**< RQ doorbell record. */ + uint32_t csum:1; /**< Enable checksum offloading. */ + uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */ + struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */ + struct mlx4_rxq_stats stats; /**< Rx queue counters. */ + unsigned int socket; /**< CPU socket ID for allocations. */ + uint32_t usecnt; /**< Number of users relying on queue resources. */ + uint8_t data[]; /**< Remaining queue resources. */ +}; + +/** Shared flow target for Rx queues. */ +struct mlx4_rss { + LIST_ENTRY(mlx4_rss) next; /**< Next entry in list. */ + struct priv *priv; /**< Back pointer to private data. */ + uint32_t refcnt; /**< Reference count for this object. */ + uint32_t usecnt; /**< Number of users relying on @p qp and @p ind. */ + struct ibv_qp *qp; /**< Queue pair. */ + struct ibv_rwq_ind_table *ind; /**< Indirection table. */ + uint64_t fields; /**< Fields for RSS processing (Verbs format). */ + uint8_t key[MLX4_RSS_HASH_KEY_SIZE]; /**< Hash key to use. */ + uint16_t queues; /**< Number of target queues. */ + uint16_t queue_id[]; /**< Target queues. */ +}; + +/** Tx element. */ +struct txq_elt { + struct rte_mbuf *buf; /**< Buffer. */ +}; + +/** Rx queue counters. */ +struct mlx4_txq_stats { + unsigned int idx; /**< Mapping index. */ + uint64_t opackets; /**< Total of successfully sent packets. */ + uint64_t obytes; /**< Total of successfully sent bytes. */ + uint64_t odropped; /**< Total of packets not sent when Tx ring full. */ +}; + +/** Tx queue descriptor. */ +struct txq { + struct mlx4_sq msq; /**< Info for directly manipulating the SQ. */ + struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */ + unsigned int elts_head; /**< Current index in (*elts)[]. */ + unsigned int elts_tail; /**< First element awaiting completion. */ + unsigned int elts_comp; /**< Number of packets awaiting completion. */ + int elts_comp_cd; /**< Countdown for next completion. */ + unsigned int elts_comp_cd_init; /**< Initial value for countdown. */ + unsigned int elts_n; /**< (*elts)[] length. */ + struct txq_elt (*elts)[]; /**< Tx elements. */ + struct mlx4_txq_stats stats; /**< Tx queue counters. */ + uint32_t max_inline; /**< Max inline send size. */ + uint32_t csum:1; /**< Enable checksum offloading. */ + uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */ + uint32_t lb:1; /**< Whether packets should be looped back by eSwitch. */ + uint8_t *bounce_buf; + /**< Memory used for storing the first DWORD of data TXBBs. */ + struct { + const struct rte_mempool *mp; /**< Cached memory pool. */ + struct mlx4_mr *mr; /**< Memory region (for mp). */ + uint32_t lkey; /**< mr->lkey copy. */ + } mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */ + struct priv *priv; /**< Back pointer to private data. */ + unsigned int socket; /**< CPU socket ID for allocations. */ + struct ibv_cq *cq; /**< Completion queue. */ + struct ibv_qp *qp; /**< Queue pair. */ + uint8_t data[]; /**< Remaining queue resources. */ +}; + +/* mlx4_rxq.c */ + +uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE]; +int mlx4_rss_init(struct priv *priv); +void mlx4_rss_deinit(struct priv *priv); +struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields, + uint8_t key[MLX4_RSS_HASH_KEY_SIZE], + uint16_t queues, const uint16_t queue_id[]); +void mlx4_rss_put(struct mlx4_rss *rss); +int mlx4_rss_attach(struct mlx4_rss *rss); +void mlx4_rss_detach(struct mlx4_rss *rss); +int mlx4_rxq_attach(struct rxq *rxq); +void mlx4_rxq_detach(struct rxq *rxq); +int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_rxconf *conf, + struct rte_mempool *mp); +void mlx4_rx_queue_release(void *dpdk_rxq); + +/* mlx4_rxtx.c */ + +uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); +uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); + +/* mlx4_txq.c */ + +int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, + uint16_t desc, unsigned int socket, + const struct rte_eth_txconf *conf); +void mlx4_tx_queue_release(void *dpdk_txq); + +/** + * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[]. + * Call mlx4_txq_add_mr() if MP is not registered yet. + * + * @param txq + * Pointer to Tx queue structure. + * @param[in] mp + * Memory pool for which a memory region lkey must be returned. + * + * @return + * mr->lkey on success, (uint32_t)-1 on failure. + */ +static inline uint32_t +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) +{ + unsigned int i; + + for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { + if (unlikely(txq->mp2mr[i].mp == NULL)) { + /* Unknown MP, add a new MR for it. */ + break; + } + if (txq->mp2mr[i].mp == mp) { + /* MP found MP. */ + return txq->mp2mr[i].lkey; + } + } + return mlx4_txq_add_mr(txq, mp, i); +} + +#endif /* MLX4_RXTX_H_ */ diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c new file mode 100644 index 00000000..7882a4d0 --- /dev/null +++ b/drivers/net/mlx4/mlx4_txq.c @@ -0,0 +1,414 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Tx queues configuration for mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +/* Verbs headers do not support -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_ethdev.h> +#include <rte_malloc.h> +#include <rte_mbuf.h> +#include <rte_mempool.h> + +#include "mlx4.h" +#include "mlx4_autoconf.h" +#include "mlx4_prm.h" +#include "mlx4_rxtx.h" +#include "mlx4_utils.h" + +/** + * Free Tx queue elements. + * + * @param txq + * Pointer to Tx queue structure. + */ +static void +mlx4_txq_free_elts(struct txq *txq) +{ + unsigned int elts_head = txq->elts_head; + unsigned int elts_tail = txq->elts_tail; + struct txq_elt (*elts)[txq->elts_n] = txq->elts; + + DEBUG("%p: freeing WRs", (void *)txq); + while (elts_tail != elts_head) { + struct txq_elt *elt = &(*elts)[elts_tail]; + + assert(elt->buf != NULL); + rte_pktmbuf_free(elt->buf); + elt->buf = NULL; + if (++elts_tail == RTE_DIM(*elts)) + elts_tail = 0; + } + txq->elts_tail = txq->elts_head; +} + +struct txq_mp2mr_mbuf_check_data { + int ret; +}; + +/** + * Callback function for rte_mempool_obj_iter() to check whether a given + * mempool object looks like a mbuf. + * + * @param[in] mp + * The mempool pointer + * @param[in] arg + * Context data (struct mlx4_txq_mp2mr_mbuf_check_data). Contains the + * return value. + * @param[in] obj + * Object address. + * @param index + * Object index, unused. + */ +static void +mlx4_txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj, + uint32_t index) +{ + struct txq_mp2mr_mbuf_check_data *data = arg; + struct rte_mbuf *buf = obj; + + (void)index; + /* + * Check whether mbuf structure fits element size and whether mempool + * pointer is valid. + */ + if (sizeof(*buf) > mp->elt_size || buf->pool != mp) + data->ret = -1; +} + +/** + * Iterator function for rte_mempool_walk() to register existing mempools and + * fill the MP to MR cache of a Tx queue. + * + * @param[in] mp + * Memory Pool to register. + * @param *arg + * Pointer to Tx queue structure. + */ +static void +mlx4_txq_mp2mr_iter(struct rte_mempool *mp, void *arg) +{ + struct txq *txq = arg; + struct txq_mp2mr_mbuf_check_data data = { + .ret = 0, + }; + + /* Register mempool only if the first element looks like a mbuf. */ + if (rte_mempool_obj_iter(mp, mlx4_txq_mp2mr_mbuf_check, &data) == 0 || + data.ret == -1) + return; + mlx4_txq_mp2mr(txq, mp); +} + +/** + * Retrieves information needed in order to directly access the Tx queue. + * + * @param txq + * Pointer to Tx queue structure. + * @param mlxdv + * Pointer to device information for this Tx queue. + */ +static void +mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv) +{ + struct mlx4_sq *sq = &txq->msq; + struct mlx4_cq *cq = &txq->mcq; + struct mlx4dv_qp *dqp = mlxdv->qp.out; + struct mlx4dv_cq *dcq = mlxdv->cq.out; + uint32_t sq_size = (uint32_t)dqp->rq.offset - (uint32_t)dqp->sq.offset; + + sq->buf = (uint8_t *)dqp->buf.buf + dqp->sq.offset; + /* Total length, including headroom and spare WQEs. */ + sq->eob = sq->buf + sq_size; + sq->head = 0; + sq->tail = 0; + sq->txbb_cnt = + (dqp->sq.wqe_cnt << dqp->sq.wqe_shift) >> MLX4_TXBB_SHIFT; + sq->txbb_cnt_mask = sq->txbb_cnt - 1; + sq->db = dqp->sdb; + sq->doorbell_qpn = dqp->doorbell_qpn; + sq->headroom_txbbs = + (2048 + (1 << dqp->sq.wqe_shift)) >> MLX4_TXBB_SHIFT; + cq->buf = dcq->buf.buf; + cq->cqe_cnt = dcq->cqe_cnt; + cq->set_ci_db = dcq->set_ci_db; + cq->cqe_64 = (dcq->cqe_size & 64) ? 1 : 0; +} + +/** + * DPDK callback to configure a Tx queue. + * + * @param dev + * Pointer to Ethernet device structure. + * @param idx + * Tx queue index. + * @param desc + * Number of descriptors to configure in queue. + * @param socket + * NUMA socket on which memory must be allocated. + * @param[in] conf + * Thresholds parameters. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + unsigned int socket, const struct rte_eth_txconf *conf) +{ + struct priv *priv = dev->data->dev_private; + struct mlx4dv_obj mlxdv; + struct mlx4dv_qp dv_qp; + struct mlx4dv_cq dv_cq; + struct txq_elt (*elts)[desc]; + struct ibv_qp_init_attr qp_init_attr; + struct txq *txq; + uint8_t *bounce_buf; + struct mlx4_malloc_vec vec[] = { + { + .align = RTE_CACHE_LINE_SIZE, + .size = sizeof(*txq), + .addr = (void **)&txq, + }, + { + .align = RTE_CACHE_LINE_SIZE, + .size = sizeof(*elts), + .addr = (void **)&elts, + }, + { + .align = RTE_CACHE_LINE_SIZE, + .size = MLX4_MAX_WQE_SIZE, + .addr = (void **)&bounce_buf, + }, + }; + int ret; + + (void)conf; /* Thresholds configuration (ignored). */ + DEBUG("%p: configuring queue %u for %u descriptors", + (void *)dev, idx, desc); + if (idx >= dev->data->nb_tx_queues) { + rte_errno = EOVERFLOW; + ERROR("%p: queue index out of range (%u >= %u)", + (void *)dev, idx, dev->data->nb_tx_queues); + return -rte_errno; + } + txq = dev->data->tx_queues[idx]; + if (txq) { + rte_errno = EEXIST; + DEBUG("%p: Tx queue %u already configured, release it first", + (void *)dev, idx); + return -rte_errno; + } + if (!desc) { + rte_errno = EINVAL; + ERROR("%p: invalid number of Tx descriptors", (void *)dev); + return -rte_errno; + } + /* Allocate and initialize Tx queue. */ + mlx4_zmallocv_socket("TXQ", vec, RTE_DIM(vec), socket); + if (!txq) { + ERROR("%p: unable to allocate queue index %u", + (void *)dev, idx); + return -rte_errno; + } + *txq = (struct txq){ + .priv = priv, + .stats = { + .idx = idx, + }, + .socket = socket, + .elts_n = desc, + .elts = elts, + .elts_head = 0, + .elts_tail = 0, + .elts_comp = 0, + /* + * Request send completion every MLX4_PMD_TX_PER_COMP_REQ + * packets or at least 4 times per ring. + */ + .elts_comp_cd = + RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4), + .elts_comp_cd_init = + RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4), + .csum = priv->hw_csum, + .csum_l2tun = priv->hw_csum_l2tun, + /* Enable Tx loopback for VF devices. */ + .lb = !!priv->vf, + .bounce_buf = bounce_buf, + }; + txq->cq = ibv_create_cq(priv->ctx, desc, NULL, NULL, 0); + if (!txq->cq) { + rte_errno = ENOMEM; + ERROR("%p: CQ creation failure: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + qp_init_attr = (struct ibv_qp_init_attr){ + .send_cq = txq->cq, + .recv_cq = txq->cq, + .cap = { + .max_send_wr = + RTE_MIN(priv->device_attr.max_qp_wr, desc), + .max_send_sge = 1, + .max_inline_data = MLX4_PMD_MAX_INLINE, + }, + .qp_type = IBV_QPT_RAW_PACKET, + /* No completion events must occur by default. */ + .sq_sig_all = 0, + }; + txq->qp = ibv_create_qp(priv->pd, &qp_init_attr); + if (!txq->qp) { + rte_errno = errno ? errno : EINVAL; + ERROR("%p: QP creation failure: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + txq->max_inline = qp_init_attr.cap.max_inline_data; + ret = ibv_modify_qp + (txq->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_INIT, + .port_num = priv->port, + }, + IBV_QP_STATE | IBV_QP_PORT); + if (ret) { + rte_errno = ret; + ERROR("%p: QP state to IBV_QPS_INIT failed: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + ret = ibv_modify_qp + (txq->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_RTR, + }, + IBV_QP_STATE); + if (ret) { + rte_errno = ret; + ERROR("%p: QP state to IBV_QPS_RTR failed: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + ret = ibv_modify_qp + (txq->qp, + &(struct ibv_qp_attr){ + .qp_state = IBV_QPS_RTS, + }, + IBV_QP_STATE); + if (ret) { + rte_errno = ret; + ERROR("%p: QP state to IBV_QPS_RTS failed: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } + /* Retrieve device queue information. */ + mlxdv.cq.in = txq->cq; + mlxdv.cq.out = &dv_cq; + mlxdv.qp.in = txq->qp; + mlxdv.qp.out = &dv_qp; + ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ); + if (ret) { + rte_errno = EINVAL; + ERROR("%p: failed to obtain information needed for" + " accessing the device queues", (void *)dev); + goto error; + } + mlx4_txq_fill_dv_obj_info(txq, &mlxdv); + /* Pre-register known mempools. */ + rte_mempool_walk(mlx4_txq_mp2mr_iter, txq); + DEBUG("%p: adding Tx queue %p to list", (void *)dev, (void *)txq); + dev->data->tx_queues[idx] = txq; + return 0; +error: + dev->data->tx_queues[idx] = NULL; + ret = rte_errno; + mlx4_tx_queue_release(txq); + rte_errno = ret; + assert(rte_errno > 0); + return -rte_errno; +} + +/** + * DPDK callback to release a Tx queue. + * + * @param dpdk_txq + * Generic Tx queue pointer. + */ +void +mlx4_tx_queue_release(void *dpdk_txq) +{ + struct txq *txq = (struct txq *)dpdk_txq; + struct priv *priv; + unsigned int i; + + if (txq == NULL) + return; + priv = txq->priv; + for (i = 0; i != priv->dev->data->nb_tx_queues; ++i) + if (priv->dev->data->tx_queues[i] == txq) { + DEBUG("%p: removing Tx queue %p from list", + (void *)priv->dev, (void *)txq); + priv->dev->data->tx_queues[i] = NULL; + break; + } + mlx4_txq_free_elts(txq); + if (txq->qp) + claim_zero(ibv_destroy_qp(txq->qp)); + if (txq->cq) + claim_zero(ibv_destroy_cq(txq->cq)); + for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) { + if (!txq->mp2mr[i].mp) + break; + assert(txq->mp2mr[i].mr); + mlx4_mr_put(txq->mp2mr[i].mr); + } + rte_free(txq); +} diff --git a/drivers/net/mlx4/mlx4_utils.c b/drivers/net/mlx4/mlx4_utils.c new file mode 100644 index 00000000..f18c7145 --- /dev/null +++ b/drivers/net/mlx4/mlx4_utils.c @@ -0,0 +1,217 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Utility functions used by the mlx4 driver. + */ + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdint.h> + +#include <rte_errno.h> +#include <rte_malloc.h> +#include <rte_memory.h> + +#include "mlx4_utils.h" + +/** + * Make a file descriptor non-blocking. + * + * @param fd + * File descriptor to alter. + * + * @return + * 0 on success, negative errno value otherwise and rte_errno is set. + */ +int +mlx4_fd_set_non_blocking(int fd) +{ + int ret = fcntl(fd, F_GETFL); + + if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK)) + return 0; + assert(errno); + rte_errno = errno; + return -rte_errno; +} + +/** + * Internal helper to allocate memory once for several disparate objects. + * + * The most restrictive alignment constraint for standard objects is assumed + * to be sizeof(double) and is used as a default value. + * + * C11 code would include stdalign.h and use alignof(max_align_t) however + * we'll stick with C99 for the time being. + */ +static inline size_t +mlx4_mallocv_inline(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int zero, int socket) +{ + unsigned int i; + size_t size; + size_t least; + uint8_t *data = NULL; + int fill = !vec[0].addr; + +fill: + size = 0; + least = 0; + for (i = 0; i < cnt; ++i) { + size_t align = (uintptr_t)vec[i].align; + + if (!align) { + align = sizeof(double); + } else if (!rte_is_power_of_2(align)) { + rte_errno = EINVAL; + goto error; + } + if (least < align) + least = align; + align = RTE_ALIGN_CEIL(size, align); + size = align + vec[i].size; + if (fill && vec[i].addr) + *vec[i].addr = data + align; + } + if (fill) + return size; + if (!zero) + data = rte_malloc_socket(type, size, least, socket); + else + data = rte_zmalloc_socket(type, size, least, socket); + if (data) { + fill = 1; + goto fill; + } + rte_errno = ENOMEM; +error: + for (i = 0; i != cnt; ++i) + if (vec[i].addr) + *vec[i].addr = NULL; + return 0; +} + +/** + * Allocate memory once for several disparate objects. + * + * This function adds iovec-like semantics (e.g. readv()) to rte_malloc(). + * Memory is allocated once for several contiguous objects of nonuniform + * sizes and alignment constraints. + * + * Each entry of @p vec describes the size, alignment constraint and + * provides a buffer address where the resulting object pointer must be + * stored. + * + * The buffer of the first entry is guaranteed to point to the beginning of + * the allocated region and is safe to use with rte_free(). + * + * NULL buffers are silently ignored. + * + * Providing a NULL buffer in the first entry prevents this function from + * allocating any memory but has otherwise no effect on its behavior. In + * this case, the contents of remaining non-NULL buffers are updated with + * addresses relative to zero (i.e. offsets that would have been used during + * the allocation). + * + * @param[in] type + * A string identifying the type of allocated objects (useful for debug + * purposes, such as identifying the cause of a memory leak). Can be NULL. + * @param[in, out] vec + * Description of objects to allocate memory for. + * @param cnt + * Number of entries in @p vec. + * + * @return + * Size in bytes of the allocated region including any padding. In case of + * error, rte_errno is set, 0 is returned and NULL is stored in the + * non-NULL buffers pointed by @p vec. + * + * @see struct mlx4_malloc_vec + * @see rte_malloc() + */ +size_t +mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt) +{ + return mlx4_mallocv_inline(type, vec, cnt, 0, SOCKET_ID_ANY); +} + +/** + * Combines the semantics of mlx4_mallocv() with those of rte_zmalloc(). + * + * @see mlx4_mallocv() + * @see rte_zmalloc() + */ +size_t +mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt) +{ + return mlx4_mallocv_inline(type, vec, cnt, 1, SOCKET_ID_ANY); +} + +/** + * Socket-aware version of mlx4_mallocv(). + * + * This function takes one additional parameter. + * + * @param socket + * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this + * function will behave the same as mlx4_mallocv(). + * + * @see mlx4_mallocv() + * @see rte_malloc_socket() + */ +size_t +mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int socket) +{ + return mlx4_mallocv_inline(type, vec, cnt, 0, socket); +} + +/** + * Combines the semantics of mlx4_mallocv_socket() with those of + * mlx4_zmalloc_socket(). + * + * @see mlx4_mallocv_socket() + * @see rte_zmalloc_socket() + */ +size_t +mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int socket) +{ + return mlx4_mallocv_inline(type, vec, cnt, 1, socket); +} diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h new file mode 100644 index 00000000..dc529c9c --- /dev/null +++ b/drivers/net/mlx4/mlx4_utils.h @@ -0,0 +1,133 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MLX4_UTILS_H_ +#define MLX4_UTILS_H_ + +#include <assert.h> +#include <stddef.h> +#include <stdio.h> + +#include <rte_common.h> +#include <rte_log.h> + +#include "mlx4.h" + +#ifndef NDEBUG + +/* + * When debugging is enabled (NDEBUG not defined), file, line and function + * information replace the driver name (MLX4_DRIVER_NAME) in log messages. + */ + +/** Return the file name part of a path. */ +static inline const char * +pmd_drv_log_basename(const char *s) +{ + const char *n = s; + + while (*n) + if (*(n++) == '/') + s = n; + return s; +} + +#define PMD_DRV_LOG(level, ...) \ + RTE_LOG(level, PMD, \ + RTE_FMT("%s:%u: %s(): " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + pmd_drv_log_basename(__FILE__), \ + __LINE__, \ + __func__, \ + RTE_FMT_TAIL(__VA_ARGS__,))) +#define DEBUG(...) PMD_DRV_LOG(DEBUG, __VA_ARGS__) +#ifndef MLX4_PMD_DEBUG_BROKEN_VERBS +#define claim_zero(...) assert((__VA_ARGS__) == 0) +#else /* MLX4_PMD_DEBUG_BROKEN_VERBS */ +#define claim_zero(...) \ + (void)(((__VA_ARGS__) == 0) || \ + DEBUG("Assertion `(" # __VA_ARGS__ ") == 0' failed (IGNORED).")) +#endif /* MLX4_PMD_DEBUG_BROKEN_VERBS */ + +#else /* NDEBUG */ + +/* + * Like assert(), DEBUG() becomes a no-op and claim_zero() does not perform + * any check when debugging is disabled. + */ + +#define PMD_DRV_LOG(level, ...) \ + RTE_LOG(level, PMD, \ + RTE_FMT(MLX4_DRIVER_NAME ": " \ + RTE_FMT_HEAD(__VA_ARGS__,) "\n", \ + RTE_FMT_TAIL(__VA_ARGS__,))) +#define DEBUG(...) (void)0 +#define claim_zero(...) (__VA_ARGS__) + +#endif /* NDEBUG */ + +#define INFO(...) PMD_DRV_LOG(INFO, __VA_ARGS__) +#define WARN(...) PMD_DRV_LOG(WARNING, __VA_ARGS__) +#define ERROR(...) PMD_DRV_LOG(ERR, __VA_ARGS__) + +/** Allocate a buffer on the stack and fill it with a printf format string. */ +#define MKSTR(name, ...) \ + char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \ + \ + snprintf(name, sizeof(name), __VA_ARGS__) + +/** Generate a string out of the provided arguments. */ +#define MLX4_STR(...) # __VA_ARGS__ + +/** Similar to MLX4_STR() with enclosed macros expanded first. */ +#define MLX4_STR_EXPAND(...) MLX4_STR(__VA_ARGS__) + +/** Object description used with mlx4_mallocv() and similar functions. */ +struct mlx4_malloc_vec { + size_t align; /**< Alignment constraint (power of 2), 0 if unknown. */ + size_t size; /**< Object size. */ + void **addr; /**< Storage for allocation address. */ +}; + +/* mlx4_utils.c */ + +int mlx4_fd_set_non_blocking(int fd); +size_t mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt); +size_t mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt); +size_t mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int socket); +size_t mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec, + unsigned int cnt, int socket); + +#endif /* MLX4_UTILS_H_ */ |