aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/mlx4
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/mlx4')
-rw-r--r--drivers/net/mlx4/Makefile46
-rw-r--r--drivers/net/mlx4/mlx4.c6139
-rw-r--r--drivers/net/mlx4/mlx4.h401
-rw-r--r--drivers/net/mlx4/mlx4_ethdev.c1047
-rw-r--r--drivers/net/mlx4/mlx4_flow.c2062
-rw-r--r--drivers/net/mlx4/mlx4_flow.h73
-rw-r--r--drivers/net/mlx4/mlx4_intr.c397
-rw-r--r--drivers/net/mlx4/mlx4_mr.c293
-rw-r--r--drivers/net/mlx4/mlx4_prm.h177
-rw-r--r--drivers/net/mlx4/mlx4_rxq.c873
-rw-r--r--drivers/net/mlx4/mlx4_rxtx.c1071
-rw-r--r--drivers/net/mlx4/mlx4_rxtx.h214
-rw-r--r--drivers/net/mlx4/mlx4_txq.c414
-rw-r--r--drivers/net/mlx4/mlx4_utils.c217
-rw-r--r--drivers/net/mlx4/mlx4_utils.h133
15 files changed, 6318 insertions, 7239 deletions
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index c045bd79..f1f47c28 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -1,7 +1,7 @@
# BSD LICENSE
#
-# Copyright 2012-2015 6WIND S.A.
-# Copyright 2012 Mellanox.
+# Copyright 2012 6WIND S.A.
+# Copyright 2012 Mellanox
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -36,7 +36,14 @@ LIB = librte_pmd_mlx4.a
# Sources.
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_ethdev.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_txq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c
# Basic CFLAGS.
CFLAGS += -O3
@@ -47,7 +54,10 @@ CFLAGS += -D_BSD_SOURCE
CFLAGS += -D_DEFAULT_SOURCE
CFLAGS += -D_XOPEN_SOURCE=600
CFLAGS += $(WERROR_FLAGS)
-LDLIBS += -libverbs
+LDLIBS += -libverbs -lmlx4
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_pci
# A few warnings cannot be avoided in external headers.
CFLAGS += -Wno-error=cast-qual
@@ -68,22 +78,10 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif
-ifdef CONFIG_RTE_LIBRTE_MLX4_SGE_WR_N
-CFLAGS += -DMLX4_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX4_SGE_WR_N)
-endif
-
-ifdef CONFIG_RTE_LIBRTE_MLX4_MAX_INLINE
-CFLAGS += -DMLX4_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX4_MAX_INLINE)
-endif
-
ifdef CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE
CFLAGS += -DMLX4_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE)
endif
-ifdef CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS
-CFLAGS += -DMLX4_PMD_SOFT_COUNTERS=$(CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS)
-endif
-
ifeq ($(CONFIG_RTE_LIBRTE_MLX4_DEBUG_BROKEN_VERBS),y)
CFLAGS += -DMLX4_PMD_DEBUG_BROKEN_VERBS
endif
@@ -103,23 +101,7 @@ mlx4_autoconf.h.new: FORCE
mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
$Q $(RM) -f -- '$@'
- $Q sh -- '$<' '$@' \
- RSS_SUPPORT \
- infiniband/verbs.h \
- enum IBV_EXP_DEVICE_UD_RSS $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- INLINE_RECV \
- infiniband/verbs.h \
- enum IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_QUERY_DEVICE \
- infiniband/verbs.h \
- type 'struct ibv_exp_device_attr' $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK \
- infiniband/verbs.h \
- enum IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK \
- $(AUTOCONF_OUTPUT)
+ $Q : > '$@'
# Create mlx4_autoconf.h or update it in case it differs from the new one.
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 055de49a..f9e4f9d7 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -1,8 +1,8 @@
/*-
* BSD LICENSE
*
- * Copyright 2012-2017 6WIND S.A.
- * Copyright 2012-2017 Mellanox.
+ * Copyright 2012 6WIND S.A.
+ * Copyright 2012 Mellanox
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,95 +31,52 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-/*
- * Known limitations:
- * - RSS hash key and options cannot be modified.
- * - Hardware counters aren't implemented.
+/**
+ * @file
+ * mlx4 driver initialization.
*/
-/* System headers. */
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
#include <stddef.h>
+#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
-#include <stdint.h>
-#include <inttypes.h>
#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <limits.h>
-#include <assert.h>
-#include <arpa/inet.h>
-#include <net/if.h>
-#include <dirent.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <linux/ethtool.h>
-#include <linux/sockios.h>
-#include <fcntl.h>
-#include <rte_ether.h>
-#include <rte_ethdev.h>
-#include <rte_ethdev_pci.h>
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
#include <rte_dev.h>
-#include <rte_mbuf.h>
#include <rte_errno.h>
-#include <rte_mempool.h>
-#include <rte_prefetch.h>
-#include <rte_malloc.h>
-#include <rte_spinlock.h>
-#include <rte_atomic.h>
-#include <rte_version.h>
-#include <rte_log.h>
-#include <rte_alarm.h>
-#include <rte_memory.h>
+#include <rte_ethdev.h>
+#include <rte_ethdev_pci.h>
+#include <rte_ether.h>
#include <rte_flow.h>
-#include <rte_kvargs.h>
#include <rte_interrupts.h>
+#include <rte_kvargs.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
-/* Generated configuration header. */
-#include "mlx4_autoconf.h"
-
-/* PMD headers. */
#include "mlx4.h"
#include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
-/* Convenience macros for accessing mbuf fields. */
-#define NEXT(m) ((m)->next)
-#define DATA_LEN(m) ((m)->data_len)
-#define PKT_LEN(m) ((m)->pkt_len)
-#define DATA_OFF(m) ((m)->data_off)
-#define SET_DATA_OFF(m, o) ((m)->data_off = (o))
-#define NB_SEGS(m) ((m)->nb_segs)
-#define PORT(m) ((m)->port)
-
-/* Work Request ID data type (64 bit). */
-typedef union {
- struct {
- uint32_t id;
- uint16_t offset;
- } data;
- uint64_t raw;
-} wr_id_t;
-
-#define WR_ID(o) (((wr_id_t *)&(o))->data)
-
-/* Transpose flags. Useful to convert IBV to DPDK flags. */
-#define TRANSPOSE(val, from, to) \
- (((from) >= (to)) ? \
- (((val) & (from)) / ((from) / (to))) : \
- (((val) & (from)) * ((to) / (from))))
-
-/* Local storage for secondary process data. */
-struct mlx4_secondary_data {
- struct rte_eth_dev_data data; /* Local device data. */
- struct priv *primary_priv; /* Private structure from primary. */
- struct rte_eth_dev_data *shared_dev_data; /* Shared device data. */
- rte_spinlock_t lock; /* Port configuration lock. */
-} mlx4_secondary_data[RTE_MAX_ETHPORTS];
-
+/** Configuration structure for device arguments. */
struct mlx4_conf {
- uint8_t active_ports;
+ struct {
+ uint32_t present; /**< Bit-field for existing ports. */
+ uint32_t enabled; /**< Bit-field for user-enabled ports. */
+ } ports;
};
/* Available parameters list. */
@@ -128,593 +85,6 @@ const char *pmd_mlx4_init_params[] = {
NULL,
};
-static int
-mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
-
-static int
-mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
-
-static int
-priv_rx_intr_vec_enable(struct priv *priv);
-
-static void
-priv_rx_intr_vec_disable(struct priv *priv);
-
-/**
- * Check if running as a secondary process.
- *
- * @return
- * Nonzero if running as a secondary process.
- */
-static inline int
-mlx4_is_secondary(void)
-{
- return rte_eal_process_type() != RTE_PROC_PRIMARY;
-}
-
-/**
- * Return private structure associated with an Ethernet device.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- *
- * @return
- * Pointer to private structure.
- */
-static struct priv *
-mlx4_get_priv(struct rte_eth_dev *dev)
-{
- struct mlx4_secondary_data *sd;
-
- if (!mlx4_is_secondary())
- return dev->data->dev_private;
- sd = &mlx4_secondary_data[dev->data->port_id];
- return sd->data.dev_private;
-}
-
-/**
- * Lock private structure to protect it from concurrent access in the
- * control path.
- *
- * @param priv
- * Pointer to private structure.
- */
-void priv_lock(struct priv *priv)
-{
- rte_spinlock_lock(&priv->lock);
-}
-
-/**
- * Unlock private structure.
- *
- * @param priv
- * Pointer to private structure.
- */
-void priv_unlock(struct priv *priv)
-{
- rte_spinlock_unlock(&priv->lock);
-}
-
-/* Allocate a buffer on the stack and fill it with a printf format string. */
-#define MKSTR(name, ...) \
- char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \
- \
- snprintf(name, sizeof(name), __VA_ARGS__)
-
-/**
- * Get interface name from private structure.
- *
- * @param[in] priv
- * Pointer to private structure.
- * @param[out] ifname
- * Interface name output buffer.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
-{
- DIR *dir;
- struct dirent *dent;
- unsigned int dev_type = 0;
- unsigned int dev_port_prev = ~0u;
- char match[IF_NAMESIZE] = "";
-
- {
- MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);
-
- dir = opendir(path);
- if (dir == NULL)
- return -1;
- }
- while ((dent = readdir(dir)) != NULL) {
- char *name = dent->d_name;
- FILE *file;
- unsigned int dev_port;
- int r;
-
- if ((name[0] == '.') &&
- ((name[1] == '\0') ||
- ((name[1] == '.') && (name[2] == '\0'))))
- continue;
-
- MKSTR(path, "%s/device/net/%s/%s",
- priv->ctx->device->ibdev_path, name,
- (dev_type ? "dev_id" : "dev_port"));
-
- file = fopen(path, "rb");
- if (file == NULL) {
- if (errno != ENOENT)
- continue;
- /*
- * Switch to dev_id when dev_port does not exist as
- * is the case with Linux kernel versions < 3.15.
- */
-try_dev_id:
- match[0] = '\0';
- if (dev_type)
- break;
- dev_type = 1;
- dev_port_prev = ~0u;
- rewinddir(dir);
- continue;
- }
- r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
- fclose(file);
- if (r != 1)
- continue;
- /*
- * Switch to dev_id when dev_port returns the same value for
- * all ports. May happen when using a MOFED release older than
- * 3.0 with a Linux kernel >= 3.15.
- */
- if (dev_port == dev_port_prev)
- goto try_dev_id;
- dev_port_prev = dev_port;
- if (dev_port == (priv->port - 1u))
- snprintf(match, sizeof(match), "%s", name);
- }
- closedir(dir);
- if (match[0] == '\0')
- return -1;
- strncpy(*ifname, match, sizeof(*ifname));
- return 0;
-}
-
-/**
- * Read from sysfs entry.
- *
- * @param[in] priv
- * Pointer to private structure.
- * @param[in] entry
- * Entry name relative to sysfs path.
- * @param[out] buf
- * Data output buffer.
- * @param size
- * Buffer size.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_sysfs_read(const struct priv *priv, const char *entry,
- char *buf, size_t size)
-{
- char ifname[IF_NAMESIZE];
- FILE *file;
- int ret;
- int err;
-
- if (priv_get_ifname(priv, &ifname))
- return -1;
-
- MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
- ifname, entry);
-
- file = fopen(path, "rb");
- if (file == NULL)
- return -1;
- ret = fread(buf, 1, size, file);
- err = errno;
- if (((size_t)ret < size) && (ferror(file)))
- ret = -1;
- else
- ret = size;
- fclose(file);
- errno = err;
- return ret;
-}
-
-/**
- * Write to sysfs entry.
- *
- * @param[in] priv
- * Pointer to private structure.
- * @param[in] entry
- * Entry name relative to sysfs path.
- * @param[in] buf
- * Data buffer.
- * @param size
- * Buffer size.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_sysfs_write(const struct priv *priv, const char *entry,
- char *buf, size_t size)
-{
- char ifname[IF_NAMESIZE];
- FILE *file;
- int ret;
- int err;
-
- if (priv_get_ifname(priv, &ifname))
- return -1;
-
- MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
- ifname, entry);
-
- file = fopen(path, "wb");
- if (file == NULL)
- return -1;
- ret = fwrite(buf, 1, size, file);
- err = errno;
- if (((size_t)ret < size) || (ferror(file)))
- ret = -1;
- else
- ret = size;
- fclose(file);
- errno = err;
- return ret;
-}
-
-/**
- * Get unsigned long sysfs property.
- *
- * @param priv
- * Pointer to private structure.
- * @param[in] name
- * Entry name relative to sysfs path.
- * @param[out] value
- * Value output buffer.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
-{
- int ret;
- unsigned long value_ret;
- char value_str[32];
-
- ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
- if (ret == -1) {
- DEBUG("cannot read %s value from sysfs: %s",
- name, strerror(errno));
- return -1;
- }
- value_str[ret] = '\0';
- errno = 0;
- value_ret = strtoul(value_str, NULL, 0);
- if (errno) {
- DEBUG("invalid %s value `%s': %s", name, value_str,
- strerror(errno));
- return -1;
- }
- *value = value_ret;
- return 0;
-}
-
-/**
- * Set unsigned long sysfs property.
- *
- * @param priv
- * Pointer to private structure.
- * @param[in] name
- * Entry name relative to sysfs path.
- * @param value
- * Value to set.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
-{
- int ret;
- MKSTR(value_str, "%lu", value);
-
- ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
- if (ret == -1) {
- DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
- name, value_str, value, strerror(errno));
- return -1;
- }
- return 0;
-}
-
-/**
- * Perform ifreq ioctl() on associated Ethernet device.
- *
- * @param[in] priv
- * Pointer to private structure.
- * @param req
- * Request number to pass to ioctl().
- * @param[out] ifr
- * Interface request structure output buffer.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
-{
- int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
- int ret = -1;
-
- if (sock == -1)
- return ret;
- if (priv_get_ifname(priv, &ifr->ifr_name) == 0)
- ret = ioctl(sock, req, ifr);
- close(sock);
- return ret;
-}
-
-/**
- * Get device MTU.
- *
- * @param priv
- * Pointer to private structure.
- * @param[out] mtu
- * MTU value output buffer.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_get_mtu(struct priv *priv, uint16_t *mtu)
-{
- unsigned long ulong_mtu;
-
- if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1)
- return -1;
- *mtu = ulong_mtu;
- return 0;
-}
-
-/**
- * Set device MTU.
- *
- * @param priv
- * Pointer to private structure.
- * @param mtu
- * MTU value to set.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_set_mtu(struct priv *priv, uint16_t mtu)
-{
- uint16_t new_mtu;
-
- if (priv_set_sysfs_ulong(priv, "mtu", mtu) ||
- priv_get_mtu(priv, &new_mtu))
- return -1;
- if (new_mtu == mtu)
- return 0;
- errno = EINVAL;
- return -1;
-}
-
-/**
- * Set device flags.
- *
- * @param priv
- * Pointer to private structure.
- * @param keep
- * Bitmask for flags that must remain untouched.
- * @param flags
- * Bitmask for flags to modify.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
-{
- unsigned long tmp;
-
- if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1)
- return -1;
- tmp &= keep;
- tmp |= (flags & (~keep));
- return priv_set_sysfs_ulong(priv, "flags", tmp);
-}
-
-/* Device configuration. */
-
-static int
-txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
- unsigned int socket, const struct rte_eth_txconf *conf);
-
-static void
-txq_cleanup(struct txq *txq);
-
-static int
-rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
- unsigned int socket, int inactive,
- const struct rte_eth_rxconf *conf,
- struct rte_mempool *mp, int children_n,
- struct rxq *rxq_parent);
-
-static void
-rxq_cleanup(struct rxq *rxq);
-
-/**
- * Create RSS parent queue.
- *
- * The new parent is inserted in front of the list in the private structure.
- *
- * @param priv
- * Pointer to private structure.
- * @param queues
- * Queues indices array, if NULL use all Rx queues.
- * @param children_n
- * The number of entries in queues[].
- *
- * @return
- * Pointer to a parent rxq structure, NULL on failure.
- */
-struct rxq *
-priv_parent_create(struct priv *priv,
- uint16_t queues[],
- uint16_t children_n)
-{
- int ret;
- uint16_t i;
- struct rxq *parent;
-
- parent = rte_zmalloc("parent queue",
- sizeof(*parent),
- RTE_CACHE_LINE_SIZE);
- if (!parent) {
- ERROR("cannot allocate memory for RSS parent queue");
- return NULL;
- }
- ret = rxq_setup(priv->dev, parent, 0, 0, 0,
- NULL, NULL, children_n, NULL);
- if (ret) {
- rte_free(parent);
- return NULL;
- }
- parent->rss.queues_n = children_n;
- if (queues) {
- for (i = 0; i < children_n; ++i)
- parent->rss.queues[i] = queues[i];
- } else {
- /* the default RSS ring case */
- assert(priv->rxqs_n == children_n);
- for (i = 0; i < priv->rxqs_n; ++i)
- parent->rss.queues[i] = i;
- }
- LIST_INSERT_HEAD(&priv->parents, parent, next);
- return parent;
-}
-
-/**
- * Clean up RX queue parent structure.
- *
- * @param parent
- * RX queue parent structure.
- */
-void
-rxq_parent_cleanup(struct rxq *parent)
-{
- LIST_REMOVE(parent, next);
- rxq_cleanup(parent);
- rte_free(parent);
-}
-
-/**
- * Clean up parent structures from the parent list.
- *
- * @param priv
- * Pointer to private structure.
- */
-static void
-priv_parent_list_cleanup(struct priv *priv)
-{
- while (!LIST_EMPTY(&priv->parents))
- rxq_parent_cleanup(LIST_FIRST(&priv->parents));
-}
-
-/**
- * Ethernet device configuration.
- *
- * Prepare the driver for a given number of TX and RX queues.
- * Allocate parent RSS queue when several RX queues are requested.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-dev_configure(struct rte_eth_dev *dev)
-{
- struct priv *priv = dev->data->dev_private;
- unsigned int rxqs_n = dev->data->nb_rx_queues;
- unsigned int txqs_n = dev->data->nb_tx_queues;
- unsigned int tmp;
-
- priv->rxqs = (void *)dev->data->rx_queues;
- priv->txqs = (void *)dev->data->tx_queues;
- if (txqs_n != priv->txqs_n) {
- INFO("%p: TX queues number update: %u -> %u",
- (void *)dev, priv->txqs_n, txqs_n);
- priv->txqs_n = txqs_n;
- }
- if (rxqs_n == priv->rxqs_n)
- return 0;
- if (!rte_is_power_of_2(rxqs_n) && !priv->isolated) {
- unsigned n_active;
-
- n_active = rte_align32pow2(rxqs_n + 1) >> 1;
- WARN("%p: number of RX queues must be a power"
- " of 2: %u queues among %u will be active",
- (void *)dev, n_active, rxqs_n);
- }
-
- INFO("%p: RX queues number update: %u -> %u",
- (void *)dev, priv->rxqs_n, rxqs_n);
- /* If RSS is enabled, disable it first. */
- if (priv->rss) {
- unsigned int i;
-
- /* Only if there are no remaining child RX queues. */
- for (i = 0; (i != priv->rxqs_n); ++i)
- if ((*priv->rxqs)[i] != NULL)
- return EINVAL;
- priv_parent_list_cleanup(priv);
- priv->rss = 0;
- priv->rxqs_n = 0;
- }
- if (rxqs_n <= 1) {
- /* Nothing else to do. */
- priv->rxqs_n = rxqs_n;
- return 0;
- }
- /* Allocate a new RSS parent queue if supported by hardware. */
- if (!priv->hw_rss) {
- ERROR("%p: only a single RX queue can be configured when"
- " hardware doesn't support RSS",
- (void *)dev);
- return EINVAL;
- }
- /* Fail if hardware doesn't support that many RSS queues. */
- if (rxqs_n >= priv->max_rss_tbl_sz) {
- ERROR("%p: only %u RX queues can be configured for RSS",
- (void *)dev, priv->max_rss_tbl_sz);
- return EINVAL;
- }
- priv->rss = 1;
- tmp = priv->rxqs_n;
- priv->rxqs_n = rxqs_n;
- if (priv->isolated)
- return 0;
- if (priv_parent_create(priv, NULL, priv->rxqs_n))
- return 0;
- /* Failure, rollback. */
- priv->rss = 0;
- priv->rxqs_n = tmp;
- return ENOMEM;
-}
-
/**
* DPDK callback for Ethernet device configuration.
*
@@ -722,3490 +92,78 @@ dev_configure(struct rte_eth_dev *dev)
* Pointer to Ethernet device structure.
*
* @return
- * 0 on success, negative errno value on failure.
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
static int
mlx4_dev_configure(struct rte_eth_dev *dev)
{
struct priv *priv = dev->data->dev_private;
+ struct rte_flow_error error;
int ret;
- if (mlx4_is_secondary())
- return -E_RTE_SECONDARY;
- priv_lock(priv);
- ret = dev_configure(dev);
- assert(ret >= 0);
- priv_unlock(priv);
- return -ret;
-}
-
-static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t);
-static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
-
-/**
- * Configure secondary process queues from a private data pointer (primary
- * or secondary) and update burst callbacks. Can take place only once.
- *
- * All queues must have been previously created by the primary process to
- * avoid undefined behavior.
- *
- * @param priv
- * Private data pointer from either primary or secondary process.
- *
- * @return
- * Private data pointer from secondary process, NULL in case of error.
- */
-static struct priv *
-mlx4_secondary_data_setup(struct priv *priv)
-{
- unsigned int port_id = 0;
- struct mlx4_secondary_data *sd;
- void **tx_queues;
- void **rx_queues;
- unsigned int nb_tx_queues;
- unsigned int nb_rx_queues;
- unsigned int i;
-
- /* priv must be valid at this point. */
- assert(priv != NULL);
- /* priv->dev must also be valid but may point to local memory from
- * another process, possibly with the same address and must not
- * be dereferenced yet. */
- assert(priv->dev != NULL);
- /* Determine port ID by finding out where priv comes from. */
- while (1) {
- sd = &mlx4_secondary_data[port_id];
- rte_spinlock_lock(&sd->lock);
- /* Primary process? */
- if (sd->primary_priv == priv)
- break;
- /* Secondary process? */
- if (sd->data.dev_private == priv)
- break;
- rte_spinlock_unlock(&sd->lock);
- if (++port_id == RTE_DIM(mlx4_secondary_data))
- port_id = 0;
- }
- /* Switch to secondary private structure. If private data has already
- * been updated by another thread, there is nothing else to do. */
- priv = sd->data.dev_private;
- if (priv->dev->data == &sd->data)
- goto end;
- /* Sanity checks. Secondary private structure is supposed to point
- * to local eth_dev, itself still pointing to the shared device data
- * structure allocated by the primary process. */
- assert(sd->shared_dev_data != &sd->data);
- assert(sd->data.nb_tx_queues == 0);
- assert(sd->data.tx_queues == NULL);
- assert(sd->data.nb_rx_queues == 0);
- assert(sd->data.rx_queues == NULL);
- assert(priv != sd->primary_priv);
- assert(priv->dev->data == sd->shared_dev_data);
- assert(priv->txqs_n == 0);
- assert(priv->txqs == NULL);
- assert(priv->rxqs_n == 0);
- assert(priv->rxqs == NULL);
- nb_tx_queues = sd->shared_dev_data->nb_tx_queues;
- nb_rx_queues = sd->shared_dev_data->nb_rx_queues;
- /* Allocate local storage for queues. */
- tx_queues = rte_zmalloc("secondary ethdev->tx_queues",
- sizeof(sd->data.tx_queues[0]) * nb_tx_queues,
- RTE_CACHE_LINE_SIZE);
- rx_queues = rte_zmalloc("secondary ethdev->rx_queues",
- sizeof(sd->data.rx_queues[0]) * nb_rx_queues,
- RTE_CACHE_LINE_SIZE);
- if (tx_queues == NULL || rx_queues == NULL)
- goto error;
- /* Lock to prevent control operations during setup. */
- priv_lock(priv);
- /* TX queues. */
- for (i = 0; i != nb_tx_queues; ++i) {
- struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
- struct txq *txq;
-
- if (primary_txq == NULL)
- continue;
- txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0,
- primary_txq->socket);
- if (txq != NULL) {
- if (txq_setup(priv->dev,
- txq,
- primary_txq->elts_n * MLX4_PMD_SGE_WR_N,
- primary_txq->socket,
- NULL) == 0) {
- txq->stats.idx = primary_txq->stats.idx;
- tx_queues[i] = txq;
- continue;
- }
- rte_free(txq);
- }
- while (i) {
- txq = tx_queues[--i];
- txq_cleanup(txq);
- rte_free(txq);
- }
- goto error;
- }
- /* RX queues. */
- for (i = 0; i != nb_rx_queues; ++i) {
- struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i];
-
- if (primary_rxq == NULL)
- continue;
- /* Not supported yet. */
- rx_queues[i] = NULL;
- }
- /* Update everything. */
- priv->txqs = (void *)tx_queues;
- priv->txqs_n = nb_tx_queues;
- priv->rxqs = (void *)rx_queues;
- priv->rxqs_n = nb_rx_queues;
- sd->data.rx_queues = rx_queues;
- sd->data.tx_queues = tx_queues;
- sd->data.nb_rx_queues = nb_rx_queues;
- sd->data.nb_tx_queues = nb_tx_queues;
- sd->data.dev_link = sd->shared_dev_data->dev_link;
- sd->data.mtu = sd->shared_dev_data->mtu;
- memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state,
- sizeof(sd->data.rx_queue_state));
- memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state,
- sizeof(sd->data.tx_queue_state));
- sd->data.dev_flags = sd->shared_dev_data->dev_flags;
- /* Use local data from now on. */
- rte_mb();
- priv->dev->data = &sd->data;
- rte_mb();
- priv->dev->tx_pkt_burst = mlx4_tx_burst;
- priv->dev->rx_pkt_burst = removed_rx_burst;
- priv_unlock(priv);
-end:
- /* More sanity checks. */
- assert(priv->dev->tx_pkt_burst == mlx4_tx_burst);
- assert(priv->dev->rx_pkt_burst == removed_rx_burst);
- assert(priv->dev->data == &sd->data);
- rte_spinlock_unlock(&sd->lock);
- return priv;
-error:
- priv_unlock(priv);
- rte_free(tx_queues);
- rte_free(rx_queues);
- rte_spinlock_unlock(&sd->lock);
- return NULL;
-}
-
-/* TX queues handling. */
-
-/**
- * Allocate TX queue elements.
- *
- * @param txq
- * Pointer to TX queue structure.
- * @param elts_n
- * Number of elements to allocate.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-txq_alloc_elts(struct txq *txq, unsigned int elts_n)
-{
- unsigned int i;
- struct txq_elt (*elts)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
- linear_t (*elts_linear)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0,
- txq->socket);
- struct ibv_mr *mr_linear = NULL;
- int ret = 0;
-
- if ((elts == NULL) || (elts_linear == NULL)) {
- ERROR("%p: can't allocate packets array", (void *)txq);
- ret = ENOMEM;
- goto error;
- }
- mr_linear =
- ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear),
- IBV_ACCESS_LOCAL_WRITE);
- if (mr_linear == NULL) {
- ERROR("%p: unable to configure MR, ibv_reg_mr() failed",
- (void *)txq);
- ret = EINVAL;
- goto error;
- }
- for (i = 0; (i != elts_n); ++i) {
- struct txq_elt *elt = &(*elts)[i];
-
- elt->buf = NULL;
- }
- DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n);
- txq->elts_n = elts_n;
- txq->elts = elts;
- txq->elts_head = 0;
- txq->elts_tail = 0;
- txq->elts_comp = 0;
- /* Request send completion every MLX4_PMD_TX_PER_COMP_REQ packets or
- * at least 4 times per ring. */
- txq->elts_comp_cd_init =
- ((MLX4_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
- MLX4_PMD_TX_PER_COMP_REQ : (elts_n / 4));
- txq->elts_comp_cd = txq->elts_comp_cd_init;
- txq->elts_linear = elts_linear;
- txq->mr_linear = mr_linear;
- assert(ret == 0);
- return 0;
-error:
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));
-
- rte_free(elts_linear);
- rte_free(elts);
-
- DEBUG("%p: failed, freed everything", (void *)txq);
- assert(ret > 0);
- return ret;
-}
-
-/**
- * Free TX queue elements.
- *
- * @param txq
- * Pointer to TX queue structure.
- */
-static void
-txq_free_elts(struct txq *txq)
-{
- unsigned int elts_n = txq->elts_n;
- unsigned int elts_head = txq->elts_head;
- unsigned int elts_tail = txq->elts_tail;
- struct txq_elt (*elts)[elts_n] = txq->elts;
- linear_t (*elts_linear)[elts_n] = txq->elts_linear;
- struct ibv_mr *mr_linear = txq->mr_linear;
-
- DEBUG("%p: freeing WRs", (void *)txq);
- txq->elts_n = 0;
- txq->elts_head = 0;
- txq->elts_tail = 0;
- txq->elts_comp = 0;
- txq->elts_comp_cd = 0;
- txq->elts_comp_cd_init = 0;
- txq->elts = NULL;
- txq->elts_linear = NULL;
- txq->mr_linear = NULL;
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));
-
- rte_free(elts_linear);
- if (elts == NULL)
- return;
- while (elts_tail != elts_head) {
- struct txq_elt *elt = &(*elts)[elts_tail];
-
- assert(elt->buf != NULL);
- rte_pktmbuf_free(elt->buf);
-#ifndef NDEBUG
- /* Poisoning. */
- memset(elt, 0x77, sizeof(*elt));
-#endif
- if (++elts_tail == elts_n)
- elts_tail = 0;
- }
- rte_free(elts);
-}
-
-
-/**
- * Clean up a TX queue.
- *
- * Destroy objects, free allocated memory and reset the structure for reuse.
- *
- * @param txq
- * Pointer to TX queue structure.
- */
-static void
-txq_cleanup(struct txq *txq)
-{
- struct ibv_exp_release_intf_params params;
- size_t i;
-
- DEBUG("cleaning up %p", (void *)txq);
- txq_free_elts(txq);
- if (txq->if_qp != NULL) {
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- assert(txq->qp != NULL);
- params = (struct ibv_exp_release_intf_params){
- .comp_mask = 0,
- };
- claim_zero(ibv_exp_release_intf(txq->priv->ctx,
- txq->if_qp,
- &params));
- }
- if (txq->if_cq != NULL) {
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- assert(txq->cq != NULL);
- params = (struct ibv_exp_release_intf_params){
- .comp_mask = 0,
- };
- claim_zero(ibv_exp_release_intf(txq->priv->ctx,
- txq->if_cq,
- &params));
- }
- if (txq->qp != NULL)
- claim_zero(ibv_destroy_qp(txq->qp));
- if (txq->cq != NULL)
- claim_zero(ibv_destroy_cq(txq->cq));
- if (txq->rd != NULL) {
- struct ibv_exp_destroy_res_domain_attr attr = {
- .comp_mask = 0,
- };
-
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx,
- txq->rd,
- &attr));
- }
- for (i = 0; (i != elemof(txq->mp2mr)); ++i) {
- if (txq->mp2mr[i].mp == NULL)
- break;
- assert(txq->mp2mr[i].mr != NULL);
- claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
- }
- memset(txq, 0, sizeof(*txq));
-}
-
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx4_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
- *
- * @param txq
- * Pointer to TX queue structure.
- *
- * @return
- * 0 on success, -1 on failure.
- */
-static int
-txq_complete(struct txq *txq)
-{
- unsigned int elts_comp = txq->elts_comp;
- unsigned int elts_tail = txq->elts_tail;
- const unsigned int elts_n = txq->elts_n;
- int wcs_n;
-
- if (unlikely(elts_comp == 0))
- return 0;
-#ifdef DEBUG_SEND
- DEBUG("%p: processing %u work requests completions",
- (void *)txq, elts_comp);
-#endif
- wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
- if (unlikely(wcs_n == 0))
- return 0;
- if (unlikely(wcs_n < 0)) {
- DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
- (void *)txq, wcs_n);
- return -1;
- }
- elts_comp -= wcs_n;
- assert(elts_comp <= txq->elts_comp);
- /*
- * Assume WC status is successful as nothing can be done about it
- * anyway.
- */
- elts_tail += wcs_n * txq->elts_comp_cd_init;
- if (elts_tail >= elts_n)
- elts_tail -= elts_n;
- txq->elts_tail = elts_tail;
- txq->elts_comp = elts_comp;
- return 0;
-}
-
-struct mlx4_check_mempool_data {
- int ret;
- char *start;
- char *end;
-};
-
-/* Called by mlx4_check_mempool() when iterating the memory chunks. */
-static void mlx4_check_mempool_cb(struct rte_mempool *mp,
- void *opaque, struct rte_mempool_memhdr *memhdr,
- unsigned mem_idx)
-{
- struct mlx4_check_mempool_data *data = opaque;
-
- (void)mp;
- (void)mem_idx;
-
- /* It already failed, skip the next chunks. */
- if (data->ret != 0)
- return;
- /* It is the first chunk. */
- if (data->start == NULL && data->end == NULL) {
- data->start = memhdr->addr;
- data->end = data->start + memhdr->len;
- return;
- }
- if (data->end == memhdr->addr) {
- data->end += memhdr->len;
- return;
- }
- if (data->start == (char *)memhdr->addr + memhdr->len) {
- data->start -= memhdr->len;
- return;
- }
- /* Error, mempool is not virtually contigous. */
- data->ret = -1;
-}
-
-/**
- * Check if a mempool can be used: it must be virtually contiguous.
- *
- * @param[in] mp
- * Pointer to memory pool.
- * @param[out] start
- * Pointer to the start address of the mempool virtual memory area
- * @param[out] end
- * Pointer to the end address of the mempool virtual memory area
- *
- * @return
- * 0 on success (mempool is virtually contiguous), -1 on error.
- */
-static int mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start,
- uintptr_t *end)
-{
- struct mlx4_check_mempool_data data;
-
- memset(&data, 0, sizeof(data));
- rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &data);
- *start = (uintptr_t)data.start;
- *end = (uintptr_t)data.end;
-
- return data.ret;
-}
-
-/* For best performance, this function should not be inlined. */
-static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
- __rte_noinline;
-
-/**
- * Register mempool as a memory region.
- *
- * @param pd
- * Pointer to protection domain.
- * @param mp
- * Pointer to memory pool.
- *
- * @return
- * Memory region pointer, NULL in case of error.
- */
-static struct ibv_mr *
-mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
-{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- uintptr_t start;
- uintptr_t end;
- unsigned int i;
-
- if (mlx4_check_mempool(mp, &start, &end) != 0) {
- ERROR("mempool %p: not virtually contiguous",
- (void *)mp);
- return NULL;
- }
-
- DEBUG("mempool %p area start=%p end=%p size=%zu",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- /* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
-
- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
- }
- DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- return ibv_reg_mr(pd,
- (void *)start,
- end - start,
- IBV_ACCESS_LOCAL_WRITE);
-}
-
-/**
- * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- * Pointer to mbuf.
- *
- * @return
- * Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-txq_mb2mp(struct rte_mbuf *buf)
-{
- if (unlikely(RTE_MBUF_INDIRECT(buf)))
- return rte_mbuf_from_indirect(buf)->pool;
- return buf->pool;
-}
-
-/**
- * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- * Pointer to TX queue structure.
- * @param[in] mp
- * Memory Pool for which a Memory Region lkey must be returned.
- *
- * @return
- * mr->lkey on success, (uint32_t)-1 on failure.
- */
-static uint32_t
-txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
- unsigned int i;
- struct ibv_mr *mr;
-
- for (i = 0; (i != elemof(txq->mp2mr)); ++i) {
- if (unlikely(txq->mp2mr[i].mp == NULL)) {
- /* Unknown MP, add a new MR for it. */
- break;
- }
- if (txq->mp2mr[i].mp == mp) {
- assert(txq->mp2mr[i].lkey != (uint32_t)-1);
- assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
- }
- }
- /* Add a new entry, register MR first. */
- DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx4_mp2mr(txq->priv->pd, mp);
- if (unlikely(mr == NULL)) {
- DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
- return (uint32_t)-1;
- }
- if (unlikely(i == elemof(txq->mp2mr))) {
- /* Table is full, remove oldest entry. */
- DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
- --i;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
- }
- /* Store the new entry. */
- txq->mp2mr[i].mp = mp;
- txq->mp2mr[i].mr = mr;
- txq->mp2mr[i].lkey = mr->lkey;
- DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
-}
-
-struct txq_mp2mr_mbuf_check_data {
- int ret;
-};
-
-/**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
- *
- * @param[in] mp
- * The mempool pointer
- * @param[in] arg
- * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
- * return value.
- * @param[in] obj
- * Object address.
- * @param index
- * Object index, unused.
- */
-static void
-txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
- uint32_t index __rte_unused)
-{
- struct txq_mp2mr_mbuf_check_data *data = arg;
- struct rte_mbuf *buf = obj;
-
- /* Check whether mbuf structure fits element size and whether mempool
- * pointer is valid. */
- if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
- data->ret = -1;
-}
-
-/**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a TX queue.
- *
- * @param[in] mp
- * Memory Pool to register.
- * @param *arg
- * Pointer to TX queue structure.
- */
-static void
-txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
- struct txq *txq = arg;
- struct txq_mp2mr_mbuf_check_data data = {
- .ret = 0,
- };
-
- /* Register mempool only if the first element looks like a mbuf. */
- if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
- data.ret == -1)
- return;
- txq_mp2mr(txq, mp);
-}
-
-#if MLX4_PMD_SGE_WR_N > 1
-
-/**
- * Copy scattered mbuf contents to a single linear buffer.
- *
- * @param[out] linear
- * Linear output buffer.
- * @param[in] buf
- * Scattered input buffer.
- *
- * @return
- * Number of bytes copied to the output buffer or 0 if not large enough.
- */
-static unsigned int
-linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
-{
- unsigned int size = 0;
- unsigned int offset;
-
- do {
- unsigned int len = DATA_LEN(buf);
-
- offset = size;
- size += len;
- if (unlikely(size > sizeof(*linear)))
- return 0;
- memcpy(&(*linear)[offset],
- rte_pktmbuf_mtod(buf, uint8_t *),
- len);
- buf = NEXT(buf);
- } while (buf != NULL);
- return size;
-}
-
-/**
- * Handle scattered buffers for mlx4_tx_burst().
- *
- * @param txq
- * TX queue structure.
- * @param segs
- * Number of segments in buf.
- * @param elt
- * TX queue element to fill.
- * @param[in] buf
- * Buffer to process.
- * @param elts_head
- * Index of the linear buffer to use if necessary (normally txq->elts_head).
- * @param[out] sges
- * Array filled with SGEs on success.
- *
- * @return
- * A structure containing the processed packet size in bytes and the
- * number of SGEs. Both fields are set to (unsigned int)-1 in case of
- * failure.
- */
-static struct tx_burst_sg_ret {
- unsigned int length;
- unsigned int num;
-}
-tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
- struct rte_mbuf *buf, unsigned int elts_head,
- struct ibv_sge (*sges)[MLX4_PMD_SGE_WR_N])
-{
- unsigned int sent_size = 0;
- unsigned int j;
- int linearize = 0;
-
- /* When there are too many segments, extra segments are
- * linearized in the last SGE. */
- if (unlikely(segs > elemof(*sges))) {
- segs = (elemof(*sges) - 1);
- linearize = 1;
- }
- /* Update element. */
- elt->buf = buf;
- /* Register segments as SGEs. */
- for (j = 0; (j != segs); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- uint32_t lkey;
-
- /* Retrieve Memory Region key for this memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR association",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* Update SGE. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- if (txq->priv->vf)
- rte_prefetch0((volatile void *)
- (uintptr_t)sge->addr);
- sge->length = DATA_LEN(buf);
- sge->lkey = lkey;
- sent_size += sge->length;
- buf = NEXT(buf);
- }
- /* If buf is not NULL here and is not going to be linearized,
- * nb_segs is not valid. */
- assert(j == segs);
- assert((buf == NULL) || (linearize));
- /* Linearize extra segments. */
- if (linearize) {
- struct ibv_sge *sge = &(*sges)[segs];
- linear_t *linear = &(*txq->elts_linear)[elts_head];
- unsigned int size = linearize_mbuf(linear, buf);
-
- assert(segs == (elemof(*sges) - 1));
- if (size == 0) {
- /* Invalid packet. */
- DEBUG("%p: packet too large to be linearized.",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* If MLX4_PMD_SGE_WR_N is 1, free mbuf immediately. */
- if (elemof(*sges) == 1) {
- do {
- struct rte_mbuf *next = NEXT(buf);
-
- rte_pktmbuf_free_seg(buf);
- buf = next;
- } while (buf != NULL);
- elt->buf = NULL;
- }
- /* Update SGE. */
- sge->addr = (uintptr_t)&(*linear)[0];
- sge->length = size;
- sge->lkey = txq->mr_linear->lkey;
- sent_size += size;
- /* Include last segment. */
- segs++;
- }
- return (struct tx_burst_sg_ret){
- .length = sent_size,
- .num = segs,
- };
-stop:
- return (struct tx_burst_sg_ret){
- .length = -1,
- .num = -1,
- };
-}
-
-#endif /* MLX4_PMD_SGE_WR_N > 1 */
-
-/**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- * Generic pointer to TX queue structure.
- * @param[in] pkts
- * Packets to transmit.
- * @param pkts_n
- * Number of packets in array.
- *
- * @return
- * Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct txq *txq = (struct txq *)dpdk_txq;
- unsigned int elts_head = txq->elts_head;
- const unsigned int elts_n = txq->elts_n;
- unsigned int elts_comp_cd = txq->elts_comp_cd;
- unsigned int elts_comp = 0;
- unsigned int i;
- unsigned int max;
- int err;
-
- assert(elts_comp_cd != 0);
- txq_complete(txq);
- max = (elts_n - (elts_head - txq->elts_tail));
- if (max > elts_n)
- max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf = pkts[i];
- unsigned int elts_head_next =
- (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
- struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
- struct txq_elt *elt = &(*txq->elts)[elts_head];
- unsigned int segs = NB_SEGS(buf);
-#ifdef MLX4_PMD_SOFT_COUNTERS
- unsigned int sent_size = 0;
-#endif
- uint32_t send_flags = 0;
-
- /* Clean up old buffer. */
- if (likely(elt->buf != NULL)) {
- struct rte_mbuf *tmp = elt->buf;
-
-#ifndef NDEBUG
- /* Poisoning. */
- memset(elt, 0x66, sizeof(*elt));
-#endif
- /* Faster than rte_pktmbuf_free(). */
- do {
- struct rte_mbuf *next = NEXT(tmp);
-
- rte_pktmbuf_free_seg(tmp);
- tmp = next;
- } while (tmp != NULL);
- }
- /* Request TX completion. */
- if (unlikely(--elts_comp_cd == 0)) {
- elts_comp_cd = txq->elts_comp_cd_init;
- ++elts_comp;
- send_flags |= IBV_EXP_QP_BURST_SIGNALED;
- }
- /* Should we enable HW CKSUM offload */
- if (buf->ol_flags &
- (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
- send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
- /* HW does not support checksum offloads at arbitrary
- * offsets but automatically recognizes the packet
- * type. For inner L3/L4 checksums, only VXLAN (UDP)
- * tunnels are currently supported. */
- if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
- send_flags |= IBV_EXP_QP_BURST_TUNNEL;
- }
- if (likely(segs == 1)) {
- uintptr_t addr;
- uint32_t length;
- uint32_t lkey;
-
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Retrieve Memory Region key for this memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* Update element. */
- elt->buf = buf;
- if (txq->priv->vf)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
- RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
- /* Put packet into send queue. */
-#if MLX4_PMD_MAX_INLINE > 0
- if (length <= txq->max_inline)
- err = txq->if_qp->send_pending_inline
- (txq->qp,
- (void *)addr,
- length,
- send_flags);
- else
-#endif
- err = txq->if_qp->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- if (unlikely(err))
- goto stop;
-#ifdef MLX4_PMD_SOFT_COUNTERS
- sent_size += length;
-#endif
- } else {
-#if MLX4_PMD_SGE_WR_N > 1
- struct ibv_sge sges[MLX4_PMD_SGE_WR_N];
- struct tx_burst_sg_ret ret;
-
- ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
- &sges);
- if (ret.length == (unsigned int)-1)
- goto stop;
- RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
- /* Put SG list into send queue. */
- err = txq->if_qp->send_pending_sg_list
- (txq->qp,
- sges,
- ret.num,
- send_flags);
- if (unlikely(err))
- goto stop;
-#ifdef MLX4_PMD_SOFT_COUNTERS
- sent_size += ret.length;
-#endif
-#else /* MLX4_PMD_SGE_WR_N > 1 */
- DEBUG("%p: TX scattered buffers support not"
- " compiled in", (void *)txq);
- goto stop;
-#endif /* MLX4_PMD_SGE_WR_N > 1 */
- }
- elts_head = elts_head_next;
-#ifdef MLX4_PMD_SOFT_COUNTERS
- /* Increment sent bytes counter. */
- txq->stats.obytes += sent_size;
-#endif
- }
-stop:
- /* Take a shortcut if nothing must be sent. */
- if (unlikely(i == 0))
- return 0;
-#ifdef MLX4_PMD_SOFT_COUNTERS
- /* Increment sent packets counter. */
- txq->stats.opackets += i;
-#endif
- /* Ring QP doorbell. */
- err = txq->if_qp->send_flush(txq->qp);
- if (unlikely(err)) {
- /* A nonzero value is not supposed to be returned.
- * Nothing can be done about it. */
- DEBUG("%p: send_flush() failed with error %d",
- (void *)txq, err);
- }
- txq->elts_head = elts_head;
- txq->elts_comp += elts_comp;
- txq->elts_comp_cd = elts_comp_cd;
- return i;
-}
-
-/**
- * DPDK callback for TX in secondary processes.
- *
- * This function configures all queues from primary process information
- * if necessary before reverting to the normal TX burst callback.
- *
- * @param dpdk_txq
- * Generic pointer to TX queue structure.
- * @param[in] pkts
- * Packets to transmit.
- * @param pkts_n
- * Number of packets in array.
- *
- * @return
- * Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-mlx4_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
- uint16_t pkts_n)
-{
- struct txq *txq = dpdk_txq;
- struct priv *priv = mlx4_secondary_data_setup(txq->priv);
- struct priv *primary_priv;
- unsigned int index;
-
- if (priv == NULL)
- return 0;
- primary_priv =
- mlx4_secondary_data[priv->dev->data->port_id].primary_priv;
- /* Look for queue index in both private structures. */
- for (index = 0; index != priv->txqs_n; ++index)
- if (((*primary_priv->txqs)[index] == txq) ||
- ((*priv->txqs)[index] == txq))
- break;
- if (index == priv->txqs_n)
- return 0;
- txq = (*priv->txqs)[index];
- return priv->dev->tx_pkt_burst(txq, pkts, pkts_n);
-}
-
-/**
- * Configure a TX queue.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param txq
- * Pointer to TX queue structure.
- * @param desc
- * Number of descriptors to configure in queue.
- * @param socket
- * NUMA socket on which memory must be allocated.
- * @param[in] conf
- * Thresholds parameters.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
- unsigned int socket, const struct rte_eth_txconf *conf)
-{
- struct priv *priv = mlx4_get_priv(dev);
- struct txq tmpl = {
- .priv = priv,
- .socket = socket
- };
- union {
- struct ibv_exp_query_intf_params params;
- struct ibv_exp_qp_init_attr init;
- struct ibv_exp_res_domain_init_attr rd;
- struct ibv_exp_cq_init_attr cq;
- struct ibv_exp_qp_attr mod;
- } attr;
- enum ibv_exp_query_intf_status status;
- int ret = 0;
-
- (void)conf; /* Thresholds configuration (ignored). */
- if (priv == NULL)
- return EINVAL;
- if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of TX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N);
- return EINVAL;
- }
- desc /= MLX4_PMD_SGE_WR_N;
- /* MRs will be registered in mp2mr[] later. */
- attr.rd = (struct ibv_exp_res_domain_init_attr){
- .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
- IBV_EXP_RES_DOMAIN_MSG_MODEL),
- .thread_model = IBV_EXP_THREAD_SINGLE,
- .msg_model = IBV_EXP_MSG_HIGH_BW,
- };
- tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
- if (tmpl.rd == NULL) {
- ret = ENOMEM;
- ERROR("%p: RD creation failure: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- attr.cq = (struct ibv_exp_cq_init_attr){
- .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
- .res_domain = tmpl.rd,
- };
- tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
- if (tmpl.cq == NULL) {
- ret = ENOMEM;
- ERROR("%p: CQ creation failure: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- DEBUG("priv->device_attr.max_qp_wr is %d",
- priv->device_attr.max_qp_wr);
- DEBUG("priv->device_attr.max_sge is %d",
- priv->device_attr.max_sge);
- attr.init = (struct ibv_exp_qp_init_attr){
- /* CQ to be associated with the send queue. */
- .send_cq = tmpl.cq,
- /* CQ to be associated with the receive queue. */
- .recv_cq = tmpl.cq,
- .cap = {
- /* Max number of outstanding WRs. */
- .max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
- priv->device_attr.max_qp_wr :
- desc),
- /* Max number of scatter/gather elements in a WR. */
- .max_send_sge = ((priv->device_attr.max_sge <
- MLX4_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX4_PMD_SGE_WR_N),
-#if MLX4_PMD_MAX_INLINE > 0
- .max_inline_data = MLX4_PMD_MAX_INLINE,
-#endif
- },
- .qp_type = IBV_QPT_RAW_PACKET,
- /* Do *NOT* enable this, completions events are managed per
- * TX burst. */
- .sq_sig_all = 0,
- .pd = priv->pd,
- .res_domain = tmpl.rd,
- .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
- IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
- };
- tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
- if (tmpl.qp == NULL) {
- ret = (errno ? errno : EINVAL);
- ERROR("%p: QP creation failure: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
-#if MLX4_PMD_MAX_INLINE > 0
- /* ibv_create_qp() updates this value. */
- tmpl.max_inline = attr.init.cap.max_inline_data;
-#endif
- attr.mod = (struct ibv_exp_qp_attr){
- /* Move the QP to this state. */
- .qp_state = IBV_QPS_INIT,
- /* Primary port number. */
- .port_num = priv->port
- };
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
- (IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
- if (ret) {
- ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- ret = txq_alloc_elts(&tmpl, desc);
- if (ret) {
- ERROR("%p: TXQ allocation failed: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- attr.mod = (struct ibv_exp_qp_attr){
- .qp_state = IBV_QPS_RTR
- };
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
- if (ret) {
- ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- attr.mod.qp_state = IBV_QPS_RTS;
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
- if (ret) {
- ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- attr.params = (struct ibv_exp_query_intf_params){
- .intf_scope = IBV_EXP_INTF_GLOBAL,
- .intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.cq,
- };
- tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
- if (tmpl.if_cq == NULL) {
- ERROR("%p: CQ interface family query failed with status %d",
- (void *)dev, status);
- goto error;
- }
- attr.params = (struct ibv_exp_query_intf_params){
- .intf_scope = IBV_EXP_INTF_GLOBAL,
- .intf = IBV_EXP_INTF_QP_BURST,
- .obj = tmpl.qp,
-#ifdef HAVE_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK
- /* MC loopback must be disabled when not using a VF. */
- .family_flags =
- (!priv->vf ?
- IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK :
- 0),
-#endif
- };
- tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
- if (tmpl.if_qp == NULL) {
- ERROR("%p: QP interface family query failed with status %d",
- (void *)dev, status);
- goto error;
- }
- /* Clean up txq in case we're reinitializing it. */
- DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
- txq_cleanup(txq);
- *txq = tmpl;
- DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
- /* Pre-register known mempools. */
- rte_mempool_walk(txq_mp2mr_iter, txq);
- assert(ret == 0);
- return 0;
-error:
- txq_cleanup(&tmpl);
- assert(ret > 0);
- return ret;
-}
-
-/**
- * DPDK callback to configure a TX queue.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param idx
- * TX queue index.
- * @param desc
- * Number of descriptors to configure in queue.
- * @param socket
- * NUMA socket on which memory must be allocated.
- * @param[in] conf
- * Thresholds parameters.
- *
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
- unsigned int socket, const struct rte_eth_txconf *conf)
-{
- struct priv *priv = dev->data->dev_private;
- struct txq *txq = (*priv->txqs)[idx];
- int ret;
-
- if (mlx4_is_secondary())
- return -E_RTE_SECONDARY;
- priv_lock(priv);
- DEBUG("%p: configuring queue %u for %u descriptors",
- (void *)dev, idx, desc);
- if (idx >= priv->txqs_n) {
- ERROR("%p: queue index out of range (%u >= %u)",
- (void *)dev, idx, priv->txqs_n);
- priv_unlock(priv);
- return -EOVERFLOW;
- }
- if (txq != NULL) {
- DEBUG("%p: reusing already allocated queue index %u (%p)",
- (void *)dev, idx, (void *)txq);
- if (priv->started) {
- priv_unlock(priv);
- return -EEXIST;
- }
- (*priv->txqs)[idx] = NULL;
- txq_cleanup(txq);
- } else {
- txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket);
- if (txq == NULL) {
- ERROR("%p: unable to allocate queue index %u",
- (void *)dev, idx);
- priv_unlock(priv);
- return -ENOMEM;
- }
- }
- ret = txq_setup(dev, txq, desc, socket, conf);
- if (ret)
- rte_free(txq);
- else {
- txq->stats.idx = idx;
- DEBUG("%p: adding TX queue %p to list",
- (void *)dev, (void *)txq);
- (*priv->txqs)[idx] = txq;
- /* Update send callback. */
- dev->tx_pkt_burst = mlx4_tx_burst;
- }
- priv_unlock(priv);
- return -ret;
-}
-
-/**
- * DPDK callback to release a TX queue.
- *
- * @param dpdk_txq
- * Generic TX queue pointer.
- */
-static void
-mlx4_tx_queue_release(void *dpdk_txq)
-{
- struct txq *txq = (struct txq *)dpdk_txq;
- struct priv *priv;
- unsigned int i;
-
- if (mlx4_is_secondary())
- return;
- if (txq == NULL)
- return;
- priv = txq->priv;
- priv_lock(priv);
- for (i = 0; (i != priv->txqs_n); ++i)
- if ((*priv->txqs)[i] == txq) {
- DEBUG("%p: removing TX queue %p from list",
- (void *)priv->dev, (void *)txq);
- (*priv->txqs)[i] = NULL;
- break;
- }
- txq_cleanup(txq);
- rte_free(txq);
- priv_unlock(priv);
-}
-
-/* RX queues handling. */
-
-/**
- * Allocate RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param elts_n
- * Number of elements to allocate.
- * @param[in] pool
- * If not NULL, fetch buffers from this array instead of allocating them
- * with rte_pktmbuf_alloc().
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
- struct rte_mbuf **pool)
-{
- unsigned int i;
- struct rxq_elt_sp (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
- int ret = 0;
-
- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- /* For each WR (packet). */
- for (i = 0; (i != elts_n); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
- struct ibv_recv_wr *wr = &elt->wr;
- struct ibv_sge (*sges)[(elemof(elt->sges))] = &elt->sges;
-
- /* These two arrays must have the same size. */
- assert(elemof(elt->sges) == elemof(elt->bufs));
- /* Configure WR. */
- wr->wr_id = i;
- wr->next = &(*elts)[(i + 1)].wr;
- wr->sg_list = &(*sges)[0];
- wr->num_sge = elemof(*sges);
- /* For each SGE (segment). */
- for (j = 0; (j != elemof(elt->bufs)); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- struct rte_mbuf *buf;
-
- if (pool != NULL) {
- buf = *(pool++);
- assert(buf != NULL);
- rte_pktmbuf_reset(buf);
- } else
- buf = rte_pktmbuf_alloc(rxq->mp);
- if (buf == NULL) {
- assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- elt->bufs[j] = buf;
- /* Headroom is reserved by rte_pktmbuf_alloc(). */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- /* Buffer is supposed to be empty. */
- assert(rte_pktmbuf_data_len(buf) == 0);
- assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- if (j == 0) {
- /* The first SGE keeps its headroom. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- sge->length = (buf->buf_len -
- RTE_PKTMBUF_HEADROOM);
- } else {
- /* Subsequent SGEs lose theirs. */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- SET_DATA_OFF(buf, 0);
- sge->addr = (uintptr_t)buf->buf_addr;
- sge->length = buf->buf_len;
- }
- sge->lkey = rxq->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
- }
- }
- /* The last WR pointer must be NULL. */
- (*elts)[(i - 1)].wr.next = NULL;
- DEBUG("%p: allocated and configured %u WRs (%zu segments)",
- (void *)rxq, elts_n, (elts_n * elemof((*elts)[0].sges)));
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts.sp = elts;
- assert(ret == 0);
- return 0;
-error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != elemof(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != elemof(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
- }
- DEBUG("%p: failed, freed everything", (void *)rxq);
- assert(ret > 0);
- return ret;
-}
-
-/**
- * Free RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_free_elts_sp(struct rxq *rxq)
-{
- unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
-
- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts.sp = NULL;
- if (elts == NULL)
- return;
- for (i = 0; (i != elemof(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != elemof(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
-}
-
-/**
- * Allocate RX queue elements.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param elts_n
- * Number of elements to allocate.
- * @param[in] pool
- * If not NULL, fetch buffers from this array instead of allocating them
- * with rte_pktmbuf_alloc().
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
-{
- unsigned int i;
- struct rxq_elt (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
- int ret = 0;
-
- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- /* For each WR (packet). */
- for (i = 0; (i != elts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct ibv_recv_wr *wr = &elt->wr;
- struct ibv_sge *sge = &(*elts)[i].sge;
- struct rte_mbuf *buf;
-
- if (pool != NULL) {
- buf = *(pool++);
- assert(buf != NULL);
- rte_pktmbuf_reset(buf);
- } else
- buf = rte_pktmbuf_alloc(rxq->mp);
- if (buf == NULL) {
- assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- /* Configure WR. Work request ID contains its own index in
- * the elts array and the offset between SGE buffer header and
- * its data. */
- WR_ID(wr->wr_id).id = i;
- WR_ID(wr->wr_id).offset =
- (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) -
- (uintptr_t)buf);
- wr->next = &(*elts)[(i + 1)].wr;
- wr->sg_list = sge;
- wr->num_sge = 1;
- /* Headroom is reserved by rte_pktmbuf_alloc(). */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- /* Buffer is supposed to be empty. */
- assert(rte_pktmbuf_data_len(buf) == 0);
- assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- /* SGE keeps its headroom. */
- sge->addr = (uintptr_t)
- ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
- sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
- /* Make sure elts index and SGE mbuf pointer can be deduced
- * from WR ID. */
- if ((WR_ID(wr->wr_id).id != i) ||
- ((void *)((uintptr_t)sge->addr -
- WR_ID(wr->wr_id).offset) != buf)) {
- ERROR("%p: cannot store index and offset in WR ID",
- (void *)rxq);
- sge->addr = 0;
- rte_pktmbuf_free(buf);
- ret = EOVERFLOW;
- goto error;
- }
- }
- /* The last WR pointer must be NULL. */
- (*elts)[(i - 1)].wr.next = NULL;
- DEBUG("%p: allocated and configured %u single-segment WRs",
- (void *)rxq, elts_n);
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts.no_sp = elts;
- assert(ret == 0);
- return 0;
-error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != elemof(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf;
-
- if (elt->sge.addr == 0)
- continue;
- assert(WR_ID(elt->wr.wr_id).id == i);
- buf = (void *)((uintptr_t)elt->sge.addr -
- WR_ID(elt->wr.wr_id).offset);
- rte_pktmbuf_free_seg(buf);
- }
- rte_free(elts);
- }
- DEBUG("%p: failed, freed everything", (void *)rxq);
- assert(ret > 0);
- return ret;
-}
-
-/**
- * Free RX queue elements.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_free_elts(struct rxq *rxq)
-{
- unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
-
- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts.no_sp = NULL;
- if (elts == NULL)
- return;
- for (i = 0; (i != elemof(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf;
-
- if (elt->sge.addr == 0)
- continue;
- assert(WR_ID(elt->wr.wr_id).id == i);
- buf = (void *)((uintptr_t)elt->sge.addr -
- WR_ID(elt->wr.wr_id).offset);
- rte_pktmbuf_free_seg(buf);
- }
- rte_free(elts);
-}
-
-/**
- * Delete flow steering rule.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param mac_index
- * MAC address index.
- * @param vlan_index
- * VLAN index.
- */
-static void
-rxq_del_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index)
-{
-#ifndef NDEBUG
- struct priv *priv = rxq->priv;
- const uint8_t (*mac)[ETHER_ADDR_LEN] =
- (const uint8_t (*)[ETHER_ADDR_LEN])
- priv->mac[mac_index].addr_bytes;
-#endif
- assert(rxq->mac_flow[mac_index][vlan_index] != NULL);
- DEBUG("%p: removing MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u"
- " (VLAN ID %" PRIu16 ")",
- (void *)rxq,
- (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5],
- mac_index, priv->vlan_filter[vlan_index].id);
- claim_zero(ibv_destroy_flow(rxq->mac_flow[mac_index][vlan_index]));
- rxq->mac_flow[mac_index][vlan_index] = NULL;
-}
-
-/**
- * Unregister a MAC address from a RX queue.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param mac_index
- * MAC address index.
- */
-static void
-rxq_mac_addr_del(struct rxq *rxq, unsigned int mac_index)
-{
- struct priv *priv = rxq->priv;
- unsigned int i;
- unsigned int vlans = 0;
-
- assert(mac_index < elemof(priv->mac));
- if (!BITFIELD_ISSET(rxq->mac_configured, mac_index))
- return;
- for (i = 0; (i != elemof(priv->vlan_filter)); ++i) {
- if (!priv->vlan_filter[i].enabled)
- continue;
- rxq_del_flow(rxq, mac_index, i);
- vlans++;
- }
- if (!vlans) {
- rxq_del_flow(rxq, mac_index, 0);
- }
- BITFIELD_RESET(rxq->mac_configured, mac_index);
-}
-
-/**
- * Unregister all MAC addresses from a RX queue.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_mac_addrs_del(struct rxq *rxq)
-{
- struct priv *priv = rxq->priv;
- unsigned int i;
-
- for (i = 0; (i != elemof(priv->mac)); ++i)
- rxq_mac_addr_del(rxq, i);
-}
-
-static int rxq_promiscuous_enable(struct rxq *);
-static void rxq_promiscuous_disable(struct rxq *);
-
-/**
- * Add single flow steering rule.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param mac_index
- * MAC address index to register.
- * @param vlan_index
- * VLAN index. Use -1 for a flow without VLAN.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_add_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index)
-{
- struct ibv_flow *flow;
- struct priv *priv = rxq->priv;
- const uint8_t (*mac)[ETHER_ADDR_LEN] =
- (const uint8_t (*)[ETHER_ADDR_LEN])
- priv->mac[mac_index].addr_bytes;
-
- /* Allocate flow specification on the stack. */
- struct __attribute__((packed)) {
- struct ibv_flow_attr attr;
- struct ibv_flow_spec_eth spec;
- } data;
- struct ibv_flow_attr *attr = &data.attr;
- struct ibv_flow_spec_eth *spec = &data.spec;
-
- assert(mac_index < elemof(priv->mac));
- assert((vlan_index < elemof(priv->vlan_filter)) || (vlan_index == -1u));
- /*
- * No padding must be inserted by the compiler between attr and spec.
- * This layout is expected by libibverbs.
- */
- assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec);
- *attr = (struct ibv_flow_attr){
- .type = IBV_FLOW_ATTR_NORMAL,
- .priority = 3,
- .num_of_specs = 1,
- .port = priv->port,
- .flags = 0
- };
- *spec = (struct ibv_flow_spec_eth){
- .type = IBV_FLOW_SPEC_ETH,
- .size = sizeof(*spec),
- .val = {
- .dst_mac = {
- (*mac)[0], (*mac)[1], (*mac)[2],
- (*mac)[3], (*mac)[4], (*mac)[5]
- },
- .vlan_tag = ((vlan_index != -1u) ?
- htons(priv->vlan_filter[vlan_index].id) :
- 0),
- },
- .mask = {
- .dst_mac = "\xff\xff\xff\xff\xff\xff",
- .vlan_tag = ((vlan_index != -1u) ? htons(0xfff) : 0),
- }
- };
- DEBUG("%p: adding MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u"
- " (VLAN %s %" PRIu16 ")",
- (void *)rxq,
- (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5],
- mac_index,
- ((vlan_index != -1u) ? "ID" : "index"),
- ((vlan_index != -1u) ? priv->vlan_filter[vlan_index].id : -1u));
- /* Create related flow. */
- errno = 0;
- flow = ibv_create_flow(rxq->qp, attr);
- if (flow == NULL) {
- /* It's not clear whether errno is always set in this case. */
- ERROR("%p: flow configuration failed, errno=%d: %s",
- (void *)rxq, errno,
- (errno ? strerror(errno) : "Unknown error"));
- if (errno)
- return errno;
- return EINVAL;
- }
- if (vlan_index == -1u)
- vlan_index = 0;
- assert(rxq->mac_flow[mac_index][vlan_index] == NULL);
- rxq->mac_flow[mac_index][vlan_index] = flow;
- return 0;
-}
-
-/**
- * Register a MAC address in a RX queue.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param mac_index
- * MAC address index to register.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_mac_addr_add(struct rxq *rxq, unsigned int mac_index)
-{
- struct priv *priv = rxq->priv;
- unsigned int i;
- unsigned int vlans = 0;
- int ret;
-
- assert(mac_index < elemof(priv->mac));
- if (BITFIELD_ISSET(rxq->mac_configured, mac_index))
- rxq_mac_addr_del(rxq, mac_index);
- /* Fill VLAN specifications. */
- for (i = 0; (i != elemof(priv->vlan_filter)); ++i) {
- if (!priv->vlan_filter[i].enabled)
- continue;
- /* Create related flow. */
- ret = rxq_add_flow(rxq, mac_index, i);
- if (!ret) {
- vlans++;
- continue;
- }
- /* Failure, rollback. */
- while (i != 0)
- if (priv->vlan_filter[--i].enabled)
- rxq_del_flow(rxq, mac_index, i);
- assert(ret > 0);
- return ret;
- }
- /* In case there is no VLAN filter. */
- if (!vlans) {
- ret = rxq_add_flow(rxq, mac_index, -1);
- if (ret)
- return ret;
- }
- BITFIELD_SET(rxq->mac_configured, mac_index);
- return 0;
-}
-
-/**
- * Register all MAC addresses in a RX queue.
- *
- * @param rxq
- * Pointer to RX queue structure.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_mac_addrs_add(struct rxq *rxq)
-{
- struct priv *priv = rxq->priv;
- unsigned int i;
- int ret;
-
- for (i = 0; (i != elemof(priv->mac)); ++i) {
- if (!BITFIELD_ISSET(priv->mac_configured, i))
- continue;
- ret = rxq_mac_addr_add(rxq, i);
- if (!ret)
- continue;
- /* Failure, rollback. */
- while (i != 0)
- rxq_mac_addr_del(rxq, --i);
- assert(ret > 0);
- return ret;
- }
- return 0;
-}
-
-/**
- * Unregister a MAC address.
- *
- * In RSS mode, the MAC address is unregistered from the parent queue,
- * otherwise it is unregistered from each queue directly.
- *
- * @param priv
- * Pointer to private structure.
- * @param mac_index
- * MAC address index.
- */
-static void
-priv_mac_addr_del(struct priv *priv, unsigned int mac_index)
-{
- unsigned int i;
-
- assert(!priv->isolated);
- assert(mac_index < elemof(priv->mac));
- if (!BITFIELD_ISSET(priv->mac_configured, mac_index))
- return;
- if (priv->rss) {
- rxq_mac_addr_del(LIST_FIRST(&priv->parents), mac_index);
- goto end;
- }
- for (i = 0; (i != priv->dev->data->nb_rx_queues); ++i)
- rxq_mac_addr_del((*priv->rxqs)[i], mac_index);
-end:
- BITFIELD_RESET(priv->mac_configured, mac_index);
-}
-
-/**
- * Register a MAC address.
- *
- * In RSS mode, the MAC address is registered in the parent queue,
- * otherwise it is registered in each queue directly.
- *
- * @param priv
- * Pointer to private structure.
- * @param mac_index
- * MAC address index to use.
- * @param mac
- * MAC address to register.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-priv_mac_addr_add(struct priv *priv, unsigned int mac_index,
- const uint8_t (*mac)[ETHER_ADDR_LEN])
-{
- unsigned int i;
- int ret;
-
- assert(mac_index < elemof(priv->mac));
- /* First, make sure this address isn't already configured. */
- for (i = 0; (i != elemof(priv->mac)); ++i) {
- /* Skip this index, it's going to be reconfigured. */
- if (i == mac_index)
- continue;
- if (!BITFIELD_ISSET(priv->mac_configured, i))
- continue;
- if (memcmp(priv->mac[i].addr_bytes, *mac, sizeof(*mac)))
- continue;
- /* Address already configured elsewhere, return with error. */
- return EADDRINUSE;
- }
- if (BITFIELD_ISSET(priv->mac_configured, mac_index))
- priv_mac_addr_del(priv, mac_index);
- priv->mac[mac_index] = (struct ether_addr){
- {
- (*mac)[0], (*mac)[1], (*mac)[2],
- (*mac)[3], (*mac)[4], (*mac)[5]
- }
- };
- /* If device isn't started, this is all we need to do. */
- if (!priv->started) {
-#ifndef NDEBUG
- /* Verify that all queues have this index disabled. */
- for (i = 0; (i != priv->rxqs_n); ++i) {
- if ((*priv->rxqs)[i] == NULL)
- continue;
- assert(!BITFIELD_ISSET
- ((*priv->rxqs)[i]->mac_configured, mac_index));
- }
-#endif
- goto end;
- }
- if (priv->rss) {
- ret = rxq_mac_addr_add(LIST_FIRST(&priv->parents), mac_index);
- if (ret)
- return ret;
- goto end;
- }
- for (i = 0; (i != priv->rxqs_n); ++i) {
- if ((*priv->rxqs)[i] == NULL)
- continue;
- ret = rxq_mac_addr_add((*priv->rxqs)[i], mac_index);
- if (!ret)
- continue;
- /* Failure, rollback. */
- while (i != 0)
- if ((*priv->rxqs)[(--i)] != NULL)
- rxq_mac_addr_del((*priv->rxqs)[i], mac_index);
- return ret;
- }
-end:
- BITFIELD_SET(priv->mac_configured, mac_index);
- return 0;
-}
-
-/**
- * Enable allmulti mode in a RX queue.
- *
- * @param rxq
- * Pointer to RX queue structure.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_allmulticast_enable(struct rxq *rxq)
-{
- struct ibv_flow *flow;
- struct ibv_flow_attr attr = {
- .type = IBV_FLOW_ATTR_MC_DEFAULT,
- .num_of_specs = 0,
- .port = rxq->priv->port,
- .flags = 0
- };
-
- DEBUG("%p: enabling allmulticast mode", (void *)rxq);
- if (rxq->allmulti_flow != NULL)
- return EBUSY;
- errno = 0;
- flow = ibv_create_flow(rxq->qp, &attr);
- if (flow == NULL) {
- /* It's not clear whether errno is always set in this case. */
- ERROR("%p: flow configuration failed, errno=%d: %s",
- (void *)rxq, errno,
- (errno ? strerror(errno) : "Unknown error"));
- if (errno)
- return errno;
- return EINVAL;
- }
- rxq->allmulti_flow = flow;
- DEBUG("%p: allmulticast mode enabled", (void *)rxq);
- return 0;
-}
-
-/**
- * Disable allmulti mode in a RX queue.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_allmulticast_disable(struct rxq *rxq)
-{
- DEBUG("%p: disabling allmulticast mode", (void *)rxq);
- if (rxq->allmulti_flow == NULL)
- return;
- claim_zero(ibv_destroy_flow(rxq->allmulti_flow));
- rxq->allmulti_flow = NULL;
- DEBUG("%p: allmulticast mode disabled", (void *)rxq);
-}
-
-/**
- * Enable promiscuous mode in a RX queue.
- *
- * @param rxq
- * Pointer to RX queue structure.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_promiscuous_enable(struct rxq *rxq)
-{
- struct ibv_flow *flow;
- struct ibv_flow_attr attr = {
- .type = IBV_FLOW_ATTR_ALL_DEFAULT,
- .num_of_specs = 0,
- .port = rxq->priv->port,
- .flags = 0
- };
-
- if (rxq->priv->vf)
- return 0;
- DEBUG("%p: enabling promiscuous mode", (void *)rxq);
- if (rxq->promisc_flow != NULL)
- return EBUSY;
- errno = 0;
- flow = ibv_create_flow(rxq->qp, &attr);
- if (flow == NULL) {
- /* It's not clear whether errno is always set in this case. */
- ERROR("%p: flow configuration failed, errno=%d: %s",
- (void *)rxq, errno,
- (errno ? strerror(errno) : "Unknown error"));
- if (errno)
- return errno;
- return EINVAL;
- }
- rxq->promisc_flow = flow;
- DEBUG("%p: promiscuous mode enabled", (void *)rxq);
- return 0;
-}
-
-/**
- * Disable promiscuous mode in a RX queue.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_promiscuous_disable(struct rxq *rxq)
-{
- if (rxq->priv->vf)
- return;
- DEBUG("%p: disabling promiscuous mode", (void *)rxq);
- if (rxq->promisc_flow == NULL)
- return;
- claim_zero(ibv_destroy_flow(rxq->promisc_flow));
- rxq->promisc_flow = NULL;
- DEBUG("%p: promiscuous mode disabled", (void *)rxq);
-}
-
-/**
- * Clean up a RX queue.
- *
- * Destroy objects, free allocated memory and reset the structure for reuse.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_cleanup(struct rxq *rxq)
-{
- struct ibv_exp_release_intf_params params;
-
- DEBUG("cleaning up %p", (void *)rxq);
- if (rxq->sp)
- rxq_free_elts_sp(rxq);
- else
- rxq_free_elts(rxq);
- if (rxq->if_qp != NULL) {
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- assert(rxq->qp != NULL);
- params = (struct ibv_exp_release_intf_params){
- .comp_mask = 0,
- };
- claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
- rxq->if_qp,
- &params));
- }
- if (rxq->if_cq != NULL) {
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- assert(rxq->cq != NULL);
- params = (struct ibv_exp_release_intf_params){
- .comp_mask = 0,
- };
- claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
- rxq->if_cq,
- &params));
- }
- if (rxq->qp != NULL && !rxq->priv->isolated) {
- rxq_promiscuous_disable(rxq);
- rxq_allmulticast_disable(rxq);
- rxq_mac_addrs_del(rxq);
- }
- if (rxq->qp != NULL)
- claim_zero(ibv_destroy_qp(rxq->qp));
- if (rxq->cq != NULL)
- claim_zero(ibv_destroy_cq(rxq->cq));
- if (rxq->channel != NULL)
- claim_zero(ibv_destroy_comp_channel(rxq->channel));
- if (rxq->rd != NULL) {
- struct ibv_exp_destroy_res_domain_attr attr = {
- .comp_mask = 0,
- };
-
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
- rxq->rd,
- &attr));
- }
- if (rxq->mr != NULL)
- claim_zero(ibv_dereg_mr(rxq->mr));
- memset(rxq, 0, sizeof(*rxq));
-}
-
-/**
- * Translate RX completion flags to packet type.
- *
- * @param flags
- * RX completion flags returned by poll_length_flags().
- *
- * @note: fix mlx4_dev_supported_ptypes_get() if any change here.
- *
- * @return
- * Packet type for struct rte_mbuf.
- */
-static inline uint32_t
-rxq_cq_to_pkt_type(uint32_t flags)
-{
- uint32_t pkt_type;
-
- if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
- pkt_type =
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
- RTE_PTYPE_L3_IPV4_EXT_UNKNOWN) |
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
- RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV4_PACKET,
- RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) |
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV6_PACKET,
- RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
- else
- pkt_type =
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV4_PACKET,
- RTE_PTYPE_L3_IPV4_EXT_UNKNOWN) |
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV6_PACKET,
- RTE_PTYPE_L3_IPV6_EXT_UNKNOWN);
- return pkt_type;
-}
-
-/**
- * Translate RX completion flags to offload flags.
- *
- * @param[in] rxq
- * Pointer to RX queue structure.
- * @param flags
- * RX completion flags returned by poll_length_flags().
- *
- * @return
- * Offload flags (ol_flags) for struct rte_mbuf.
- */
-static inline uint32_t
-rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
-{
- uint32_t ol_flags = 0;
-
- if (rxq->csum)
- ol_flags |=
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IP_CSUM_OK,
- PKT_RX_IP_CKSUM_GOOD) |
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
- PKT_RX_L4_CKSUM_GOOD);
- if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
- ol_flags |=
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
- PKT_RX_IP_CKSUM_GOOD) |
- TRANSPOSE(flags,
- IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
- PKT_RX_L4_CKSUM_GOOD);
- return ol_flags;
-}
-
-static uint16_t
-mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
-
-/**
- * DPDK callback for RX with scattered packets support.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_recv_wr head;
- struct ibv_recv_wr **next = &head.next;
- struct ibv_recv_wr *bad_wr;
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
-
- if (unlikely(!rxq->sp))
- return mlx4_rx_burst(dpdk_rxq, pkts, pkts_n);
- if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
- return 0;
- for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[elts_head];
- struct ibv_recv_wr *wr = &elt->wr;
- uint64_t wr_id = wr->wr_id;
- unsigned int len;
- unsigned int pkt_buf_len;
- struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
- struct rte_mbuf **pkt_buf_next = &pkt_buf;
- unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
- unsigned int j = 0;
- uint32_t flags;
-
- /* Sanity checks. */
-#ifdef NDEBUG
- (void)wr_id;
-#endif
- assert(wr_id < rxq->elts_n);
- assert(wr->sg_list == elt->sges);
- assert(wr->num_sge == elemof(elt->sges));
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
- &flags);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX4_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- /* Link completed WRs together for repost. */
- *next = wr;
- next = &wr->next;
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- len = ret;
- pkt_buf_len = len;
- /* Link completed WRs together for repost. */
- *next = wr;
- next = &wr->next;
- /*
- * Replace spent segments with new ones, concatenate and
- * return them as pkt_buf.
- */
- while (1) {
- struct ibv_sge *sge = &elt->sges[j];
- struct rte_mbuf *seg = elt->bufs[j];
- struct rte_mbuf *rep;
- unsigned int seg_tailroom;
-
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_prefetch0(seg);
- rep = rte_mbuf_raw_alloc(rxq->mp);
- if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ":"
- " can't allocate a new mbuf",
- (void *)rxq, wr_id);
- if (pkt_buf != NULL) {
- *pkt_buf_next = NULL;
- rte_pktmbuf_free(pkt_buf);
- }
- /* Increase out of memory counters. */
- ++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
- }
-#ifndef NDEBUG
- /* Poison user-modifiable fields in rep. */
- NEXT(rep) = (void *)((uintptr_t)-1);
- SET_DATA_OFF(rep, 0xdead);
- DATA_LEN(rep) = 0xd00d;
- PKT_LEN(rep) = 0xdeadd00d;
- NB_SEGS(rep) = 0x2a;
- PORT(rep) = 0x2a;
- rep->ol_flags = -1;
- /*
- * Clear special flags in mbuf to avoid
- * crashing while freeing.
- */
- rep->ol_flags &=
- ~(uint64_t)(IND_ATTACHED_MBUF |
- CTRL_MBUF_FLAG);
-#endif
- assert(rep->buf_len == seg->buf_len);
- /* Reconfigure sge to use rep instead of seg. */
- assert(sge->lkey == rxq->mr->lkey);
- sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
- elt->bufs[j] = rep;
- ++j;
- /* Update pkt_buf if it's the first segment, or link
- * seg to the previous one and update pkt_buf_next. */
- *pkt_buf_next = seg;
- pkt_buf_next = &NEXT(seg);
- /* Update seg information. */
- seg_tailroom = (seg->buf_len - seg_headroom);
- assert(sge->length == seg_tailroom);
- SET_DATA_OFF(seg, seg_headroom);
- if (likely(len <= seg_tailroom)) {
- /* Last segment. */
- DATA_LEN(seg) = len;
- PKT_LEN(seg) = len;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) ==
- seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) ==
- (seg_tailroom - len));
- break;
- }
- DATA_LEN(seg) = seg_tailroom;
- PKT_LEN(seg) = seg_tailroom;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) == seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) == 0);
- /* Fix len and clear headroom for next segments. */
- len -= seg_tailroom;
- seg_headroom = 0;
- }
- /* Update head and tail segments. */
- *pkt_buf_next = NULL;
- assert(pkt_buf != NULL);
- assert(j != 0);
- NB_SEGS(pkt_buf) = j;
- PORT(pkt_buf) = rxq->port_id;
- PKT_LEN(pkt_buf) = pkt_buf_len;
- pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
- pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-
- /* Return packet. */
- *(pkts++) = pkt_buf;
- ++pkts_ret;
-#ifdef MLX4_PMD_SOFT_COUNTERS
- /* Increase bytes counter. */
- rxq->stats.ibytes += pkt_buf_len;
-#endif
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
- }
- if (unlikely(i == 0))
- return 0;
- *next = NULL;
- /* Repost WRs. */
-#ifdef DEBUG_RECV
- DEBUG("%p: reposting %d WRs", (void *)rxq, i);
-#endif
- ret = ibv_post_recv(rxq->qp, head.next, &bad_wr);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: ibv_post_recv(): failed for WR %p: %s",
- (void *)rxq->priv,
- (void *)bad_wr,
- strerror(ret));
- abort();
- }
- rxq->elts_head = elts_head;
-#ifdef MLX4_PMD_SOFT_COUNTERS
- /* Increase packets counter. */
- rxq->stats.ipackets += pkts_ret;
-#endif
- return pkts_ret;
-}
-
-/**
- * DPDK callback for RX.
- *
- * The following function is the same as mlx4_rx_burst_sp(), except it doesn't
- * manage scattered packets. Improves performance when MRU is lower than the
- * size of the first segment.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_sge sges[pkts_n];
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
-
- if (unlikely(rxq->sp))
- return mlx4_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
- for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[elts_head];
- struct ibv_recv_wr *wr = &elt->wr;
- uint64_t wr_id = wr->wr_id;
- unsigned int len;
- struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
- WR_ID(wr_id).offset);
- struct rte_mbuf *rep;
- uint32_t flags;
-
- /* Sanity checks. */
- assert(WR_ID(wr_id).id < rxq->elts_n);
- assert(wr->sg_list == &elt->sge);
- assert(wr->num_sge == 1);
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_mbuf_prefetch_part1(seg);
- rte_mbuf_prefetch_part2(seg);
- ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
- &flags);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX4_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- len = ret;
- rep = rte_mbuf_raw_alloc(rxq->mp);
- if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p, wr_id=%" PRIu32 ":"
- " can't allocate a new mbuf",
- (void *)rxq, WR_ID(wr_id).id);
- /* Increase out of memory counters. */
- ++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
- goto repost;
- }
-
- /* Reconfigure sge to use rep instead of seg. */
- elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- assert(elt->sge.lkey == rxq->mr->lkey);
- WR_ID(wr->wr_id).offset =
- (((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) -
- (uintptr_t)rep);
- assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);
-
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
-
- /* Update seg information. */
- SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
- NB_SEGS(seg) = 1;
- PORT(seg) = rxq->port_id;
- NEXT(seg) = NULL;
- PKT_LEN(seg) = len;
- DATA_LEN(seg) = len;
- seg->packet_type = rxq_cq_to_pkt_type(flags);
- seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-
- /* Return packet. */
- *(pkts++) = seg;
- ++pkts_ret;
-#ifdef MLX4_PMD_SOFT_COUNTERS
- /* Increase bytes counter. */
- rxq->stats.ibytes += len;
-#endif
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
- }
- if (unlikely(i == 0))
- return 0;
- /* Repost WRs. */
-#ifdef DEBUG_RECV
- DEBUG("%p: reposting %u WRs", (void *)rxq, i);
-#endif
- ret = rxq->if_qp->recv_burst(rxq->qp, sges, i);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_burst(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- rxq->elts_head = elts_head;
-#ifdef MLX4_PMD_SOFT_COUNTERS
- /* Increase packets counter. */
- rxq->stats.ipackets += pkts_ret;
-#endif
- return pkts_ret;
-}
-
-/**
- * DPDK callback for RX in secondary processes.
- *
- * This function configures all queues from primary process information
- * if necessary before reverting to the normal RX burst callback.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
- uint16_t pkts_n)
-{
- struct rxq *rxq = dpdk_rxq;
- struct priv *priv = mlx4_secondary_data_setup(rxq->priv);
- struct priv *primary_priv;
- unsigned int index;
-
- if (priv == NULL)
- return 0;
- primary_priv =
- mlx4_secondary_data[priv->dev->data->port_id].primary_priv;
- /* Look for queue index in both private structures. */
- for (index = 0; index != priv->rxqs_n; ++index)
- if (((*primary_priv->rxqs)[index] == rxq) ||
- ((*priv->rxqs)[index] == rxq))
- break;
- if (index == priv->rxqs_n)
- return 0;
- rxq = (*priv->rxqs)[index];
- return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
-}
-
-/**
- * Allocate a Queue Pair.
- * Optionally setup inline receive if supported.
- *
- * @param priv
- * Pointer to private structure.
- * @param cq
- * Completion queue to associate with QP.
- * @param desc
- * Number of descriptors in QP (hint only).
- *
- * @return
- * QP pointer or NULL in case of error.
- */
-static struct ibv_qp *
-rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
- struct ibv_exp_res_domain *rd)
-{
- struct ibv_exp_qp_init_attr attr = {
- /* CQ to be associated with the send queue. */
- .send_cq = cq,
- /* CQ to be associated with the receive queue. */
- .recv_cq = cq,
- .cap = {
- /* Max number of outstanding WRs. */
- .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
- priv->device_attr.max_qp_wr :
- desc),
- /* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = ((priv->device_attr.max_sge <
- MLX4_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX4_PMD_SGE_WR_N),
- },
- .qp_type = IBV_QPT_RAW_PACKET,
- .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
- IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
- .pd = priv->pd,
- .res_domain = rd,
- };
-
-#ifdef INLINE_RECV
- attr.max_inl_recv = priv->inl_recv_size;
- attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
-#endif
- return ibv_exp_create_qp(priv->ctx, &attr);
-}
-
-#ifdef RSS_SUPPORT
-
-/**
- * Allocate a RSS Queue Pair.
- * Optionally setup inline receive if supported.
- *
- * @param priv
- * Pointer to private structure.
- * @param cq
- * Completion queue to associate with QP.
- * @param desc
- * Number of descriptors in QP (hint only).
- * @param children_n
- * If nonzero, a number of children for parent QP and zero for a child.
- * @param rxq_parent
- * Pointer for a parent in a child case, NULL otherwise.
- *
- * @return
- * QP pointer or NULL in case of error.
- */
-static struct ibv_qp *
-rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
- int children_n, struct ibv_exp_res_domain *rd,
- struct rxq *rxq_parent)
-{
- struct ibv_exp_qp_init_attr attr = {
- /* CQ to be associated with the send queue. */
- .send_cq = cq,
- /* CQ to be associated with the receive queue. */
- .recv_cq = cq,
- .cap = {
- /* Max number of outstanding WRs. */
- .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
- priv->device_attr.max_qp_wr :
- desc),
- /* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = ((priv->device_attr.max_sge <
- MLX4_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX4_PMD_SGE_WR_N),
- },
- .qp_type = IBV_QPT_RAW_PACKET,
- .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
- IBV_EXP_QP_INIT_ATTR_RES_DOMAIN |
- IBV_EXP_QP_INIT_ATTR_QPG),
- .pd = priv->pd,
- .res_domain = rd,
- };
-
-#ifdef INLINE_RECV
- attr.max_inl_recv = priv->inl_recv_size,
- attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
-#endif
- if (children_n > 0) {
- attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
- /* TSS isn't necessary. */
- attr.qpg.parent_attrib.tss_child_count = 0;
- attr.qpg.parent_attrib.rss_child_count =
- rte_align32pow2(children_n + 1) >> 1;
- DEBUG("initializing parent RSS queue");
- } else {
- attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
- attr.qpg.qpg_parent = rxq_parent->qp;
- DEBUG("initializing child RSS queue");
- }
- return ibv_exp_create_qp(priv->ctx, &attr);
-}
-
-#endif /* RSS_SUPPORT */
-
-/**
- * Reconfigure a RX queue with new parameters.
- *
- * rxq_rehash() does not allocate mbufs, which, if not done from the right
- * thread (such as a control thread), may corrupt the pool.
- * In case of failure, the queue is left untouched.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param rxq
- * RX queue pointer.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
-{
- struct priv *priv = rxq->priv;
- struct rxq tmpl = *rxq;
- unsigned int mbuf_n;
- unsigned int desc_n;
- struct rte_mbuf **pool;
- unsigned int i, k;
- struct ibv_exp_qp_attr mod;
- struct ibv_recv_wr *bad_wr;
- unsigned int mb_len;
- int err;
-
- mb_len = rte_pktmbuf_data_room_size(rxq->mp);
- DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
- /* Number of descriptors and mbufs currently allocated. */
- desc_n = (tmpl.elts_n * (tmpl.sp ? MLX4_PMD_SGE_WR_N : 1));
- mbuf_n = desc_n;
- /* Toggle RX checksum offload if hardware supports it. */
- if (priv->hw_csum) {
- tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq->csum = tmpl.csum;
- }
- if (priv->hw_csum_l2tun) {
- tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq->csum_l2tun = tmpl.csum_l2tun;
- }
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if (dev->data->dev_conf.rxmode.enable_scatter &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc_n /= MLX4_PMD_SGE_WR_N;
- } else
- tmpl.sp = 0;
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
- /* If scatter mode is the same as before, nothing to do. */
- if (tmpl.sp == rxq->sp) {
- DEBUG("%p: nothing to do", (void *)dev);
- return 0;
- }
- /* Remove attached flows if RSS is disabled (no parent queue). */
- if (!priv->rss && !priv->isolated) {
- rxq_allmulticast_disable(&tmpl);
- rxq_promiscuous_disable(&tmpl);
- rxq_mac_addrs_del(&tmpl);
- /* Update original queue in case of failure. */
- rxq->allmulti_flow = tmpl.allmulti_flow;
- rxq->promisc_flow = tmpl.promisc_flow;
- memcpy(rxq->mac_configured, tmpl.mac_configured,
- sizeof(rxq->mac_configured));
- memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow));
- }
- /* From now on, any failure will render the queue unusable.
- * Reinitialize QP. */
- if (!tmpl.qp)
- goto skip_init;
- mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RESET };
- err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
- if (err) {
- ERROR("%p: cannot reset QP: %s", (void *)dev, strerror(err));
- assert(err > 0);
- return err;
- }
- mod = (struct ibv_exp_qp_attr){
- /* Move the QP to this state. */
- .qp_state = IBV_QPS_INIT,
- /* Primary port number. */
- .port_num = priv->port
- };
- err = ibv_exp_modify_qp(tmpl.qp, &mod,
- (IBV_EXP_QP_STATE |
- IBV_EXP_QP_PORT));
- if (err) {
- ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
- (void *)dev, strerror(err));
- assert(err > 0);
- return err;
- };
-skip_init:
- err = ibv_resize_cq(tmpl.cq, desc_n);
- if (err) {
- ERROR("%p: cannot resize CQ: %s", (void *)dev, strerror(err));
- assert(err > 0);
- return err;
- }
- /* Reconfigure flows. Do not care for errors. */
- if (!priv->rss && !priv->isolated) {
- rxq_mac_addrs_add(&tmpl);
- if (priv->promisc)
- rxq_promiscuous_enable(&tmpl);
- if (priv->allmulti)
- rxq_allmulticast_enable(&tmpl);
- /* Update original queue in case of failure. */
- rxq->allmulti_flow = tmpl.allmulti_flow;
- rxq->promisc_flow = tmpl.promisc_flow;
- memcpy(rxq->mac_configured, tmpl.mac_configured,
- sizeof(rxq->mac_configured));
- memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow));
- }
- /* Allocate pool. */
- pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
- if (pool == NULL) {
- ERROR("%p: cannot allocate memory", (void *)dev);
- return ENOBUFS;
- }
- /* Snatch mbufs from original queue. */
- k = 0;
- if (rxq->sp) {
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
-
- for (i = 0; (i != elemof(*elts)); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[i];
- unsigned int j;
-
- for (j = 0; (j != elemof(elt->bufs)); ++j) {
- assert(elt->bufs[j] != NULL);
- pool[k++] = elt->bufs[j];
- }
- }
- } else {
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
-
- for (i = 0; (i != elemof(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = (void *)
- ((uintptr_t)elt->sge.addr -
- WR_ID(elt->wr.wr_id).offset);
-
- assert(WR_ID(elt->wr.wr_id).id == i);
- pool[k++] = buf;
- }
- }
- assert(k == mbuf_n);
- tmpl.elts_n = 0;
- tmpl.elts.sp = NULL;
- assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
- err = ((tmpl.sp) ?
- rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
- rxq_alloc_elts(&tmpl, desc_n, pool));
- if (err) {
- ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
- rte_free(pool);
- assert(err > 0);
- return err;
- }
- assert(tmpl.elts_n == desc_n);
- assert(tmpl.elts.sp != NULL);
- rte_free(pool);
- /* Clean up original data. */
- rxq->elts_n = 0;
- rte_free(rxq->elts.sp);
- rxq->elts.sp = NULL;
- if (!tmpl.qp)
- goto skip_rtr;
- /* Post WRs. */
- err = ibv_post_recv(tmpl.qp,
- (tmpl.sp ?
- &(*tmpl.elts.sp)[0].wr :
- &(*tmpl.elts.no_sp)[0].wr),
- &bad_wr);
- if (err) {
- ERROR("%p: ibv_post_recv() failed for WR %p: %s",
- (void *)dev,
- (void *)bad_wr,
- strerror(err));
- goto skip_rtr;
- }
- mod = (struct ibv_exp_qp_attr){
- .qp_state = IBV_QPS_RTR
- };
- err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
- if (err)
- ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
- (void *)dev, strerror(err));
-skip_rtr:
- *rxq = tmpl;
- assert(err >= 0);
- return err;
-}
-
-/**
- * Create verbs QP resources associated with a rxq.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param desc
- * Number of descriptors to configure in queue.
- * @param inactive
- * If true, the queue is disabled because its index is higher or
- * equal to the real number of queues, which must be a power of 2.
- * @param children_n
- * The number of children in a parent case, zero for a child.
- * @param rxq_parent
- * The pointer to a parent RX structure for a child in RSS case,
- * NULL for parent.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-int
-rxq_create_qp(struct rxq *rxq,
- uint16_t desc,
- int inactive,
- int children_n,
- struct rxq *rxq_parent)
-{
- int ret;
- struct ibv_exp_qp_attr mod;
- struct ibv_exp_query_intf_params params;
- enum ibv_exp_query_intf_status status;
- struct ibv_recv_wr *bad_wr;
- int parent = (children_n > 0);
- struct priv *priv = rxq->priv;
-
-#ifdef RSS_SUPPORT
- if (priv->rss && !inactive && (rxq_parent || parent))
- rxq->qp = rxq_setup_qp_rss(priv, rxq->cq, desc,
- children_n, rxq->rd,
- rxq_parent);
- else
-#endif /* RSS_SUPPORT */
- rxq->qp = rxq_setup_qp(priv, rxq->cq, desc, rxq->rd);
- if (rxq->qp == NULL) {
- ret = (errno ? errno : EINVAL);
- ERROR("QP creation failure: %s",
- strerror(ret));
- return ret;
- }
- mod = (struct ibv_exp_qp_attr){
- /* Move the QP to this state. */
- .qp_state = IBV_QPS_INIT,
- /* Primary port number. */
- .port_num = priv->port
- };
- ret = ibv_exp_modify_qp(rxq->qp, &mod,
- (IBV_EXP_QP_STATE |
-#ifdef RSS_SUPPORT
- (parent ? IBV_EXP_QP_GROUP_RSS : 0) |
-#endif /* RSS_SUPPORT */
- IBV_EXP_QP_PORT));
- if (ret) {
- ERROR("QP state to IBV_QPS_INIT failed: %s",
- strerror(ret));
- return ret;
- }
- if (!priv->isolated && (parent || !priv->rss)) {
- /* Configure MAC and broadcast addresses. */
- ret = rxq_mac_addrs_add(rxq);
- if (ret) {
- ERROR("QP flow attachment failed: %s",
- strerror(ret));
- return ret;
- }
- }
- if (!parent) {
- ret = ibv_post_recv(rxq->qp,
- (rxq->sp ?
- &(*rxq->elts.sp)[0].wr :
- &(*rxq->elts.no_sp)[0].wr),
- &bad_wr);
- if (ret) {
- ERROR("ibv_post_recv() failed for WR %p: %s",
- (void *)bad_wr,
- strerror(ret));
- return ret;
- }
- }
- mod = (struct ibv_exp_qp_attr){
- .qp_state = IBV_QPS_RTR
- };
- ret = ibv_exp_modify_qp(rxq->qp, &mod, IBV_EXP_QP_STATE);
- if (ret) {
- ERROR("QP state to IBV_QPS_RTR failed: %s",
- strerror(ret));
- return ret;
- }
- params = (struct ibv_exp_query_intf_params){
- .intf_scope = IBV_EXP_INTF_GLOBAL,
- .intf = IBV_EXP_INTF_QP_BURST,
- .obj = rxq->qp,
- };
- rxq->if_qp = ibv_exp_query_intf(priv->ctx, &params, &status);
- if (rxq->if_qp == NULL) {
- ERROR("QP interface family query failed with status %d",
- status);
- return errno;
- }
- return 0;
-}
-
-/**
- * Configure a RX queue.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param rxq
- * Pointer to RX queue structure.
- * @param desc
- * Number of descriptors to configure in queue.
- * @param socket
- * NUMA socket on which memory must be allocated.
- * @param inactive
- * If true, the queue is disabled because its index is higher or
- * equal to the real number of queues, which must be a power of 2.
- * @param[in] conf
- * Thresholds parameters.
- * @param mp
- * Memory pool for buffer allocations.
- * @param children_n
- * The number of children in a parent case, zero for a child.
- * @param rxq_parent
- * The pointer to a parent RX structure (or NULL) in a child case,
- * NULL for parent.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
- unsigned int socket, int inactive,
- const struct rte_eth_rxconf *conf,
- struct rte_mempool *mp, int children_n,
- struct rxq *rxq_parent)
-{
- struct priv *priv = dev->data->dev_private;
- struct rxq tmpl = {
- .priv = priv,
- .mp = mp,
- .socket = socket
- };
- union {
- struct ibv_exp_query_intf_params params;
- struct ibv_exp_cq_init_attr cq;
- struct ibv_exp_res_domain_init_attr rd;
- } attr;
- enum ibv_exp_query_intf_status status;
- unsigned int mb_len;
- int ret = 0;
- int parent = (children_n > 0);
-
- (void)conf; /* Thresholds configuration (ignored). */
- /*
- * If this is a parent queue, hardware must support RSS and
- * RSS must be enabled.
- */
- assert((!parent) || ((priv->hw_rss) && (priv->rss)));
- if (parent) {
- /* Even if unused, ibv_create_cq() requires at least one
- * descriptor. */
- desc = 1;
- goto skip_mr;
- }
- mb_len = rte_pktmbuf_data_room_size(mp);
- if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of RX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N);
- return EINVAL;
- }
- /* Toggle RX checksum offload if hardware supports it. */
- if (priv->hw_csum)
- tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- if (priv->hw_csum_l2tun)
- tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
- (mb_len - RTE_PKTMBUF_HEADROOM)) {
- tmpl.sp = 0;
- } else if (dev->data->dev_conf.rxmode.enable_scatter) {
- tmpl.sp = 1;
- desc /= MLX4_PMD_SGE_WR_N;
- } else {
- WARN("%p: the requested maximum Rx packet size (%u) is"
- " larger than a single mbuf (%u) and scattered"
- " mode has not been requested",
- (void *)dev,
- dev->data->dev_conf.rxmode.max_rx_pkt_len,
- mb_len - RTE_PKTMBUF_HEADROOM);
- }
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
- /* Use the entire RX mempool as the memory region. */
- tmpl.mr = mlx4_mp2mr(priv->pd, mp);
- if (tmpl.mr == NULL) {
- ret = EINVAL;
- ERROR("%p: MR creation failure: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
-skip_mr:
- attr.rd = (struct ibv_exp_res_domain_init_attr){
- .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
- IBV_EXP_RES_DOMAIN_MSG_MODEL),
- .thread_model = IBV_EXP_THREAD_SINGLE,
- .msg_model = IBV_EXP_MSG_HIGH_BW,
- };
- tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
- if (tmpl.rd == NULL) {
- ret = ENOMEM;
- ERROR("%p: RD creation failure: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- if (dev->data->dev_conf.intr_conf.rxq) {
- tmpl.channel = ibv_create_comp_channel(priv->ctx);
- if (tmpl.channel == NULL) {
- ret = ENOMEM;
- ERROR("%p: Rx interrupt completion channel creation"
- " failure: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- }
- attr.cq = (struct ibv_exp_cq_init_attr){
- .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
- .res_domain = tmpl.rd,
- };
- tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, tmpl.channel, 0,
- &attr.cq);
- if (tmpl.cq == NULL) {
- ret = ENOMEM;
- ERROR("%p: CQ creation failure: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
- DEBUG("priv->device_attr.max_qp_wr is %d",
- priv->device_attr.max_qp_wr);
- DEBUG("priv->device_attr.max_sge is %d",
- priv->device_attr.max_sge);
- /* Allocate descriptors for RX queues, except for the RSS parent. */
- if (parent)
- goto skip_alloc;
- if (tmpl.sp)
- ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
- else
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
+ /* Prepare internal flow rules. */
+ ret = mlx4_flow_sync(priv, &error);
if (ret) {
- ERROR("%p: RXQ allocation failed: %s",
- (void *)dev, strerror(ret));
- return ret;
+ ERROR("cannot set up internal flow rules (code %d, \"%s\"),"
+ " flow error type %d, cause %p, message: %s",
+ -ret, strerror(-ret), error.type, error.cause,
+ error.message ? error.message : "(unspecified)");
}
-skip_alloc:
- if (parent || rxq_parent || !priv->rss) {
- ret = rxq_create_qp(&tmpl, desc, inactive,
- children_n, rxq_parent);
- if (ret)
- goto error;
- }
- /* Save port ID. */
- tmpl.port_id = dev->data->port_id;
- DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
- attr.params = (struct ibv_exp_query_intf_params){
- .intf_scope = IBV_EXP_INTF_GLOBAL,
- .intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.cq,
- };
- tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
- if (tmpl.if_cq == NULL) {
- ret = EINVAL;
- ERROR("%p: CQ interface family query failed with status %d",
- (void *)dev, status);
- goto error;
- }
- /* Clean up rxq in case we're reinitializing it. */
- DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
- rxq_cleanup(rxq);
- *rxq = tmpl;
- DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
- assert(ret == 0);
- return 0;
-error:
- rxq_cleanup(&tmpl);
- assert(ret > 0);
return ret;
}
/**
- * DPDK callback to configure a RX queue.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param idx
- * RX queue index.
- * @param desc
- * Number of descriptors to configure in queue.
- * @param socket
- * NUMA socket on which memory must be allocated.
- * @param[in] conf
- * Thresholds parameters.
- * @param mp
- * Memory pool for buffer allocations.
- *
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
- unsigned int socket, const struct rte_eth_rxconf *conf,
- struct rte_mempool *mp)
-{
- struct rxq *parent;
- struct priv *priv = dev->data->dev_private;
- struct rxq *rxq = (*priv->rxqs)[idx];
- int inactive = 0;
- int ret;
-
- if (mlx4_is_secondary())
- return -E_RTE_SECONDARY;
- priv_lock(priv);
- DEBUG("%p: configuring queue %u for %u descriptors",
- (void *)dev, idx, desc);
- if (idx >= priv->rxqs_n) {
- ERROR("%p: queue index out of range (%u >= %u)",
- (void *)dev, idx, priv->rxqs_n);
- priv_unlock(priv);
- return -EOVERFLOW;
- }
- if (rxq != NULL) {
- DEBUG("%p: reusing already allocated queue index %u (%p)",
- (void *)dev, idx, (void *)rxq);
- if (priv->started) {
- priv_unlock(priv);
- return -EEXIST;
- }
- (*priv->rxqs)[idx] = NULL;
- rxq_cleanup(rxq);
- } else {
- rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
- if (rxq == NULL) {
- ERROR("%p: unable to allocate queue index %u",
- (void *)dev, idx);
- priv_unlock(priv);
- return -ENOMEM;
- }
- }
- if (priv->rss && !priv->isolated) {
- /* The list consists of the single default one. */
- parent = LIST_FIRST(&priv->parents);
- if (idx >= rte_align32pow2(priv->rxqs_n + 1) >> 1)
- inactive = 1;
- } else {
- parent = NULL;
- }
- ret = rxq_setup(dev, rxq, desc, socket,
- inactive, conf, mp, 0, parent);
- if (ret)
- rte_free(rxq);
- else {
- rxq->stats.idx = idx;
- DEBUG("%p: adding RX queue %p to list",
- (void *)dev, (void *)rxq);
- (*priv->rxqs)[idx] = rxq;
- /* Update receive callback. */
- if (rxq->sp)
- dev->rx_pkt_burst = mlx4_rx_burst_sp;
- else
- dev->rx_pkt_burst = mlx4_rx_burst;
- }
- priv_unlock(priv);
- return -ret;
-}
-
-/**
- * DPDK callback to release a RX queue.
- *
- * @param dpdk_rxq
- * Generic RX queue pointer.
- */
-static void
-mlx4_rx_queue_release(void *dpdk_rxq)
-{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct priv *priv;
- unsigned int i;
-
- if (mlx4_is_secondary())
- return;
- if (rxq == NULL)
- return;
- priv = rxq->priv;
- priv_lock(priv);
- for (i = 0; (i != priv->rxqs_n); ++i)
- if ((*priv->rxqs)[i] == rxq) {
- DEBUG("%p: removing RX queue %p from list",
- (void *)priv->dev, (void *)rxq);
- (*priv->rxqs)[i] = NULL;
- break;
- }
- rxq_cleanup(rxq);
- rte_free(rxq);
- priv_unlock(priv);
-}
-
-static int
-priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
-
-static int
-priv_dev_removal_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
-
-static int
-priv_dev_link_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
-
-/**
* DPDK callback to start the device.
*
- * Simulate device start by attaching all configured flows.
+ * Simulate device start by initializing common RSS resources and attaching
+ * all configured flows.
*
* @param dev
* Pointer to Ethernet device structure.
*
* @return
- * 0 on success, negative errno value on failure.
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
static int
mlx4_dev_start(struct rte_eth_dev *dev)
{
struct priv *priv = dev->data->dev_private;
- unsigned int i = 0;
- unsigned int r;
- struct rxq *rxq;
+ struct rte_flow_error error;
int ret;
- if (mlx4_is_secondary())
- return -E_RTE_SECONDARY;
- priv_lock(priv);
- if (priv->started) {
- priv_unlock(priv);
+ if (priv->started)
return 0;
- }
DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
priv->started = 1;
- if (priv->isolated) {
- rxq = NULL;
- r = 1;
- } else if (priv->rss) {
- rxq = LIST_FIRST(&priv->parents);
- r = 1;
- } else {
- rxq = (*priv->rxqs)[0];
- r = priv->rxqs_n;
- }
- /* Iterate only once when RSS is enabled. */
- do {
- /* Ignore nonexistent RX queues. */
- if (rxq == NULL)
- continue;
- ret = rxq_mac_addrs_add(rxq);
- if (!ret && priv->promisc)
- ret = rxq_promiscuous_enable(rxq);
- if (!ret && priv->allmulti)
- ret = rxq_allmulticast_enable(rxq);
- if (!ret)
- continue;
- WARN("%p: QP flow attachment failed: %s",
- (void *)dev, strerror(ret));
- goto err;
- } while ((--r) && ((rxq = (*priv->rxqs)[++i]), i));
- ret = priv_dev_link_interrupt_handler_install(priv, dev);
+ ret = mlx4_rss_init(priv);
if (ret) {
- ERROR("%p: LSC handler install failed",
- (void *)dev);
+ ERROR("%p: cannot initialize RSS resources: %s",
+ (void *)dev, strerror(-ret));
goto err;
}
- ret = priv_dev_removal_interrupt_handler_install(priv, dev);
+ ret = mlx4_intr_install(priv);
if (ret) {
- ERROR("%p: RMV handler install failed",
+ ERROR("%p: interrupt handler installation failed",
(void *)dev);
goto err;
}
- ret = priv_rx_intr_vec_enable(priv);
- if (ret) {
- ERROR("%p: Rx interrupt vector creation failed",
- (void *)dev);
- goto err;
- }
- ret = mlx4_priv_flow_start(priv);
+ ret = mlx4_flow_sync(priv, &error);
if (ret) {
- ERROR("%p: flow start failed: %s",
- (void *)dev, strerror(ret));
+ ERROR("%p: cannot attach flow rules (code %d, \"%s\"),"
+ " flow error type %d, cause %p, message: %s",
+ (void *)dev,
+ -ret, strerror(-ret), error.type, error.cause,
+ error.message ? error.message : "(unspecified)");
goto err;
}
- priv_unlock(priv);
+ rte_wmb();
+ dev->tx_pkt_burst = mlx4_tx_burst;
+ dev->rx_pkt_burst = mlx4_rx_burst;
return 0;
err:
/* Rollback. */
- while (i != 0) {
- rxq = (*priv->rxqs)[i--];
- if (rxq != NULL) {
- rxq_allmulticast_disable(rxq);
- rxq_promiscuous_disable(rxq);
- rxq_mac_addrs_del(rxq);
- }
- }
priv->started = 0;
- priv_unlock(priv);
- return -ret;
+ return ret;
}
/**
@@ -4220,102 +178,19 @@ static void
mlx4_dev_stop(struct rte_eth_dev *dev)
{
struct priv *priv = dev->data->dev_private;
- unsigned int i = 0;
- unsigned int r;
- struct rxq *rxq;
- if (mlx4_is_secondary())
- return;
- priv_lock(priv);
- if (!priv->started) {
- priv_unlock(priv);
+ if (!priv->started)
return;
- }
DEBUG("%p: detaching flows from all RX queues", (void *)dev);
priv->started = 0;
- if (priv->isolated) {
- rxq = NULL;
- r = 1;
- } else if (priv->rss) {
- rxq = LIST_FIRST(&priv->parents);
- r = 1;
- } else {
- rxq = (*priv->rxqs)[0];
- r = priv->rxqs_n;
- }
- mlx4_priv_flow_stop(priv);
- /* Iterate only once when RSS is enabled. */
- do {
- /* Ignore nonexistent RX queues. */
- if (rxq == NULL)
- continue;
- rxq_allmulticast_disable(rxq);
- rxq_promiscuous_disable(rxq);
- rxq_mac_addrs_del(rxq);
- } while ((--r) && ((rxq = (*priv->rxqs)[++i]), i));
- priv_unlock(priv);
-}
-
-/**
- * Dummy DPDK callback for TX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_txq
- * Generic pointer to TX queue structure.
- * @param[in] pkts
- * Packets to transmit.
- * @param pkts_n
- * Number of packets in array.
- *
- * @return
- * Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- (void)dpdk_txq;
- (void)pkts;
- (void)pkts_n;
- return 0;
-}
-
-/**
- * Dummy DPDK callback for RX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- (void)dpdk_rxq;
- (void)pkts;
- (void)pkts_n;
- return 0;
+ dev->tx_pkt_burst = mlx4_tx_burst_removed;
+ dev->rx_pkt_burst = mlx4_rx_burst_removed;
+ rte_wmb();
+ mlx4_flow_sync(priv, NULL);
+ mlx4_intr_uninstall(priv);
+ mlx4_rss_deinit(priv);
}
-static int
-priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *);
-
-static int
-priv_dev_removal_interrupt_handler_uninstall(struct priv *,
- struct rte_eth_dev *);
-
-static int
-priv_dev_link_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *);
-
/**
* DPDK callback to close the device.
*
@@ -4327,1047 +202,58 @@ priv_dev_link_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *);
static void
mlx4_dev_close(struct rte_eth_dev *dev)
{
- struct priv *priv = mlx4_get_priv(dev);
- void *tmp;
+ struct priv *priv = dev->data->dev_private;
unsigned int i;
- if (priv == NULL)
- return;
- priv_lock(priv);
DEBUG("%p: closing device \"%s\"",
(void *)dev,
((priv->ctx != NULL) ? priv->ctx->device->name : ""));
- /* Prevent crashes when queues are still in use. This is unfortunately
- * still required for DPDK 1.3 because some programs (such as testpmd)
- * never release them before closing the device. */
- dev->rx_pkt_burst = removed_rx_burst;
- dev->tx_pkt_burst = removed_tx_burst;
- if (priv->rxqs != NULL) {
- /* XXX race condition if mlx4_rx_burst() is still running. */
- usleep(1000);
- for (i = 0; (i != priv->rxqs_n); ++i) {
- tmp = (*priv->rxqs)[i];
- if (tmp == NULL)
- continue;
- (*priv->rxqs)[i] = NULL;
- rxq_cleanup(tmp);
- rte_free(tmp);
- }
- priv->rxqs_n = 0;
- priv->rxqs = NULL;
- }
- if (priv->txqs != NULL) {
- /* XXX race condition if mlx4_tx_burst() is still running. */
- usleep(1000);
- for (i = 0; (i != priv->txqs_n); ++i) {
- tmp = (*priv->txqs)[i];
- if (tmp == NULL)
- continue;
- (*priv->txqs)[i] = NULL;
- txq_cleanup(tmp);
- rte_free(tmp);
- }
- priv->txqs_n = 0;
- priv->txqs = NULL;
- }
- if (priv->rss)
- priv_parent_list_cleanup(priv);
+ dev->rx_pkt_burst = mlx4_rx_burst_removed;
+ dev->tx_pkt_burst = mlx4_tx_burst_removed;
+ rte_wmb();
+ mlx4_flow_clean(priv);
+ for (i = 0; i != dev->data->nb_rx_queues; ++i)
+ mlx4_rx_queue_release(dev->data->rx_queues[i]);
+ for (i = 0; i != dev->data->nb_tx_queues; ++i)
+ mlx4_tx_queue_release(dev->data->tx_queues[i]);
if (priv->pd != NULL) {
assert(priv->ctx != NULL);
claim_zero(ibv_dealloc_pd(priv->pd));
claim_zero(ibv_close_device(priv->ctx));
} else
assert(priv->ctx == NULL);
- priv_dev_removal_interrupt_handler_uninstall(priv, dev);
- priv_dev_link_interrupt_handler_uninstall(priv, dev);
- priv_rx_intr_vec_disable(priv);
- priv_unlock(priv);
+ mlx4_intr_uninstall(priv);
memset(priv, 0, sizeof(*priv));
}
-/**
- * Change the link state (UP / DOWN).
- *
- * @param priv
- * Pointer to Ethernet device private data.
- * @param up
- * Nonzero for link up, otherwise link down.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-priv_set_link(struct priv *priv, int up)
-{
- struct rte_eth_dev *dev = priv->dev;
- int err;
- unsigned int i;
-
- if (up) {
- err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
- if (err)
- return err;
- for (i = 0; i < priv->rxqs_n; i++)
- if ((*priv->rxqs)[i]->sp)
- break;
- /* Check if an sp queue exists.
- * Note: Some old frames might be received.
- */
- if (i == priv->rxqs_n)
- dev->rx_pkt_burst = mlx4_rx_burst;
- else
- dev->rx_pkt_burst = mlx4_rx_burst_sp;
- dev->tx_pkt_burst = mlx4_tx_burst;
- } else {
- err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
- if (err)
- return err;
- dev->rx_pkt_burst = removed_rx_burst;
- dev->tx_pkt_burst = removed_tx_burst;
- }
- return 0;
-}
-
-/**
- * DPDK callback to bring the link DOWN.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-mlx4_set_link_down(struct rte_eth_dev *dev)
-{
- struct priv *priv = dev->data->dev_private;
- int err;
-
- priv_lock(priv);
- err = priv_set_link(priv, 0);
- priv_unlock(priv);
- return err;
-}
-
-/**
- * DPDK callback to bring the link UP.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-mlx4_set_link_up(struct rte_eth_dev *dev)
-{
- struct priv *priv = dev->data->dev_private;
- int err;
-
- priv_lock(priv);
- err = priv_set_link(priv, 1);
- priv_unlock(priv);
- return err;
-}
-/**
- * DPDK callback to get information about the device.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param[out] info
- * Info structure output buffer.
- */
-static void
-mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
-{
- struct priv *priv = mlx4_get_priv(dev);
- unsigned int max;
- char ifname[IF_NAMESIZE];
-
- info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
- if (priv == NULL)
- return;
- priv_lock(priv);
- /* FIXME: we should ask the device for these values. */
- info->min_rx_bufsize = 32;
- info->max_rx_pktlen = 65536;
- /*
- * Since we need one CQ per QP, the limit is the minimum number
- * between the two values.
- */
- max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ?
- priv->device_attr.max_qp : priv->device_attr.max_cq);
- /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
- if (max >= 65535)
- max = 65535;
- info->max_rx_queues = max;
- info->max_tx_queues = max;
- /* Last array entry is reserved for broadcast. */
- info->max_mac_addrs = (elemof(priv->mac) - 1);
- info->rx_offload_capa =
- (priv->hw_csum ?
- (DEV_RX_OFFLOAD_IPV4_CKSUM |
- DEV_RX_OFFLOAD_UDP_CKSUM |
- DEV_RX_OFFLOAD_TCP_CKSUM) :
- 0);
- info->tx_offload_capa =
- (priv->hw_csum ?
- (DEV_TX_OFFLOAD_IPV4_CKSUM |
- DEV_TX_OFFLOAD_UDP_CKSUM |
- DEV_TX_OFFLOAD_TCP_CKSUM) :
- 0);
- if (priv_get_ifname(priv, &ifname) == 0)
- info->if_index = if_nametoindex(ifname);
- info->speed_capa =
- ETH_LINK_SPEED_1G |
- ETH_LINK_SPEED_10G |
- ETH_LINK_SPEED_20G |
- ETH_LINK_SPEED_40G |
- ETH_LINK_SPEED_56G;
- priv_unlock(priv);
-}
-
-static const uint32_t *
-mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev)
-{
- static const uint32_t ptypes[] = {
- /* refers to rxq_cq_to_pkt_type() */
- RTE_PTYPE_L3_IPV4,
- RTE_PTYPE_L3_IPV6,
- RTE_PTYPE_INNER_L3_IPV4,
- RTE_PTYPE_INNER_L3_IPV6,
- RTE_PTYPE_UNKNOWN
- };
-
- if (dev->rx_pkt_burst == mlx4_rx_burst ||
- dev->rx_pkt_burst == mlx4_rx_burst_sp)
- return ptypes;
- return NULL;
-}
-
-/**
- * DPDK callback to get device statistics.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param[out] stats
- * Stats structure output buffer.
- */
-static void
-mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
-{
- struct priv *priv = mlx4_get_priv(dev);
- struct rte_eth_stats tmp = {0};
- unsigned int i;
- unsigned int idx;
-
- if (priv == NULL)
- return;
- priv_lock(priv);
- /* Add software counters. */
- for (i = 0; (i != priv->rxqs_n); ++i) {
- struct rxq *rxq = (*priv->rxqs)[i];
-
- if (rxq == NULL)
- continue;
- idx = rxq->stats.idx;
- if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
-#ifdef MLX4_PMD_SOFT_COUNTERS
- tmp.q_ipackets[idx] += rxq->stats.ipackets;
- tmp.q_ibytes[idx] += rxq->stats.ibytes;
-#endif
- tmp.q_errors[idx] += (rxq->stats.idropped +
- rxq->stats.rx_nombuf);
- }
-#ifdef MLX4_PMD_SOFT_COUNTERS
- tmp.ipackets += rxq->stats.ipackets;
- tmp.ibytes += rxq->stats.ibytes;
-#endif
- tmp.ierrors += rxq->stats.idropped;
- tmp.rx_nombuf += rxq->stats.rx_nombuf;
- }
- for (i = 0; (i != priv->txqs_n); ++i) {
- struct txq *txq = (*priv->txqs)[i];
-
- if (txq == NULL)
- continue;
- idx = txq->stats.idx;
- if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
-#ifdef MLX4_PMD_SOFT_COUNTERS
- tmp.q_opackets[idx] += txq->stats.opackets;
- tmp.q_obytes[idx] += txq->stats.obytes;
-#endif
- tmp.q_errors[idx] += txq->stats.odropped;
- }
-#ifdef MLX4_PMD_SOFT_COUNTERS
- tmp.opackets += txq->stats.opackets;
- tmp.obytes += txq->stats.obytes;
-#endif
- tmp.oerrors += txq->stats.odropped;
- }
-#ifndef MLX4_PMD_SOFT_COUNTERS
- /* FIXME: retrieve and add hardware counters. */
-#endif
- *stats = tmp;
- priv_unlock(priv);
-}
-
-/**
- * DPDK callback to clear device statistics.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- */
-static void
-mlx4_stats_reset(struct rte_eth_dev *dev)
-{
- struct priv *priv = mlx4_get_priv(dev);
- unsigned int i;
- unsigned int idx;
-
- if (priv == NULL)
- return;
- priv_lock(priv);
- for (i = 0; (i != priv->rxqs_n); ++i) {
- if ((*priv->rxqs)[i] == NULL)
- continue;
- idx = (*priv->rxqs)[i]->stats.idx;
- (*priv->rxqs)[i]->stats =
- (struct mlx4_rxq_stats){ .idx = idx };
- }
- for (i = 0; (i != priv->txqs_n); ++i) {
- if ((*priv->txqs)[i] == NULL)
- continue;
- idx = (*priv->txqs)[i]->stats.idx;
- (*priv->txqs)[i]->stats =
- (struct mlx4_txq_stats){ .idx = idx };
- }
-#ifndef MLX4_PMD_SOFT_COUNTERS
- /* FIXME: reset hardware counters. */
-#endif
- priv_unlock(priv);
-}
-
-/**
- * DPDK callback to remove a MAC address.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param index
- * MAC address index.
- */
-static void
-mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
-{
- struct priv *priv = dev->data->dev_private;
-
- if (mlx4_is_secondary())
- return;
- priv_lock(priv);
- if (priv->isolated)
- goto end;
- DEBUG("%p: removing MAC address from index %" PRIu32,
- (void *)dev, index);
- /* Last array entry is reserved for broadcast. */
- if (index >= (elemof(priv->mac) - 1))
- goto end;
- priv_mac_addr_del(priv, index);
-end:
- priv_unlock(priv);
-}
-
-/**
- * DPDK callback to add a MAC address.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param mac_addr
- * MAC address to register.
- * @param index
- * MAC address index.
- * @param vmdq
- * VMDq pool index to associate address with (ignored).
- */
-static int
-mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
- uint32_t index, uint32_t vmdq)
-{
- struct priv *priv = dev->data->dev_private;
- int re;
-
- if (mlx4_is_secondary())
- return -ENOTSUP;
- (void)vmdq;
- priv_lock(priv);
- if (priv->isolated) {
- DEBUG("%p: cannot add MAC address, "
- "device is in isolated mode", (void *)dev);
- re = EPERM;
- goto end;
- }
- DEBUG("%p: adding MAC address at index %" PRIu32,
- (void *)dev, index);
- /* Last array entry is reserved for broadcast. */
- if (index >= (elemof(priv->mac) - 1)) {
- re = EINVAL;
- goto end;
- }
- re = priv_mac_addr_add(priv, index,
- (const uint8_t (*)[ETHER_ADDR_LEN])
- mac_addr->addr_bytes);
-end:
- priv_unlock(priv);
- return -re;
-}
-
-/**
- * DPDK callback to set the primary MAC address.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param mac_addr
- * MAC address to register.
- */
-static void
-mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
-{
- DEBUG("%p: setting primary MAC address", (void *)dev);
- mlx4_mac_addr_remove(dev, 0);
- mlx4_mac_addr_add(dev, mac_addr, 0, 0);
-}
-
-/**
- * DPDK callback to enable promiscuous mode.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- */
-static void
-mlx4_promiscuous_enable(struct rte_eth_dev *dev)
-{
- struct priv *priv = dev->data->dev_private;
- unsigned int i;
- int ret;
-
- if (mlx4_is_secondary())
- return;
- priv_lock(priv);
- if (priv->isolated) {
- DEBUG("%p: cannot enable promiscuous, "
- "device is in isolated mode", (void *)dev);
- priv_unlock(priv);
- return;
- }
- if (priv->promisc) {
- priv_unlock(priv);
- return;
- }
- /* If device isn't started, this is all we need to do. */
- if (!priv->started)
- goto end;
- if (priv->rss) {
- ret = rxq_promiscuous_enable(LIST_FIRST(&priv->parents));
- if (ret) {
- priv_unlock(priv);
- return;
- }
- goto end;
- }
- for (i = 0; (i != priv->rxqs_n); ++i) {
- if ((*priv->rxqs)[i] == NULL)
- continue;
- ret = rxq_promiscuous_enable((*priv->rxqs)[i]);
- if (!ret)
- continue;
- /* Failure, rollback. */
- while (i != 0)
- if ((*priv->rxqs)[--i] != NULL)
- rxq_promiscuous_disable((*priv->rxqs)[i]);
- priv_unlock(priv);
- return;
- }
-end:
- priv->promisc = 1;
- priv_unlock(priv);
-}
-
-/**
- * DPDK callback to disable promiscuous mode.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- */
-static void
-mlx4_promiscuous_disable(struct rte_eth_dev *dev)
-{
- struct priv *priv = dev->data->dev_private;
- unsigned int i;
-
- if (mlx4_is_secondary())
- return;
- priv_lock(priv);
- if (!priv->promisc || priv->isolated) {
- priv_unlock(priv);
- return;
- }
- if (priv->rss) {
- rxq_promiscuous_disable(LIST_FIRST(&priv->parents));
- goto end;
- }
- for (i = 0; (i != priv->rxqs_n); ++i)
- if ((*priv->rxqs)[i] != NULL)
- rxq_promiscuous_disable((*priv->rxqs)[i]);
-end:
- priv->promisc = 0;
- priv_unlock(priv);
-}
-
-/**
- * DPDK callback to enable allmulti mode.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- */
-static void
-mlx4_allmulticast_enable(struct rte_eth_dev *dev)
-{
- struct priv *priv = dev->data->dev_private;
- unsigned int i;
- int ret;
-
- if (mlx4_is_secondary())
- return;
- priv_lock(priv);
- if (priv->isolated) {
- DEBUG("%p: cannot enable allmulticast, "
- "device is in isolated mode", (void *)dev);
- priv_unlock(priv);
- return;
- }
- if (priv->allmulti) {
- priv_unlock(priv);
- return;
- }
- /* If device isn't started, this is all we need to do. */
- if (!priv->started)
- goto end;
- if (priv->rss) {
- ret = rxq_allmulticast_enable(LIST_FIRST(&priv->parents));
- if (ret) {
- priv_unlock(priv);
- return;
- }
- goto end;
- }
- for (i = 0; (i != priv->rxqs_n); ++i) {
- if ((*priv->rxqs)[i] == NULL)
- continue;
- ret = rxq_allmulticast_enable((*priv->rxqs)[i]);
- if (!ret)
- continue;
- /* Failure, rollback. */
- while (i != 0)
- if ((*priv->rxqs)[--i] != NULL)
- rxq_allmulticast_disable((*priv->rxqs)[i]);
- priv_unlock(priv);
- return;
- }
-end:
- priv->allmulti = 1;
- priv_unlock(priv);
-}
-
-/**
- * DPDK callback to disable allmulti mode.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- */
-static void
-mlx4_allmulticast_disable(struct rte_eth_dev *dev)
-{
- struct priv *priv = dev->data->dev_private;
- unsigned int i;
-
- if (mlx4_is_secondary())
- return;
- priv_lock(priv);
- if (!priv->allmulti || priv->isolated) {
- priv_unlock(priv);
- return;
- }
- if (priv->rss) {
- rxq_allmulticast_disable(LIST_FIRST(&priv->parents));
- goto end;
- }
- for (i = 0; (i != priv->rxqs_n); ++i)
- if ((*priv->rxqs)[i] != NULL)
- rxq_allmulticast_disable((*priv->rxqs)[i]);
-end:
- priv->allmulti = 0;
- priv_unlock(priv);
-}
-
-/**
- * DPDK callback to retrieve physical link information.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param wait_to_complete
- * Wait for request completion (ignored).
- */
-static int
-mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete)
-{
- const struct priv *priv = mlx4_get_priv(dev);
- struct ethtool_cmd edata = {
- .cmd = ETHTOOL_GSET
- };
- struct ifreq ifr;
- struct rte_eth_link dev_link;
- int link_speed = 0;
-
- /* priv_lock() is not taken to allow concurrent calls. */
-
- if (priv == NULL)
- return -EINVAL;
- (void)wait_to_complete;
- if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
- WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
- return -1;
- }
- memset(&dev_link, 0, sizeof(dev_link));
- dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
- (ifr.ifr_flags & IFF_RUNNING));
- ifr.ifr_data = (void *)&edata;
- if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
- WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
- strerror(errno));
- return -1;
- }
- link_speed = ethtool_cmd_speed(&edata);
- if (link_speed == -1)
- dev_link.link_speed = 0;
- else
- dev_link.link_speed = link_speed;
- dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
- ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
- dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
- ETH_LINK_SPEED_FIXED);
- if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
- /* Link status changed. */
- dev->data->dev_link = dev_link;
- return 0;
- }
- /* Link status is still the same. */
- return -1;
-}
-
-static int
-mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
- struct rte_pci_addr *pci_addr);
-
-/**
- * DPDK callback to change the MTU.
- *
- * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be
- * received). Use this as a hint to enable/disable scattered packets support
- * and improve performance when not needed.
- * Since failure is not an option, reconfiguring queues on the fly is not
- * recommended.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param in_mtu
- * New MTU.
- *
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-mlx4_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
-{
- struct priv *priv = dev->data->dev_private;
- int ret = 0;
- unsigned int i;
- uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
- mlx4_rx_burst;
-
- if (mlx4_is_secondary())
- return -E_RTE_SECONDARY;
- priv_lock(priv);
- /* Set kernel interface MTU first. */
- if (priv_set_mtu(priv, mtu)) {
- ret = errno;
- WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
- strerror(ret));
- goto out;
- } else
- DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
- priv->mtu = mtu;
- /* Temporarily replace RX handler with a fake one, assuming it has not
- * been copied elsewhere. */
- dev->rx_pkt_burst = removed_rx_burst;
- /* Make sure everyone has left mlx4_rx_burst() and uses
- * removed_rx_burst() instead. */
- rte_wmb();
- usleep(1000);
- /* Reconfigure each RX queue. */
- for (i = 0; (i != priv->rxqs_n); ++i) {
- struct rxq *rxq = (*priv->rxqs)[i];
- unsigned int max_frame_len;
-
- if (rxq == NULL)
- continue;
- /* Calculate new maximum frame length according to MTU. */
- max_frame_len = (priv->mtu + ETHER_HDR_LEN +
- (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
- /* Provide new values to rxq_setup(). */
- dev->data->dev_conf.rxmode.jumbo_frame =
- (max_frame_len > ETHER_MAX_LEN);
- dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
- ret = rxq_rehash(dev, rxq);
- if (ret) {
- /* Force SP RX if that queue requires it and abort. */
- if (rxq->sp)
- rx_func = mlx4_rx_burst_sp;
- break;
- }
- /* Reenable non-RSS queue attributes. No need to check
- * for errors at this stage. */
- if (!priv->rss && !priv->isolated) {
- rxq_mac_addrs_add(rxq);
- if (priv->promisc)
- rxq_promiscuous_enable(rxq);
- if (priv->allmulti)
- rxq_allmulticast_enable(rxq);
- }
- /* Scattered burst function takes priority. */
- if (rxq->sp)
- rx_func = mlx4_rx_burst_sp;
- }
- /* Burst functions can now be called again. */
- rte_wmb();
- dev->rx_pkt_burst = rx_func;
-out:
- priv_unlock(priv);
- assert(ret >= 0);
- return -ret;
-}
-
-/**
- * DPDK callback to get flow control status.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param[out] fc_conf
- * Flow control output buffer.
- *
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-mlx4_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
-{
- struct priv *priv = dev->data->dev_private;
- struct ifreq ifr;
- struct ethtool_pauseparam ethpause = {
- .cmd = ETHTOOL_GPAUSEPARAM
- };
- int ret;
-
- if (mlx4_is_secondary())
- return -E_RTE_SECONDARY;
- ifr.ifr_data = (void *)&ethpause;
- priv_lock(priv);
- if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
- ret = errno;
- WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
- " failed: %s",
- strerror(ret));
- goto out;
- }
-
- fc_conf->autoneg = ethpause.autoneg;
- if (ethpause.rx_pause && ethpause.tx_pause)
- fc_conf->mode = RTE_FC_FULL;
- else if (ethpause.rx_pause)
- fc_conf->mode = RTE_FC_RX_PAUSE;
- else if (ethpause.tx_pause)
- fc_conf->mode = RTE_FC_TX_PAUSE;
- else
- fc_conf->mode = RTE_FC_NONE;
- ret = 0;
-
-out:
- priv_unlock(priv);
- assert(ret >= 0);
- return -ret;
-}
-
-/**
- * DPDK callback to modify flow control parameters.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param[in] fc_conf
- * Flow control parameters.
- *
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-mlx4_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
-{
- struct priv *priv = dev->data->dev_private;
- struct ifreq ifr;
- struct ethtool_pauseparam ethpause = {
- .cmd = ETHTOOL_SPAUSEPARAM
- };
- int ret;
-
- if (mlx4_is_secondary())
- return -E_RTE_SECONDARY;
- ifr.ifr_data = (void *)&ethpause;
- ethpause.autoneg = fc_conf->autoneg;
- if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
- (fc_conf->mode & RTE_FC_RX_PAUSE))
- ethpause.rx_pause = 1;
- else
- ethpause.rx_pause = 0;
-
- if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
- (fc_conf->mode & RTE_FC_TX_PAUSE))
- ethpause.tx_pause = 1;
- else
- ethpause.tx_pause = 0;
-
- priv_lock(priv);
- if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
- ret = errno;
- WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
- " failed: %s",
- strerror(ret));
- goto out;
- }
- ret = 0;
-
-out:
- priv_unlock(priv);
- assert(ret >= 0);
- return -ret;
-}
-
-/**
- * Configure a VLAN filter.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param vlan_id
- * VLAN ID to filter.
- * @param on
- * Toggle filter.
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
-{
- struct priv *priv = dev->data->dev_private;
- unsigned int i;
- unsigned int j = -1;
-
- DEBUG("%p: %s VLAN filter ID %" PRIu16,
- (void *)dev, (on ? "enable" : "disable"), vlan_id);
- for (i = 0; (i != elemof(priv->vlan_filter)); ++i) {
- if (!priv->vlan_filter[i].enabled) {
- /* Unused index, remember it. */
- j = i;
- continue;
- }
- if (priv->vlan_filter[i].id != vlan_id)
- continue;
- /* This VLAN ID is already known, use its index. */
- j = i;
- break;
- }
- /* Check if there's room for another VLAN filter. */
- if (j == (unsigned int)-1)
- return ENOMEM;
- /*
- * VLAN filters apply to all configured MAC addresses, flow
- * specifications must be reconfigured accordingly.
- */
- priv->vlan_filter[j].id = vlan_id;
- if ((on) && (!priv->vlan_filter[j].enabled)) {
- /*
- * Filter is disabled, enable it.
- * Rehashing flows in all RX queues is necessary.
- */
- if (priv->rss)
- rxq_mac_addrs_del(LIST_FIRST(&priv->parents));
- else
- for (i = 0; (i != priv->rxqs_n); ++i)
- if ((*priv->rxqs)[i] != NULL)
- rxq_mac_addrs_del((*priv->rxqs)[i]);
- priv->vlan_filter[j].enabled = 1;
- if (priv->started) {
- if (priv->rss)
- rxq_mac_addrs_add(LIST_FIRST(&priv->parents));
- else
- for (i = 0; (i != priv->rxqs_n); ++i) {
- if ((*priv->rxqs)[i] == NULL)
- continue;
- rxq_mac_addrs_add((*priv->rxqs)[i]);
- }
- }
- } else if ((!on) && (priv->vlan_filter[j].enabled)) {
- /*
- * Filter is enabled, disable it.
- * Rehashing flows in all RX queues is necessary.
- */
- if (priv->rss)
- rxq_mac_addrs_del(LIST_FIRST(&priv->parents));
- else
- for (i = 0; (i != priv->rxqs_n); ++i)
- if ((*priv->rxqs)[i] != NULL)
- rxq_mac_addrs_del((*priv->rxqs)[i]);
- priv->vlan_filter[j].enabled = 0;
- if (priv->started) {
- if (priv->rss)
- rxq_mac_addrs_add(LIST_FIRST(&priv->parents));
- else
- for (i = 0; (i != priv->rxqs_n); ++i) {
- if ((*priv->rxqs)[i] == NULL)
- continue;
- rxq_mac_addrs_add((*priv->rxqs)[i]);
- }
- }
- }
- return 0;
-}
-
-/**
- * DPDK callback to configure a VLAN filter.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param vlan_id
- * VLAN ID to filter.
- * @param on
- * Toggle filter.
- *
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
-{
- struct priv *priv = dev->data->dev_private;
- int ret;
-
- if (mlx4_is_secondary())
- return -E_RTE_SECONDARY;
- priv_lock(priv);
- if (priv->isolated) {
- DEBUG("%p: cannot set vlan filter, "
- "device is in isolated mode", (void *)dev);
- priv_unlock(priv);
- return -EINVAL;
- }
- ret = vlan_filter_set(dev, vlan_id, on);
- priv_unlock(priv);
- assert(ret >= 0);
- return -ret;
-}
-
-const struct rte_flow_ops mlx4_flow_ops = {
- .validate = mlx4_flow_validate,
- .create = mlx4_flow_create,
- .destroy = mlx4_flow_destroy,
- .flush = mlx4_flow_flush,
- .query = NULL,
- .isolate = mlx4_flow_isolate,
-};
-
-/**
- * Manage filter operations.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param filter_type
- * Filter type.
- * @param filter_op
- * Operation to perform.
- * @param arg
- * Pointer to operation-specific structure.
- *
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-mlx4_dev_filter_ctrl(struct rte_eth_dev *dev,
- enum rte_filter_type filter_type,
- enum rte_filter_op filter_op,
- void *arg)
-{
- int ret = EINVAL;
-
- switch (filter_type) {
- case RTE_ETH_FILTER_GENERIC:
- if (filter_op != RTE_ETH_FILTER_GET)
- return -EINVAL;
- *(const void **)arg = &mlx4_flow_ops;
- return 0;
- case RTE_ETH_FILTER_FDIR:
- DEBUG("%p: filter type FDIR is not supported by this PMD",
- (void *)dev);
- break;
- default:
- ERROR("%p: filter type (%d) not supported",
- (void *)dev, filter_type);
- break;
- }
- return -ret;
-}
-
static const struct eth_dev_ops mlx4_dev_ops = {
.dev_configure = mlx4_dev_configure,
.dev_start = mlx4_dev_start,
.dev_stop = mlx4_dev_stop,
- .dev_set_link_down = mlx4_set_link_down,
- .dev_set_link_up = mlx4_set_link_up,
+ .dev_set_link_down = mlx4_dev_set_link_down,
+ .dev_set_link_up = mlx4_dev_set_link_up,
.dev_close = mlx4_dev_close,
+ .link_update = mlx4_link_update,
.promiscuous_enable = mlx4_promiscuous_enable,
.promiscuous_disable = mlx4_promiscuous_disable,
.allmulticast_enable = mlx4_allmulticast_enable,
.allmulticast_disable = mlx4_allmulticast_disable,
- .link_update = mlx4_link_update,
+ .mac_addr_remove = mlx4_mac_addr_remove,
+ .mac_addr_add = mlx4_mac_addr_add,
+ .mac_addr_set = mlx4_mac_addr_set,
.stats_get = mlx4_stats_get,
.stats_reset = mlx4_stats_reset,
- .queue_stats_mapping_set = NULL,
.dev_infos_get = mlx4_dev_infos_get,
.dev_supported_ptypes_get = mlx4_dev_supported_ptypes_get,
.vlan_filter_set = mlx4_vlan_filter_set,
- .vlan_tpid_set = NULL,
- .vlan_strip_queue_set = NULL,
- .vlan_offload_set = NULL,
.rx_queue_setup = mlx4_rx_queue_setup,
.tx_queue_setup = mlx4_tx_queue_setup,
.rx_queue_release = mlx4_rx_queue_release,
.tx_queue_release = mlx4_tx_queue_release,
- .dev_led_on = NULL,
- .dev_led_off = NULL,
- .flow_ctrl_get = mlx4_dev_get_flow_ctrl,
- .flow_ctrl_set = mlx4_dev_set_flow_ctrl,
- .priority_flow_ctrl_set = NULL,
- .mac_addr_remove = mlx4_mac_addr_remove,
- .mac_addr_add = mlx4_mac_addr_add,
- .mac_addr_set = mlx4_mac_addr_set,
- .mtu_set = mlx4_dev_set_mtu,
- .filter_ctrl = mlx4_dev_filter_ctrl,
+ .flow_ctrl_get = mlx4_flow_ctrl_get,
+ .flow_ctrl_set = mlx4_flow_ctrl_set,
+ .mtu_set = mlx4_mtu_set,
+ .filter_ctrl = mlx4_filter_ctrl,
.rx_queue_intr_enable = mlx4_rx_intr_enable,
.rx_queue_intr_disable = mlx4_rx_intr_disable,
};
@@ -5381,7 +267,7 @@ static const struct eth_dev_ops mlx4_dev_ops = {
* PCI bus address output buffer.
*
* @return
- * 0 on success, -1 on failure and errno is set.
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
static int
mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
@@ -5392,8 +278,10 @@ mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
MKSTR(path, "%s/device/uevent", device->ibdev_path);
file = fopen(path, "rb");
- if (file == NULL)
- return -1;
+ if (file == NULL) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
while (fgets(line, sizeof(line), file) == line) {
size_t len = strlen(line);
int ret;
@@ -5423,572 +311,48 @@ mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
}
/**
- * Get MAC address by querying netdevice.
- *
- * @param[in] priv
- * struct priv for the requested device.
- * @param[out] mac
- * MAC address output buffer.
- *
- * @return
- * 0 on success, -1 on failure and errno is set.
- */
-static int
-priv_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
-{
- struct ifreq request;
-
- if (priv_ifreq(priv, SIOCGIFHWADDR, &request))
- return -1;
- memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
- return 0;
-}
-
-/* Support up to 32 adapters. */
-static struct {
- struct rte_pci_addr pci_addr; /* associated PCI address */
- uint32_t ports; /* physical ports bitfield. */
-} mlx4_dev[32];
-
-/**
- * Get device index in mlx4_dev[] from PCI bus address.
- *
- * @param[in] pci_addr
- * PCI bus address to look for.
- *
- * @return
- * mlx4_dev[] index on success, -1 on failure.
- */
-static int
-mlx4_dev_idx(struct rte_pci_addr *pci_addr)
-{
- unsigned int i;
- int ret = -1;
-
- assert(pci_addr != NULL);
- for (i = 0; (i != elemof(mlx4_dev)); ++i) {
- if ((mlx4_dev[i].pci_addr.domain == pci_addr->domain) &&
- (mlx4_dev[i].pci_addr.bus == pci_addr->bus) &&
- (mlx4_dev[i].pci_addr.devid == pci_addr->devid) &&
- (mlx4_dev[i].pci_addr.function == pci_addr->function))
- return i;
- if ((mlx4_dev[i].ports == 0) && (ret == -1))
- ret = i;
- }
- return ret;
-}
-
-/**
- * Retrieve integer value from environment variable.
- *
- * @param[in] name
- * Environment variable name.
- *
- * @return
- * Integer value, 0 if the variable is not set.
- */
-static int
-mlx4_getenv_int(const char *name)
-{
- const char *val = getenv(name);
-
- if (val == NULL)
- return 0;
- return atoi(val);
-}
-
-static void
-mlx4_dev_link_status_handler(void *);
-static void
-mlx4_dev_interrupt_handler(void *);
-
-/**
- * Link/device status handler.
- *
- * @param priv
- * Pointer to private structure.
- * @param dev
- * Pointer to the rte_eth_dev structure.
- * @param events
- * Pointer to event flags holder.
- *
- * @return
- * Number of events
- */
-static int
-priv_dev_status_handler(struct priv *priv, struct rte_eth_dev *dev,
- uint32_t *events)
-{
- struct ibv_async_event event;
- int port_change = 0;
- struct rte_eth_link *link = &dev->data->dev_link;
- int ret = 0;
-
- *events = 0;
- /* Read all message and acknowledge them. */
- for (;;) {
- if (ibv_get_async_event(priv->ctx, &event))
- break;
- if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
- event.event_type == IBV_EVENT_PORT_ERR) &&
- (priv->intr_conf.lsc == 1)) {
- port_change = 1;
- ret++;
- } else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
- priv->intr_conf.rmv == 1) {
- *events |= (1 << RTE_ETH_EVENT_INTR_RMV);
- ret++;
- } else
- DEBUG("event type %d on port %d not handled",
- event.event_type, event.element.port_num);
- ibv_ack_async_event(&event);
- }
- if (!port_change)
- return ret;
- mlx4_link_update(dev, 0);
- if (((link->link_speed == 0) && link->link_status) ||
- ((link->link_speed != 0) && !link->link_status)) {
- if (!priv->pending_alarm) {
- /* Inconsistent status, check again later. */
- priv->pending_alarm = 1;
- rte_eal_alarm_set(MLX4_ALARM_TIMEOUT_US,
- mlx4_dev_link_status_handler,
- dev);
- }
- } else {
- *events |= (1 << RTE_ETH_EVENT_INTR_LSC);
- }
- return ret;
-}
-
-/**
- * Handle delayed link status event.
- *
- * @param arg
- * Registered argument.
- */
-static void
-mlx4_dev_link_status_handler(void *arg)
-{
- struct rte_eth_dev *dev = arg;
- struct priv *priv = dev->data->dev_private;
- uint32_t events;
- int ret;
-
- priv_lock(priv);
- assert(priv->pending_alarm == 1);
- priv->pending_alarm = 0;
- ret = priv_dev_status_handler(priv, dev, &events);
- priv_unlock(priv);
- if (ret > 0 && events & (1 << RTE_ETH_EVENT_INTR_LSC))
- _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
- NULL);
-}
-
-/**
- * Handle interrupts from the NIC.
- *
- * @param[in] intr_handle
- * Interrupt handler.
- * @param cb_arg
- * Callback argument.
- */
-static void
-mlx4_dev_interrupt_handler(void *cb_arg)
-{
- struct rte_eth_dev *dev = cb_arg;
- struct priv *priv = dev->data->dev_private;
- int ret;
- uint32_t ev;
- int i;
-
- priv_lock(priv);
- ret = priv_dev_status_handler(priv, dev, &ev);
- priv_unlock(priv);
- if (ret > 0) {
- for (i = RTE_ETH_EVENT_UNKNOWN;
- i < RTE_ETH_EVENT_MAX;
- i++) {
- if (ev & (1 << i)) {
- ev &= ~(1 << i);
- _rte_eth_dev_callback_process(dev, i, NULL,
- NULL);
- ret--;
- }
- }
- if (ret)
- WARN("%d event%s not processed", ret,
- (ret > 1 ? "s were" : " was"));
- }
-}
-
-/**
- * Uninstall interrupt handler.
- *
- * @param priv
- * Pointer to private structure.
- * @param dev
- * Pointer to the rte_eth_dev structure.
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
-{
- int ret;
-
- if (priv->intr_conf.lsc ||
- priv->intr_conf.rmv)
- return 0;
- ret = rte_intr_callback_unregister(&priv->intr_handle,
- mlx4_dev_interrupt_handler,
- dev);
- if (ret < 0) {
- ERROR("rte_intr_callback_unregister failed with %d"
- "%s%s%s", ret,
- (errno ? " (errno: " : ""),
- (errno ? strerror(errno) : ""),
- (errno ? ")" : ""));
- }
- priv->intr_handle.fd = 0;
- priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
- return ret;
-}
-
-/**
- * Install interrupt handler.
- *
- * @param priv
- * Pointer to private structure.
- * @param dev
- * Pointer to the rte_eth_dev structure.
- * @return
- * 0 on success, negative errno value on failure.
- */
-static int
-priv_dev_interrupt_handler_install(struct priv *priv,
- struct rte_eth_dev *dev)
-{
- int flags;
- int rc;
-
- /* Check whether the interrupt handler has already been installed
- * for either type of interrupt
- */
- if (priv->intr_conf.lsc &&
- priv->intr_conf.rmv &&
- priv->intr_handle.fd)
- return 0;
- assert(priv->ctx->async_fd > 0);
- flags = fcntl(priv->ctx->async_fd, F_GETFL);
- rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
- if (rc < 0) {
- INFO("failed to change file descriptor async event queue");
- dev->data->dev_conf.intr_conf.lsc = 0;
- dev->data->dev_conf.intr_conf.rmv = 0;
- return -errno;
- } else {
- priv->intr_handle.fd = priv->ctx->async_fd;
- priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
- rc = rte_intr_callback_register(&priv->intr_handle,
- mlx4_dev_interrupt_handler,
- dev);
- if (rc) {
- ERROR("rte_intr_callback_register failed "
- " (errno: %s)", strerror(errno));
- return rc;
- }
- }
- return 0;
-}
-
-/**
- * Uninstall interrupt handler.
- *
- * @param priv
- * Pointer to private structure.
- * @param dev
- * Pointer to the rte_eth_dev structure.
- * @return
- * 0 on success, negative value on error.
- */
-static int
-priv_dev_removal_interrupt_handler_uninstall(struct priv *priv,
- struct rte_eth_dev *dev)
-{
- if (dev->data->dev_conf.intr_conf.rmv) {
- priv->intr_conf.rmv = 0;
- return priv_dev_interrupt_handler_uninstall(priv, dev);
- }
- return 0;
-}
-
-/**
- * Uninstall interrupt handler.
- *
- * @param priv
- * Pointer to private structure.
- * @param dev
- * Pointer to the rte_eth_dev structure.
- * @return
- * 0 on success, negative value on error,
- */
-static int
-priv_dev_link_interrupt_handler_uninstall(struct priv *priv,
- struct rte_eth_dev *dev)
-{
- int ret = 0;
-
- if (dev->data->dev_conf.intr_conf.lsc) {
- priv->intr_conf.lsc = 0;
- ret = priv_dev_interrupt_handler_uninstall(priv, dev);
- if (ret)
- return ret;
- }
- if (priv->pending_alarm)
- if (rte_eal_alarm_cancel(mlx4_dev_link_status_handler,
- dev)) {
- ERROR("rte_eal_alarm_cancel failed "
- " (errno: %s)", strerror(rte_errno));
- return -rte_errno;
- }
- priv->pending_alarm = 0;
- return 0;
-}
-
-/**
- * Install link interrupt handler.
- *
- * @param priv
- * Pointer to private structure.
- * @param dev
- * Pointer to the rte_eth_dev structure.
- * @return
- * 0 on success, negative value on error.
- */
-static int
-priv_dev_link_interrupt_handler_install(struct priv *priv,
- struct rte_eth_dev *dev)
-{
- int ret;
-
- if (dev->data->dev_conf.intr_conf.lsc) {
- ret = priv_dev_interrupt_handler_install(priv, dev);
- if (ret)
- return ret;
- priv->intr_conf.lsc = 1;
- }
- return 0;
-}
-
-/**
- * Install removal interrupt handler.
- *
- * @param priv
- * Pointer to private structure.
- * @param dev
- * Pointer to the rte_eth_dev structure.
- * @return
- * 0 on success, negative value on error.
- */
-static int
-priv_dev_removal_interrupt_handler_install(struct priv *priv,
- struct rte_eth_dev *dev)
-{
- int ret;
-
- if (dev->data->dev_conf.intr_conf.rmv) {
- ret = priv_dev_interrupt_handler_install(priv, dev);
- if (ret)
- return ret;
- priv->intr_conf.rmv = 1;
- }
- return 0;
-}
-
-/**
- * Allocate queue vector and fill epoll fd list for Rx interrupts.
- *
- * @param priv
- * Pointer to private structure.
- *
- * @return
- * 0 on success, negative on failure.
- */
-static int
-priv_rx_intr_vec_enable(struct priv *priv)
-{
- unsigned int i;
- unsigned int rxqs_n = priv->rxqs_n;
- unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
- unsigned int count = 0;
- struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
-
- if (!priv->dev->data->dev_conf.intr_conf.rxq)
- return 0;
- priv_rx_intr_vec_disable(priv);
- intr_handle->intr_vec = malloc(sizeof(intr_handle->intr_vec[rxqs_n]));
- if (intr_handle->intr_vec == NULL) {
- ERROR("failed to allocate memory for interrupt vector,"
- " Rx interrupts will not be supported");
- return -ENOMEM;
- }
- intr_handle->type = RTE_INTR_HANDLE_EXT;
- for (i = 0; i != n; ++i) {
- struct rxq *rxq = (*priv->rxqs)[i];
- int fd;
- int flags;
- int rc;
-
- /* Skip queues that cannot request interrupts. */
- if (!rxq || !rxq->channel) {
- /* Use invalid intr_vec[] index to disable entry. */
- intr_handle->intr_vec[i] =
- RTE_INTR_VEC_RXTX_OFFSET +
- RTE_MAX_RXTX_INTR_VEC_ID;
- continue;
- }
- if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
- ERROR("too many Rx queues for interrupt vector size"
- " (%d), Rx interrupts cannot be enabled",
- RTE_MAX_RXTX_INTR_VEC_ID);
- priv_rx_intr_vec_disable(priv);
- return -1;
- }
- fd = rxq->channel->fd;
- flags = fcntl(fd, F_GETFL);
- rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
- if (rc < 0) {
- ERROR("failed to make Rx interrupt file descriptor"
- " %d non-blocking for queue index %d", fd, i);
- priv_rx_intr_vec_disable(priv);
- return rc;
- }
- intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
- intr_handle->efds[count] = fd;
- count++;
- }
- if (!count)
- priv_rx_intr_vec_disable(priv);
- else
- intr_handle->nb_efd = count;
- return 0;
-}
-
-/**
- * Clean up Rx interrupts handler.
- *
- * @param priv
- * Pointer to private structure.
- */
-static void
-priv_rx_intr_vec_disable(struct priv *priv)
-{
- struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
-
- rte_intr_free_epoll_fd(intr_handle);
- free(intr_handle->intr_vec);
- intr_handle->nb_efd = 0;
- intr_handle->intr_vec = NULL;
-}
-
-/**
- * DPDK callback for Rx queue interrupt enable.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param idx
- * Rx queue index.
- *
- * @return
- * 0 on success, negative on failure.
- */
-static int
-mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx)
-{
- struct priv *priv = dev->data->dev_private;
- struct rxq *rxq = (*priv->rxqs)[idx];
- int ret;
-
- if (!rxq || !rxq->channel)
- ret = EINVAL;
- else
- ret = ibv_req_notify_cq(rxq->cq, 0);
- if (ret)
- WARN("unable to arm interrupt on rx queue %d", idx);
- return -ret;
-}
-
-/**
- * DPDK callback for Rx queue interrupt disable.
- *
- * @param dev
- * Pointer to Ethernet device structure.
- * @param idx
- * Rx queue index.
- *
- * @return
- * 0 on success, negative on failure.
- */
-static int
-mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx)
-{
- struct priv *priv = dev->data->dev_private;
- struct rxq *rxq = (*priv->rxqs)[idx];
- struct ibv_cq *ev_cq;
- void *ev_ctx;
- int ret;
-
- if (!rxq || !rxq->channel) {
- ret = EINVAL;
- } else {
- ret = ibv_get_cq_event(rxq->cq->channel, &ev_cq, &ev_ctx);
- if (ret || ev_cq != rxq->cq)
- ret = EINVAL;
- }
- if (ret)
- WARN("unable to disable interrupt on rx queue %d",
- idx);
- else
- ibv_ack_cq_events(rxq->cq, 1);
- return -ret;
-}
-
-/**
* Verify and store value for device argument.
*
* @param[in] key
* Key argument to verify.
* @param[in] val
* Value associated with key.
- * @param out
- * User data.
+ * @param[in, out] conf
+ * Shared configuration data.
*
* @return
- * 0 on success, negative errno value on failure.
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
static int
-mlx4_arg_parse(const char *key, const char *val, void *out)
+mlx4_arg_parse(const char *key, const char *val, struct mlx4_conf *conf)
{
- struct mlx4_conf *conf = out;
unsigned long tmp;
errno = 0;
tmp = strtoul(val, NULL, 0);
if (errno) {
+ rte_errno = errno;
WARN("%s: \"%s\" is not a valid integer", key, val);
- return -errno;
+ return -rte_errno;
}
if (strcmp(MLX4_PMD_PORT_KVARG, key) == 0) {
- if (tmp >= MLX4_PMD_MAX_PHYS_PORTS) {
- ERROR("invalid port index %lu (max: %u)",
- tmp, MLX4_PMD_MAX_PHYS_PORTS - 1);
+ uint32_t ports = rte_log2_u32(conf->ports.present);
+
+ if (tmp >= ports) {
+ ERROR("port index %lu outside range [0,%" PRIu32 ")",
+ tmp, ports);
return -EINVAL;
}
- conf->active_ports |= 1 << tmp;
+ if (!(conf->ports.present & (1 << tmp))) {
+ rte_errno = EINVAL;
+ ERROR("invalid port index %lu", tmp);
+ return -rte_errno;
+ }
+ conf->ports.enabled |= 1 << tmp;
} else {
+ rte_errno = EINVAL;
WARN("%s: unknown parameter", key);
- return -EINVAL;
+ return -rte_errno;
}
return 0;
}
@@ -6000,7 +364,7 @@ mlx4_arg_parse(const char *key, const char *val, void *out)
* Device arguments structure.
*
* @return
- * 0 on success, negative errno value on failure.
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
static int
mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf)
@@ -6014,15 +378,21 @@ mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf)
return 0;
kvlist = rte_kvargs_parse(devargs->args, pmd_mlx4_init_params);
if (kvlist == NULL) {
+ rte_errno = EINVAL;
ERROR("failed to parse kvargs");
- return -EINVAL;
+ return -rte_errno;
}
/* Process parameters. */
for (i = 0; pmd_mlx4_init_params[i]; ++i) {
arg_count = rte_kvargs_count(kvlist, MLX4_PMD_PORT_KVARG);
while (arg_count-- > 0) {
- ret = rte_kvargs_process(kvlist, MLX4_PMD_PORT_KVARG,
- mlx4_arg_parse, conf);
+ ret = rte_kvargs_process(kvlist,
+ MLX4_PMD_PORT_KVARG,
+ (int (*)(const char *,
+ const char *,
+ void *))
+ mlx4_arg_parse,
+ conf);
if (ret != 0)
goto free_kvlist;
}
@@ -6046,7 +416,7 @@ static struct rte_pci_driver mlx4_driver;
* PCI device information.
*
* @return
- * 0 on success, negative errno value on failure.
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
static int
mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
@@ -6057,30 +427,20 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
struct ibv_context *attr_ctx = NULL;
struct ibv_device_attr device_attr;
struct mlx4_conf conf = {
- .active_ports = 0,
+ .ports.present = 0,
};
unsigned int vf;
- int idx;
int i;
(void)pci_drv;
assert(pci_drv == &mlx4_driver);
- /* Get mlx4_dev[] index. */
- idx = mlx4_dev_idx(&pci_dev->addr);
- if (idx == -1) {
- ERROR("this driver cannot support any more adapters");
- return -ENOMEM;
- }
- DEBUG("using driver device index %d", idx);
-
- /* Save PCI address. */
- mlx4_dev[idx].pci_addr = pci_dev->addr;
list = ibv_get_device_list(&i);
if (list == NULL) {
- assert(errno);
- if (errno == ENOSYS)
+ rte_errno = errno;
+ assert(rte_errno);
+ if (rte_errno == ENOSYS)
ERROR("cannot list devices, is ib_uverbs loaded?");
- return -errno;
+ return -rte_errno;
}
assert(i >= 0);
/*
@@ -6111,190 +471,112 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
ibv_free_device_list(list);
switch (err) {
case 0:
+ rte_errno = ENODEV;
ERROR("cannot access device, is mlx4_ib loaded?");
- return -ENODEV;
+ return -rte_errno;
case EINVAL:
+ rte_errno = EINVAL;
ERROR("cannot use device, are drivers up to date?");
- return -EINVAL;
+ return -rte_errno;
}
assert(err > 0);
- return -err;
+ rte_errno = err;
+ return -rte_errno;
}
ibv_dev = list[i];
-
DEBUG("device opened");
if (ibv_query_device(attr_ctx, &device_attr)) {
- err = ENODEV;
+ rte_errno = ENODEV;
goto error;
}
INFO("%u port(s) detected", device_attr.phys_port_cnt);
-
+ conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1;
if (mlx4_args(pci_dev->device.devargs, &conf)) {
ERROR("failed to process device arguments");
- err = EINVAL;
+ rte_errno = EINVAL;
goto error;
}
/* Use all ports when none are defined */
- if (conf.active_ports == 0) {
- for (i = 0; i < MLX4_PMD_MAX_PHYS_PORTS; i++)
- conf.active_ports |= 1 << i;
- }
+ if (!conf.ports.enabled)
+ conf.ports.enabled = conf.ports.present;
for (i = 0; i < device_attr.phys_port_cnt; i++) {
uint32_t port = i + 1; /* ports are indexed from one */
- uint32_t test = (1 << i);
struct ibv_context *ctx = NULL;
struct ibv_port_attr port_attr;
struct ibv_pd *pd = NULL;
struct priv *priv = NULL;
struct rte_eth_dev *eth_dev = NULL;
-#ifdef HAVE_EXP_QUERY_DEVICE
- struct ibv_exp_device_attr exp_device_attr;
-#endif /* HAVE_EXP_QUERY_DEVICE */
struct ether_addr mac;
- /* If port is not active, skip. */
- if (!(conf.active_ports & (1 << i)))
+ /* If port is not enabled, skip. */
+ if (!(conf.ports.enabled & (1 << i)))
continue;
-#ifdef HAVE_EXP_QUERY_DEVICE
- exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS;
-#ifdef RSS_SUPPORT
- exp_device_attr.comp_mask |= IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ;
-#endif /* RSS_SUPPORT */
-#endif /* HAVE_EXP_QUERY_DEVICE */
-
- DEBUG("using port %u (%08" PRIx32 ")", port, test);
-
+ DEBUG("using port %u", port);
ctx = ibv_open_device(ibv_dev);
if (ctx == NULL) {
- err = ENODEV;
+ rte_errno = ENODEV;
goto port_error;
}
-
/* Check port status. */
err = ibv_query_port(ctx, port, &port_attr);
if (err) {
- ERROR("port query failed: %s", strerror(err));
- err = ENODEV;
+ rte_errno = err;
+ ERROR("port query failed: %s", strerror(rte_errno));
goto port_error;
}
-
if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+ rte_errno = ENOTSUP;
ERROR("port %d is not configured in Ethernet mode",
port);
- err = EINVAL;
goto port_error;
}
-
if (port_attr.state != IBV_PORT_ACTIVE)
DEBUG("port %d is not active: \"%s\" (%d)",
port, ibv_port_state_str(port_attr.state),
port_attr.state);
-
+ /* Make asynchronous FD non-blocking to handle interrupts. */
+ if (mlx4_fd_set_non_blocking(ctx->async_fd) < 0) {
+ ERROR("cannot make asynchronous FD non-blocking: %s",
+ strerror(rte_errno));
+ goto port_error;
+ }
/* Allocate protection domain. */
pd = ibv_alloc_pd(ctx);
if (pd == NULL) {
+ rte_errno = ENOMEM;
ERROR("PD allocation failure");
- err = ENOMEM;
goto port_error;
}
-
- mlx4_dev[idx].ports |= test;
-
/* from rte_ethdev.c */
priv = rte_zmalloc("ethdev private structure",
sizeof(*priv),
RTE_CACHE_LINE_SIZE);
if (priv == NULL) {
+ rte_errno = ENOMEM;
ERROR("priv allocation failure");
- err = ENOMEM;
goto port_error;
}
-
priv->ctx = ctx;
priv->device_attr = device_attr;
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
-#ifdef HAVE_EXP_QUERY_DEVICE
- if (ibv_exp_query_device(ctx, &exp_device_attr)) {
- ERROR("ibv_exp_query_device() failed");
- err = ENODEV;
- goto port_error;
- }
-#ifdef RSS_SUPPORT
- if ((exp_device_attr.exp_device_cap_flags &
- IBV_EXP_DEVICE_QPG) &&
- (exp_device_attr.exp_device_cap_flags &
- IBV_EXP_DEVICE_UD_RSS) &&
- (exp_device_attr.comp_mask &
- IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ) &&
- (exp_device_attr.max_rss_tbl_sz > 0)) {
- priv->hw_qpg = 1;
- priv->hw_rss = 1;
- priv->max_rss_tbl_sz = exp_device_attr.max_rss_tbl_sz;
- } else {
- priv->hw_qpg = 0;
- priv->hw_rss = 0;
- priv->max_rss_tbl_sz = 0;
- }
- priv->hw_tss = !!(exp_device_attr.exp_device_cap_flags &
- IBV_EXP_DEVICE_UD_TSS);
- DEBUG("device flags: %s%s%s",
- (priv->hw_qpg ? "IBV_DEVICE_QPG " : ""),
- (priv->hw_tss ? "IBV_DEVICE_TSS " : ""),
- (priv->hw_rss ? "IBV_DEVICE_RSS " : ""));
- if (priv->hw_rss)
- DEBUG("maximum RSS indirection table size: %u",
- exp_device_attr.max_rss_tbl_sz);
-#endif /* RSS_SUPPORT */
-
- priv->hw_csum =
- ((exp_device_attr.exp_device_cap_flags &
- IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT) &&
- (exp_device_attr.exp_device_cap_flags &
- IBV_EXP_DEVICE_RX_CSUM_IP_PKT));
+ priv->vf = vf;
+ priv->hw_csum = !!(device_attr.device_cap_flags &
+ IBV_DEVICE_RAW_IP_CSUM);
DEBUG("checksum offloading is %ssupported",
(priv->hw_csum ? "" : "not "));
-
- priv->hw_csum_l2tun = !!(exp_device_attr.exp_device_cap_flags &
- IBV_EXP_DEVICE_VXLAN_SUPPORT);
+ /* Only ConnectX-3 Pro supports tunneling. */
+ priv->hw_csum_l2tun =
+ priv->hw_csum &&
+ (device_attr.vendor_part_id ==
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO);
DEBUG("L2 tunnel checksum offloads are %ssupported",
(priv->hw_csum_l2tun ? "" : "not "));
-
-#ifdef INLINE_RECV
- priv->inl_recv_size = mlx4_getenv_int("MLX4_INLINE_RECV_SIZE");
-
- if (priv->inl_recv_size) {
- exp_device_attr.comp_mask =
- IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ;
- if (ibv_exp_query_device(ctx, &exp_device_attr)) {
- INFO("Couldn't query device for inline-receive"
- " capabilities.");
- priv->inl_recv_size = 0;
- } else {
- if ((unsigned)exp_device_attr.inline_recv_sz <
- priv->inl_recv_size) {
- INFO("Max inline-receive (%d) <"
- " requested inline-receive (%u)",
- exp_device_attr.inline_recv_sz,
- priv->inl_recv_size);
- priv->inl_recv_size =
- exp_device_attr.inline_recv_sz;
- }
- }
- INFO("Set inline receive size to %u",
- priv->inl_recv_size);
- }
-#endif /* INLINE_RECV */
-#endif /* HAVE_EXP_QUERY_DEVICE */
-
- (void)mlx4_getenv_int;
- priv->vf = vf;
/* Configure the first MAC address by default. */
- if (priv_get_mac(priv, &mac.addr_bytes)) {
+ if (mlx4_get_mac(priv, &mac.addr_bytes)) {
ERROR("cannot get MAC address, is mlx4_en loaded?"
- " (errno: %s)", strerror(errno));
- err = ENODEV;
+ " (rte_errno: %s)", strerror(rte_errno));
goto port_error;
}
INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
@@ -6302,18 +584,13 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
mac.addr_bytes[0], mac.addr_bytes[1],
mac.addr_bytes[2], mac.addr_bytes[3],
mac.addr_bytes[4], mac.addr_bytes[5]);
- /* Register MAC and broadcast addresses. */
- claim_zero(priv_mac_addr_add(priv, 0,
- (const uint8_t (*)[ETHER_ADDR_LEN])
- mac.addr_bytes));
- claim_zero(priv_mac_addr_add(priv, (elemof(priv->mac) - 1),
- &(const uint8_t [ETHER_ADDR_LEN])
- { "\xff\xff\xff\xff\xff\xff" }));
+ /* Register MAC address. */
+ priv->mac[0] = mac;
#ifndef NDEBUG
{
char ifname[IF_NAMESIZE];
- if (priv_get_ifname(priv, &ifname) == 0)
+ if (mlx4_get_ifname(priv, &ifname) == 0)
DEBUG("port %u ifname is \"%s\"",
priv->port, ifname);
else
@@ -6321,9 +598,8 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
}
#endif
/* Get actual MTU if possible. */
- priv_get_mtu(priv, &priv->mtu);
+ mlx4_mtu_get(priv, &priv->mtu);
DEBUG("port %u MTU is %u", priv->port, priv->mtu);
-
/* from rte_ethdev.c */
{
char name[RTE_ETH_NAME_MAX_LEN];
@@ -6334,67 +610,41 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
}
if (eth_dev == NULL) {
ERROR("can not allocate rte ethdev");
- err = ENOMEM;
+ rte_errno = ENOMEM;
goto port_error;
}
-
- /* Secondary processes have to use local storage for their
- * private data as well as a copy of eth_dev->data, but this
- * pointer must not be modified before burst functions are
- * actually called. */
- if (mlx4_is_secondary()) {
- struct mlx4_secondary_data *sd =
- &mlx4_secondary_data[eth_dev->data->port_id];
-
- sd->primary_priv = eth_dev->data->dev_private;
- if (sd->primary_priv == NULL) {
- ERROR("no private data for port %u",
- eth_dev->data->port_id);
- err = EINVAL;
- goto port_error;
- }
- sd->shared_dev_data = eth_dev->data;
- rte_spinlock_init(&sd->lock);
- memcpy(sd->data.name, sd->shared_dev_data->name,
- sizeof(sd->data.name));
- sd->data.dev_private = priv;
- sd->data.rx_mbuf_alloc_failed = 0;
- sd->data.mtu = ETHER_MTU;
- sd->data.port_id = sd->shared_dev_data->port_id;
- sd->data.mac_addrs = priv->mac;
- eth_dev->tx_pkt_burst = mlx4_tx_burst_secondary_setup;
- eth_dev->rx_pkt_burst = mlx4_rx_burst_secondary_setup;
- } else {
- eth_dev->data->dev_private = priv;
- eth_dev->data->mac_addrs = priv->mac;
- }
+ eth_dev->data->dev_private = priv;
+ eth_dev->data->mac_addrs = priv->mac;
eth_dev->device = &pci_dev->device;
-
rte_eth_copy_pci_info(eth_dev, pci_dev);
-
eth_dev->device->driver = &mlx4_driver.driver;
-
+ /* Initialize local interrupt handle for current port. */
+ priv->intr_handle = (struct rte_intr_handle){
+ .fd = -1,
+ .type = RTE_INTR_HANDLE_EXT,
+ };
/*
- * Copy and override interrupt handle to prevent it from
- * being shared between all ethdev instances of a given PCI
- * device. This is required to properly handle Rx interrupts
- * on all ports.
+ * Override ethdev interrupt handle pointer with private
+ * handle instead of that of the parent PCI device used by
+ * default. This prevents it from being shared between all
+ * ports of the same PCI device since each of them is
+ * associated its own Verbs context.
+ *
+ * Rx interrupts in particular require this as the PMD has
+ * no control over the registration of queue interrupts
+ * besides setting up eth_dev->intr_handle, the rest is
+ * handled by rte_intr_rx_ctl().
*/
- priv->intr_handle_dev = *eth_dev->intr_handle;
- eth_dev->intr_handle = &priv->intr_handle_dev;
-
+ eth_dev->intr_handle = &priv->intr_handle;
priv->dev = eth_dev;
eth_dev->dev_ops = &mlx4_dev_ops;
- eth_dev->data->dev_flags |= RTE_ETH_DEV_DETACHABLE;
-
/* Bring Ethernet device up. */
DEBUG("forcing Ethernet interface up");
- priv_set_flags(priv, ~IFF_UP, IFF_UP);
+ mlx4_dev_set_link_up(priv->dev);
/* Update link status once if waiting for LSC. */
if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
mlx4_link_update(eth_dev, 0);
continue;
-
port_error:
rte_free(priv);
if (pd)
@@ -6405,27 +655,21 @@ port_error:
rte_eth_dev_release_port(eth_dev);
break;
}
-
+ if (i == device_attr.phys_port_cnt)
+ return 0;
/*
* XXX if something went wrong in the loop above, there is a resource
* leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
* long as the dpdk does not provide a way to deallocate a ethdev and a
* way to enumerate the registered ethdevs to free the previous ones.
*/
-
- /* no port found, complain */
- if (!mlx4_dev[idx].ports) {
- err = ENODEV;
- goto error;
- }
-
error:
if (attr_ctx)
claim_zero(ibv_close_device(attr_ctx));
if (list)
ibv_free_device_list(list);
- assert(err >= 0);
- return -err;
+ assert(rte_errno >= 0);
+ return -rte_errno;
}
static const struct rte_pci_id mlx4_pci_id_map[] = {
@@ -6463,7 +707,6 @@ RTE_INIT(rte_mlx4_pmd_init);
static void
rte_mlx4_pmd_init(void)
{
- RTE_BUILD_BUG_ON(sizeof(wr_id_t) != sizeof(uint64_t));
/*
* RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
* huge pages. Calling ibv_fork_init() during init allows
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index c0ade4f1..3aeef87e 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -1,8 +1,8 @@
/*-
* BSD LICENSE
*
- * Copyright 2012-2017 6WIND S.A.
- * Copyright 2012-2017 Mellanox.
+ * Copyright 2012 6WIND S.A.
+ * Copyright 2012 Mellanox
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -34,29 +34,11 @@
#ifndef RTE_PMD_MLX4_H_
#define RTE_PMD_MLX4_H_
-#include <stddef.h>
+#include <net/if.h>
#include <stdint.h>
-#include <limits.h>
+#include <sys/queue.h>
-/*
- * Runtime logging through RTE_LOG() is enabled when not in debugging mode.
- * Intermediate LOG_*() macros add the required end-of-line characters.
- */
-#ifndef NDEBUG
-#define INFO(...) DEBUG(__VA_ARGS__)
-#define WARN(...) DEBUG(__VA_ARGS__)
-#define ERROR(...) DEBUG(__VA_ARGS__)
-#else
-#define LOG__(level, m, ...) \
- RTE_LOG(level, PMD, MLX4_DRIVER_NAME ": " m "%c", __VA_ARGS__)
-#define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n')
-#define INFO(...) LOG_(INFO, __VA_ARGS__)
-#define WARN(...) LOG_(WARNING, __VA_ARGS__)
-#define ERROR(...) LOG_(ERR, __VA_ARGS__)
-#endif
-
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
@@ -65,36 +47,25 @@
#pragma GCC diagnostic error "-Wpedantic"
#endif
-/*
- * Maximum number of simultaneous MAC addresses supported.
- *
- * According to ConnectX's Programmer Reference Manual:
- * The L2 Address Match is implemented by comparing a MAC/VLAN combination
- * of 128 MAC addresses and 127 VLAN values, comprising 128x127 possible
- * L2 addresses.
- */
-#define MLX4_MAX_MAC_ADDRESSES 128
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_interrupts.h>
+#include <rte_mempool.h>
+#include <rte_spinlock.h>
-/* Maximum number of simultaneous VLAN filters supported. See above. */
-#define MLX4_MAX_VLAN_IDS 127
+/** Maximum number of simultaneous MAC addresses. This value is arbitrary. */
+#define MLX4_MAX_MAC_ADDRESSES 128
-/* Request send completion once in every 64 sends, might be less. */
+/** Request send completion once in every 64 sends, might be less. */
#define MLX4_PMD_TX_PER_COMP_REQ 64
-/* Maximum number of physical ports. */
-#define MLX4_PMD_MAX_PHYS_PORTS 2
-
-/* Maximum number of Scatter/Gather Elements per Work Request. */
-#ifndef MLX4_PMD_SGE_WR_N
-#define MLX4_PMD_SGE_WR_N 4
-#endif
-
-/* Maximum size for inline data. */
-#ifndef MLX4_PMD_MAX_INLINE
+/** Maximum size for inline data. */
#define MLX4_PMD_MAX_INLINE 0
-#endif
-/*
+/** Fixed RSS hash key size in bytes. Cannot be modified. */
+#define MLX4_RSS_HASH_KEY_SIZE 40
+
+/**
* Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
* from which buffers are to be transmitted will have to be mapped by this
* driver to their own Memory Region (MR). This is a slow operation.
@@ -105,18 +76,10 @@
#define MLX4_PMD_TX_MP_CACHE 8
#endif
-/*
- * If defined, only use software counters. The PMD will never ask the hardware
- * for these, and many of them won't be available.
- */
-#ifndef MLX4_PMD_SOFT_COUNTERS
-#define MLX4_PMD_SOFT_COUNTERS 1
-#endif
-
-/* Alarm timeout. */
-#define MLX4_ALARM_TIMEOUT_US 100000
+/** Interrupt alarm timeout value in microseconds. */
+#define MLX4_INTR_ALARM_TIMEOUT 100000
-/* Port parameter. */
+/** Port parameter. */
#define MLX4_PMD_PORT_KVARG "port"
enum {
@@ -129,258 +92,92 @@ enum {
PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
};
+/** Driver name reported to lower layers and used in log output. */
#define MLX4_DRIVER_NAME "net_mlx4"
-/* Bit-field manipulation. */
-#define BITFIELD_DECLARE(bf, type, size) \
- type bf[(((size_t)(size) / (sizeof(type) * CHAR_BIT)) + \
- !!((size_t)(size) % (sizeof(type) * CHAR_BIT)))]
-#define BITFIELD_DEFINE(bf, type, size) \
- BITFIELD_DECLARE((bf), type, (size)) = { 0 }
-#define BITFIELD_SET(bf, b) \
- (assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)), \
- (void)((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] |= \
- ((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT)))))
-#define BITFIELD_RESET(bf, b) \
- (assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)), \
- (void)((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] &= \
- ~((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT)))))
-#define BITFIELD_ISSET(bf, b) \
- (assert((size_t)(b) < (sizeof(bf) * CHAR_BIT)), \
- !!(((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] & \
- ((size_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT))))))
-
-/* Number of elements in array. */
-#define elemof(a) (sizeof(a) / sizeof((a)[0]))
-
-/* Cast pointer p to structure member m to its parent structure of type t. */
-#define containerof(p, t, m) ((t *)((uint8_t *)(p) - offsetof(t, m)))
-
-/* Branch prediction helpers. */
-#ifndef likely
-#define likely(c) __builtin_expect(!!(c), 1)
-#endif
-#ifndef unlikely
-#define unlikely(c) __builtin_expect(!!(c), 0)
-#endif
-
-/* Debugging */
-#ifndef NDEBUG
-#include <stdio.h>
-#define DEBUG__(m, ...) \
- (fprintf(stderr, "%s:%d: %s(): " m "%c", \
- __FILE__, __LINE__, __func__, __VA_ARGS__), \
- fflush(stderr), \
- (void)0)
-/*
- * Save/restore errno around DEBUG__().
- * XXX somewhat undefined behavior, but works.
- */
-#define DEBUG_(...) \
- (errno = ((int []){ \
- *(volatile int *)&errno, \
- (DEBUG__(__VA_ARGS__), 0) \
- })[0])
-#define DEBUG(...) DEBUG_(__VA_ARGS__, '\n')
-#ifndef MLX4_PMD_DEBUG_BROKEN_VERBS
-#define claim_zero(...) assert((__VA_ARGS__) == 0)
-#else /* MLX4_PMD_DEBUG_BROKEN_VERBS */
-#define claim_zero(...) \
- (void)(((__VA_ARGS__) == 0) || \
- DEBUG("Assertion `(" # __VA_ARGS__ ") == 0' failed (IGNORED)."))
-#endif /* MLX4_PMD_DEBUG_BROKEN_VERBS */
-#define claim_nonzero(...) assert((__VA_ARGS__) != 0)
-#define claim_positive(...) assert((__VA_ARGS__) >= 0)
-#else /* NDEBUG */
-/* No-ops. */
-#define DEBUG(...) (void)0
-#define claim_zero(...) (__VA_ARGS__)
-#define claim_nonzero(...) (__VA_ARGS__)
-#define claim_positive(...) (__VA_ARGS__)
-#endif /* NDEBUG */
-
-struct mlx4_rxq_stats {
- unsigned int idx; /**< Mapping index. */
-#ifdef MLX4_PMD_SOFT_COUNTERS
- uint64_t ipackets; /**< Total of successfully received packets. */
- uint64_t ibytes; /**< Total of successfully received bytes. */
-#endif
- uint64_t idropped; /**< Total of packets dropped when RX ring full. */
- uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */
-};
-
-/* RX element (scattered packets). */
-struct rxq_elt_sp {
- struct ibv_recv_wr wr; /* Work Request. */
- struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
- struct rte_mbuf *bufs[MLX4_PMD_SGE_WR_N]; /* SGEs buffers. */
-};
-
-/* RX element. */
-struct rxq_elt {
- struct ibv_recv_wr wr; /* Work Request. */
- struct ibv_sge sge; /* Scatter/Gather Element. */
- /* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */
-};
-
-/* RX queue descriptor. */
-struct rxq {
- LIST_ENTRY(rxq) next; /* Used by parent queue only */
- struct priv *priv; /* Back pointer to private data. */
- struct rte_mempool *mp; /* Memory Pool for allocations. */
- struct ibv_mr *mr; /* Memory Region (for mp). */
- struct ibv_cq *cq; /* Completion Queue. */
- struct ibv_qp *qp; /* Queue Pair. */
- struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
- struct ibv_exp_cq_family *if_cq; /* CQ interface. */
- struct ibv_comp_channel *channel;
- /*
- * Each VLAN ID requires a separate flow steering rule.
- */
- BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
- struct ibv_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES][MLX4_MAX_VLAN_IDS];
- struct ibv_flow *promisc_flow; /* Promiscuous flow. */
- struct ibv_flow *allmulti_flow; /* Multicast flow. */
- unsigned int port_id; /* Port ID for incoming packets. */
- unsigned int elts_n; /* (*elts)[] length. */
- unsigned int elts_head; /* Current index in (*elts)[]. */
- union {
- struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
- struct rxq_elt (*no_sp)[]; /* RX elements. */
- } elts;
- unsigned int sp:1; /* Use scattered RX elements. */
- unsigned int csum:1; /* Enable checksum offloading. */
- unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
- struct mlx4_rxq_stats stats; /* RX queue counters. */
- unsigned int socket; /* CPU socket ID for allocations. */
- struct ibv_exp_res_domain *rd; /* Resource Domain. */
- struct {
- uint16_t queues_n;
- uint16_t queues[RTE_MAX_QUEUES_PER_PORT];
- } rss;
-};
-
-/* TX element. */
-struct txq_elt {
- struct rte_mbuf *buf;
-};
-
-struct mlx4_txq_stats {
- unsigned int idx; /**< Mapping index. */
-#ifdef MLX4_PMD_SOFT_COUNTERS
- uint64_t opackets; /**< Total of successfully sent packets. */
- uint64_t obytes; /**< Total of successfully sent bytes. */
-#endif
- uint64_t odropped; /**< Total of packets not sent when TX ring full. */
-};
-
-/*
- * Linear buffer type. It is used when transmitting buffers with too many
- * segments that do not fit the hardware queue (see max_send_sge).
- * Extra segments are copied (linearized) in such buffers, replacing the
- * last SGE during TX.
- * The size is arbitrary but large enough to hold a jumbo frame with
- * 8 segments considering mbuf.buf_len is about 2048 bytes.
- */
-typedef uint8_t linear_t[16384];
+struct mlx4_drop;
+struct mlx4_rss;
+struct rxq;
+struct txq;
+struct rte_flow;
-/* TX queue descriptor. */
-struct txq {
- struct priv *priv; /* Back pointer to private data. */
- struct {
- const struct rte_mempool *mp; /* Cached Memory Pool. */
- struct ibv_mr *mr; /* Memory Region (for mp). */
- uint32_t lkey; /* mr->lkey */
- } mp2mr[MLX4_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
- struct ibv_cq *cq; /* Completion Queue. */
- struct ibv_qp *qp; /* Queue Pair. */
- struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
- struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-#if MLX4_PMD_MAX_INLINE > 0
- uint32_t max_inline; /* Max inline send size <= MLX4_PMD_MAX_INLINE. */
-#endif
- unsigned int elts_n; /* (*elts)[] length. */
- struct txq_elt (*elts)[]; /* TX elements. */
- unsigned int elts_head; /* Current index in (*elts)[]. */
- unsigned int elts_tail; /* First element awaiting completion. */
- unsigned int elts_comp; /* Number of completion requests. */
- unsigned int elts_comp_cd; /* Countdown for next completion request. */
- unsigned int elts_comp_cd_init; /* Initial value for countdown. */
- struct mlx4_txq_stats stats; /* TX queue counters. */
- linear_t (*elts_linear)[]; /* Linearized buffers. */
- struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
- unsigned int socket; /* CPU socket ID for allocations. */
- struct ibv_exp_res_domain *rd; /* Resource Domain. */
+/** Memory region descriptor. */
+struct mlx4_mr {
+ LIST_ENTRY(mlx4_mr) next; /**< Next entry in list. */
+ uintptr_t start; /**< Base address for memory region. */
+ uintptr_t end; /**< End address for memory region. */
+ uint32_t lkey; /**< L_Key extracted from @p mr. */
+ uint32_t refcnt; /**< Reference count for this object. */
+ struct priv *priv; /**< Back pointer to private data. */
+ struct ibv_mr *mr; /**< Memory region associated with @p mp. */
+ struct rte_mempool *mp; /**< Target memory pool (mempool). */
};
-struct rte_flow;
-
+/** Private data structure. */
struct priv {
- struct rte_eth_dev *dev; /* Ethernet device. */
- struct ibv_context *ctx; /* Verbs context. */
- struct ibv_device_attr device_attr; /* Device properties. */
- struct ibv_pd *pd; /* Protection Domain. */
- /*
- * MAC addresses array and configuration bit-field.
- * An extra entry that cannot be modified by the DPDK is reserved
- * for broadcast frames (destination MAC address ff:ff:ff:ff:ff:ff).
- */
- struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
- BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
- /* VLAN filters. */
- struct {
- unsigned int enabled:1; /* If enabled. */
- unsigned int id:12; /* VLAN ID (0-4095). */
- } vlan_filter[MLX4_MAX_VLAN_IDS]; /* VLAN filters table. */
+ struct rte_eth_dev *dev; /**< Ethernet device. */
+ struct ibv_context *ctx; /**< Verbs context. */
+ struct ibv_device_attr device_attr; /**< Device properties. */
+ struct ibv_pd *pd; /**< Protection Domain. */
/* Device properties. */
- uint16_t mtu; /* Configured MTU. */
- uint8_t port; /* Physical port number. */
- unsigned int started:1; /* Device started, flows enabled. */
- unsigned int promisc:1; /* Device in promiscuous mode. */
- unsigned int allmulti:1; /* Device receives all multicast packets. */
- unsigned int hw_qpg:1; /* QP groups are supported. */
- unsigned int hw_tss:1; /* TSS is supported. */
- unsigned int hw_rss:1; /* RSS is supported. */
- unsigned int hw_csum:1; /* Checksum offload is supported. */
- unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */
- unsigned int rss:1; /* RSS is enabled. */
- unsigned int vf:1; /* This is a VF device. */
- unsigned int pending_alarm:1; /* An alarm is pending. */
- unsigned int isolated:1; /* Toggle isolated mode. */
-#ifdef INLINE_RECV
- unsigned int inl_recv_size; /* Inline recv size */
-#endif
- unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */
- /* RX/TX queues. */
- unsigned int rxqs_n; /* RX queues array size. */
- unsigned int txqs_n; /* TX queues array size. */
- struct rxq *(*rxqs)[]; /* RX queues. */
- struct txq *(*txqs)[]; /* TX queues. */
- struct rte_intr_handle intr_handle_dev; /* Device interrupt handler. */
- struct rte_intr_handle intr_handle; /* Interrupt handler. */
- struct rte_flow_drop *flow_drop_queue; /* Flow drop queue. */
- LIST_HEAD(mlx4_flows, rte_flow) flows;
- struct rte_intr_conf intr_conf; /* Active interrupt configuration. */
- LIST_HEAD(mlx4_parents, rxq) parents;
- rte_spinlock_t lock; /* Lock for control functions. */
+ uint16_t mtu; /**< Configured MTU. */
+ uint8_t port; /**< Physical port number. */
+ uint32_t started:1; /**< Device started, flows enabled. */
+ uint32_t vf:1; /**< This is a VF device. */
+ uint32_t intr_alarm:1; /**< An interrupt alarm is scheduled. */
+ uint32_t isolated:1; /**< Toggle isolated mode. */
+ uint32_t hw_csum:1; /* Checksum offload is supported. */
+ uint32_t hw_csum_l2tun:1; /* Checksum support for L2 tunnels. */
+ struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
+ struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
+ LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */
+ LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */
+ LIST_HEAD(, mlx4_mr) mr; /**< Registered memory regions. */
+ rte_spinlock_t mr_lock; /**< Lock for @p mr access. */
+ struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
+ /**< Configured MAC addresses. Unused entries are zeroed. */
};
-void priv_lock(struct priv *priv);
-void priv_unlock(struct priv *priv);
-
-int
-rxq_create_qp(struct rxq *rxq,
- uint16_t desc,
- int inactive,
- int children_n,
- struct rxq *rxq_parent);
-
-void
-rxq_parent_cleanup(struct rxq *parent);
-
-struct rxq *
-priv_parent_create(struct priv *priv,
- uint16_t queues[],
- uint16_t children_n);
+/* mlx4_ethdev.c */
+
+int mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE]);
+int mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN]);
+int mlx4_mtu_get(struct priv *priv, uint16_t *mtu);
+int mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu);
+int mlx4_dev_set_link_down(struct rte_eth_dev *dev);
+int mlx4_dev_set_link_up(struct rte_eth_dev *dev);
+void mlx4_promiscuous_enable(struct rte_eth_dev *dev);
+void mlx4_promiscuous_disable(struct rte_eth_dev *dev);
+void mlx4_allmulticast_enable(struct rte_eth_dev *dev);
+void mlx4_allmulticast_disable(struct rte_eth_dev *dev);
+void mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
+int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
+ uint32_t index, uint32_t vmdq);
+void mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr);
+int mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on);
+int mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
+void mlx4_stats_reset(struct rte_eth_dev *dev);
+void mlx4_dev_infos_get(struct rte_eth_dev *dev,
+ struct rte_eth_dev_info *info);
+int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
+int mlx4_flow_ctrl_get(struct rte_eth_dev *dev,
+ struct rte_eth_fc_conf *fc_conf);
+int mlx4_flow_ctrl_set(struct rte_eth_dev *dev,
+ struct rte_eth_fc_conf *fc_conf);
+const uint32_t *mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev);
+
+/* mlx4_intr.c */
+
+int mlx4_intr_uninstall(struct priv *priv);
+int mlx4_intr_install(struct priv *priv);
+int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
+int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
+
+/* mlx4_mr.c */
+
+struct mlx4_mr *mlx4_mr_get(struct priv *priv, struct rte_mempool *mp);
+void mlx4_mr_put(struct mlx4_mr *mr);
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+ uint32_t i);
#endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c
new file mode 100644
index 00000000..c2ea4db1
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_ethdev.c
@@ -0,0 +1,1047 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Miscellaneous control operations for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_bus_pci.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_flow.h>
+#include <rte_pci.h>
+
+#include "mlx4.h"
+#include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Get interface name from private structure.
+ *
+ * @param[in] priv
+ * Pointer to private structure.
+ * @param[out] ifname
+ * Interface name output buffer.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
+{
+ DIR *dir;
+ struct dirent *dent;
+ unsigned int dev_type = 0;
+ unsigned int dev_port_prev = ~0u;
+ char match[IF_NAMESIZE] = "";
+
+ {
+ MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);
+
+ dir = opendir(path);
+ if (dir == NULL) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ }
+ while ((dent = readdir(dir)) != NULL) {
+ char *name = dent->d_name;
+ FILE *file;
+ unsigned int dev_port;
+ int r;
+
+ if ((name[0] == '.') &&
+ ((name[1] == '\0') ||
+ ((name[1] == '.') && (name[2] == '\0'))))
+ continue;
+
+ MKSTR(path, "%s/device/net/%s/%s",
+ priv->ctx->device->ibdev_path, name,
+ (dev_type ? "dev_id" : "dev_port"));
+
+ file = fopen(path, "rb");
+ if (file == NULL) {
+ if (errno != ENOENT)
+ continue;
+ /*
+ * Switch to dev_id when dev_port does not exist as
+ * is the case with Linux kernel versions < 3.15.
+ */
+try_dev_id:
+ match[0] = '\0';
+ if (dev_type)
+ break;
+ dev_type = 1;
+ dev_port_prev = ~0u;
+ rewinddir(dir);
+ continue;
+ }
+ r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
+ fclose(file);
+ if (r != 1)
+ continue;
+ /*
+ * Switch to dev_id when dev_port returns the same value for
+ * all ports. May happen when using a MOFED release older than
+ * 3.0 with a Linux kernel >= 3.15.
+ */
+ if (dev_port == dev_port_prev)
+ goto try_dev_id;
+ dev_port_prev = dev_port;
+ if (dev_port == (priv->port - 1u))
+ snprintf(match, sizeof(match), "%s", name);
+ }
+ closedir(dir);
+ if (match[0] == '\0') {
+ rte_errno = ENODEV;
+ return -rte_errno;
+ }
+ strncpy(*ifname, match, sizeof(*ifname));
+ return 0;
+}
+
+/**
+ * Read from sysfs entry.
+ *
+ * @param[in] priv
+ * Pointer to private structure.
+ * @param[in] entry
+ * Entry name relative to sysfs path.
+ * @param[out] buf
+ * Data output buffer.
+ * @param size
+ * Buffer size.
+ *
+ * @return
+ * Number of bytes read on success, negative errno value otherwise and
+ * rte_errno is set.
+ */
+static int
+mlx4_sysfs_read(const struct priv *priv, const char *entry,
+ char *buf, size_t size)
+{
+ char ifname[IF_NAMESIZE];
+ FILE *file;
+ int ret;
+
+ ret = mlx4_get_ifname(priv, &ifname);
+ if (ret)
+ return ret;
+
+ MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
+ ifname, entry);
+
+ file = fopen(path, "rb");
+ if (file == NULL) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ ret = fread(buf, 1, size, file);
+ if ((size_t)ret < size && ferror(file)) {
+ rte_errno = EIO;
+ ret = -rte_errno;
+ } else {
+ ret = size;
+ }
+ fclose(file);
+ return ret;
+}
+
+/**
+ * Write to sysfs entry.
+ *
+ * @param[in] priv
+ * Pointer to private structure.
+ * @param[in] entry
+ * Entry name relative to sysfs path.
+ * @param[in] buf
+ * Data buffer.
+ * @param size
+ * Buffer size.
+ *
+ * @return
+ * Number of bytes written on success, negative errno value otherwise and
+ * rte_errno is set.
+ */
+static int
+mlx4_sysfs_write(const struct priv *priv, const char *entry,
+ char *buf, size_t size)
+{
+ char ifname[IF_NAMESIZE];
+ FILE *file;
+ int ret;
+
+ ret = mlx4_get_ifname(priv, &ifname);
+ if (ret)
+ return ret;
+
+ MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
+ ifname, entry);
+
+ file = fopen(path, "wb");
+ if (file == NULL) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ ret = fwrite(buf, 1, size, file);
+ if ((size_t)ret < size || ferror(file)) {
+ rte_errno = EIO;
+ ret = -rte_errno;
+ } else {
+ ret = size;
+ }
+ fclose(file);
+ return ret;
+}
+
+/**
+ * Get unsigned long sysfs property.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param[in] name
+ * Entry name relative to sysfs path.
+ * @param[out] value
+ * Value output buffer.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
+{
+ int ret;
+ unsigned long value_ret;
+ char value_str[32];
+
+ ret = mlx4_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
+ if (ret < 0) {
+ DEBUG("cannot read %s value from sysfs: %s",
+ name, strerror(rte_errno));
+ return ret;
+ }
+ value_str[ret] = '\0';
+ errno = 0;
+ value_ret = strtoul(value_str, NULL, 0);
+ if (errno) {
+ rte_errno = errno;
+ DEBUG("invalid %s value `%s': %s", name, value_str,
+ strerror(rte_errno));
+ return -rte_errno;
+ }
+ *value = value_ret;
+ return 0;
+}
+
+/**
+ * Set unsigned long sysfs property.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param[in] name
+ * Entry name relative to sysfs path.
+ * @param value
+ * Value to set.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
+{
+ int ret;
+ MKSTR(value_str, "%lu", value);
+
+ ret = mlx4_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
+ if (ret < 0) {
+ DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
+ name, value_str, value, strerror(rte_errno));
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * Perform ifreq ioctl() on associated Ethernet device.
+ *
+ * @param[in] priv
+ * Pointer to private structure.
+ * @param req
+ * Request number to pass to ioctl().
+ * @param[out] ifr
+ * Interface request structure output buffer.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
+{
+ int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+ int ret;
+
+ if (sock == -1) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ ret = mlx4_get_ifname(priv, &ifr->ifr_name);
+ if (!ret && ioctl(sock, req, ifr) == -1) {
+ rte_errno = errno;
+ ret = -rte_errno;
+ }
+ close(sock);
+ return ret;
+}
+
+/**
+ * Get MAC address by querying netdevice.
+ *
+ * @param[in] priv
+ * Pointer to private structure.
+ * @param[out] mac
+ * MAC address output buffer.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
+{
+ struct ifreq request;
+ int ret = mlx4_ifreq(priv, SIOCGIFHWADDR, &request);
+
+ if (ret)
+ return ret;
+ memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+ return 0;
+}
+
+/**
+ * Get device MTU.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param[out] mtu
+ * MTU value output buffer.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mtu_get(struct priv *priv, uint16_t *mtu)
+{
+ unsigned long ulong_mtu = 0;
+ int ret = mlx4_get_sysfs_ulong(priv, "mtu", &ulong_mtu);
+
+ if (ret)
+ return ret;
+ *mtu = ulong_mtu;
+ return 0;
+}
+
+/**
+ * DPDK callback to change the MTU.
+ *
+ * @param priv
+ * Pointer to Ethernet device structure.
+ * @param mtu
+ * MTU value to set.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+ struct priv *priv = dev->data->dev_private;
+ uint16_t new_mtu;
+ int ret = mlx4_set_sysfs_ulong(priv, "mtu", mtu);
+
+ if (ret)
+ return ret;
+ ret = mlx4_mtu_get(priv, &new_mtu);
+ if (ret)
+ return ret;
+ if (new_mtu == mtu) {
+ priv->mtu = mtu;
+ return 0;
+ }
+ rte_errno = EINVAL;
+ return -rte_errno;
+}
+
+/**
+ * Set device flags.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param keep
+ * Bitmask for flags that must remain untouched.
+ * @param flags
+ * Bitmask for flags to modify.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
+{
+ unsigned long tmp = 0;
+ int ret = mlx4_get_sysfs_ulong(priv, "flags", &tmp);
+
+ if (ret)
+ return ret;
+ tmp &= keep;
+ tmp |= (flags & (~keep));
+ return mlx4_set_sysfs_ulong(priv, "flags", tmp);
+}
+
+/**
+ * Change the link state (UP / DOWN).
+ *
+ * @param priv
+ * Pointer to Ethernet device private data.
+ * @param up
+ * Nonzero for link up, otherwise link down.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_dev_set_link(struct priv *priv, int up)
+{
+ int err;
+
+ if (up) {
+ err = mlx4_set_flags(priv, ~IFF_UP, IFF_UP);
+ if (err)
+ return err;
+ } else {
+ err = mlx4_set_flags(priv, ~IFF_UP, ~IFF_UP);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/**
+ * DPDK callback to bring the link DOWN.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_dev_set_link_down(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
+
+ return mlx4_dev_set_link(priv, 0);
+}
+
+/**
+ * DPDK callback to bring the link UP.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_dev_set_link_up(struct rte_eth_dev *dev)
+{
+ struct priv *priv = dev->data->dev_private;
+
+ return mlx4_dev_set_link(priv, 1);
+}
+
+/**
+ * Supported Rx mode toggles.
+ *
+ * Even and odd values respectively stand for off and on.
+ */
+enum rxmode_toggle {
+ RXMODE_TOGGLE_PROMISC_OFF,
+ RXMODE_TOGGLE_PROMISC_ON,
+ RXMODE_TOGGLE_ALLMULTI_OFF,
+ RXMODE_TOGGLE_ALLMULTI_ON,
+};
+
+/**
+ * Helper function to toggle promiscuous and all multicast modes.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param toggle
+ * Toggle to set.
+ */
+static void
+mlx4_rxmode_toggle(struct rte_eth_dev *dev, enum rxmode_toggle toggle)
+{
+ struct priv *priv = dev->data->dev_private;
+ const char *mode;
+ struct rte_flow_error error;
+
+ switch (toggle) {
+ case RXMODE_TOGGLE_PROMISC_OFF:
+ case RXMODE_TOGGLE_PROMISC_ON:
+ mode = "promiscuous";
+ dev->data->promiscuous = toggle & 1;
+ break;
+ case RXMODE_TOGGLE_ALLMULTI_OFF:
+ case RXMODE_TOGGLE_ALLMULTI_ON:
+ mode = "all multicast";
+ dev->data->all_multicast = toggle & 1;
+ break;
+ }
+ if (!mlx4_flow_sync(priv, &error))
+ return;
+ ERROR("cannot toggle %s mode (code %d, \"%s\"),"
+ " flow error type %d, cause %p, message: %s",
+ mode, rte_errno, strerror(rte_errno), error.type, error.cause,
+ error.message ? error.message : "(unspecified)");
+}
+
+/**
+ * DPDK callback to enable promiscuous mode.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void
+mlx4_promiscuous_enable(struct rte_eth_dev *dev)
+{
+ mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_ON);
+}
+
+/**
+ * DPDK callback to disable promiscuous mode.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void
+mlx4_promiscuous_disable(struct rte_eth_dev *dev)
+{
+ mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_PROMISC_OFF);
+}
+
+/**
+ * DPDK callback to enable all multicast mode.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void
+mlx4_allmulticast_enable(struct rte_eth_dev *dev)
+{
+ mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_ON);
+}
+
+/**
+ * DPDK callback to disable all multicast mode.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void
+mlx4_allmulticast_disable(struct rte_eth_dev *dev)
+{
+ mlx4_rxmode_toggle(dev, RXMODE_TOGGLE_ALLMULTI_OFF);
+}
+
+/**
+ * DPDK callback to remove a MAC address.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param index
+ * MAC address index.
+ */
+void
+mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct rte_flow_error error;
+
+ if (index >= RTE_DIM(priv->mac)) {
+ rte_errno = EINVAL;
+ return;
+ }
+ memset(&priv->mac[index], 0, sizeof(priv->mac[index]));
+ if (!mlx4_flow_sync(priv, &error))
+ return;
+ ERROR("failed to synchronize flow rules after removing MAC address"
+ " at index %d (code %d, \"%s\"),"
+ " flow error type %d, cause %p, message: %s",
+ index, rte_errno, strerror(rte_errno), error.type, error.cause,
+ error.message ? error.message : "(unspecified)");
+}
+
+/**
+ * DPDK callback to add a MAC address.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param mac_addr
+ * MAC address to register.
+ * @param index
+ * MAC address index.
+ * @param vmdq
+ * VMDq pool index to associate address with (ignored).
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
+ uint32_t index, uint32_t vmdq)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct rte_flow_error error;
+ int ret;
+
+ (void)vmdq;
+ if (index >= RTE_DIM(priv->mac)) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ memcpy(&priv->mac[index], mac_addr, sizeof(priv->mac[index]));
+ ret = mlx4_flow_sync(priv, &error);
+ if (!ret)
+ return 0;
+ ERROR("failed to synchronize flow rules after adding MAC address"
+ " at index %d (code %d, \"%s\"),"
+ " flow error type %d, cause %p, message: %s",
+ index, rte_errno, strerror(rte_errno), error.type, error.cause,
+ error.message ? error.message : "(unspecified)");
+ return ret;
+}
+
+/**
+ * DPDK callback to configure a VLAN filter.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param vlan_id
+ * VLAN ID to filter.
+ * @param on
+ * Toggle filter.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct rte_flow_error error;
+ unsigned int vidx = vlan_id / 64;
+ unsigned int vbit = vlan_id % 64;
+ uint64_t *v;
+ int ret;
+
+ if (vidx >= RTE_DIM(dev->data->vlan_filter_conf.ids)) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ v = &dev->data->vlan_filter_conf.ids[vidx];
+ *v &= ~(UINT64_C(1) << vbit);
+ *v |= (uint64_t)!!on << vbit;
+ ret = mlx4_flow_sync(priv, &error);
+ if (!ret)
+ return 0;
+ ERROR("failed to synchronize flow rules after %s VLAN filter on ID %u"
+ " (code %d, \"%s\"), "
+ " flow error type %d, cause %p, message: %s",
+ on ? "enabling" : "disabling", vlan_id,
+ rte_errno, strerror(rte_errno), error.type, error.cause,
+ error.message ? error.message : "(unspecified)");
+ return ret;
+}
+
+/**
+ * DPDK callback to set the primary MAC address.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param mac_addr
+ * MAC address to register.
+ */
+void
+mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
+{
+ mlx4_mac_addr_add(dev, mac_addr, 0, 0);
+}
+
+/**
+ * DPDK callback to get information about the device.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] info
+ * Info structure output buffer.
+ */
+void
+mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
+{
+ struct priv *priv = dev->data->dev_private;
+ unsigned int max;
+ char ifname[IF_NAMESIZE];
+
+ info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
+ /* FIXME: we should ask the device for these values. */
+ info->min_rx_bufsize = 32;
+ info->max_rx_pktlen = 65536;
+ /*
+ * Since we need one CQ per QP, the limit is the minimum number
+ * between the two values.
+ */
+ max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ?
+ priv->device_attr.max_qp : priv->device_attr.max_cq);
+ /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
+ if (max >= 65535)
+ max = 65535;
+ info->max_rx_queues = max;
+ info->max_tx_queues = max;
+ info->max_mac_addrs = RTE_DIM(priv->mac);
+ info->rx_offload_capa = 0;
+ info->tx_offload_capa = 0;
+ if (priv->hw_csum) {
+ info->tx_offload_capa |= (DEV_TX_OFFLOAD_IPV4_CKSUM |
+ DEV_TX_OFFLOAD_UDP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_CKSUM);
+ info->rx_offload_capa |= (DEV_RX_OFFLOAD_IPV4_CKSUM |
+ DEV_RX_OFFLOAD_UDP_CKSUM |
+ DEV_RX_OFFLOAD_TCP_CKSUM);
+ }
+ if (priv->hw_csum_l2tun)
+ info->tx_offload_capa |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+ if (mlx4_get_ifname(priv, &ifname) == 0)
+ info->if_index = if_nametoindex(ifname);
+ info->hash_key_size = MLX4_RSS_HASH_KEY_SIZE;
+ info->speed_capa =
+ ETH_LINK_SPEED_1G |
+ ETH_LINK_SPEED_10G |
+ ETH_LINK_SPEED_20G |
+ ETH_LINK_SPEED_40G |
+ ETH_LINK_SPEED_56G;
+}
+
+/**
+ * DPDK callback to get device statistics.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] stats
+ * Stats structure output buffer.
+ */
+int
+mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+ struct rte_eth_stats tmp;
+ unsigned int i;
+ unsigned int idx;
+
+ memset(&tmp, 0, sizeof(tmp));
+ /* Add software counters. */
+ for (i = 0; i != dev->data->nb_rx_queues; ++i) {
+ struct rxq *rxq = dev->data->rx_queues[i];
+
+ if (rxq == NULL)
+ continue;
+ idx = rxq->stats.idx;
+ if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+ tmp.q_ipackets[idx] += rxq->stats.ipackets;
+ tmp.q_ibytes[idx] += rxq->stats.ibytes;
+ tmp.q_errors[idx] += (rxq->stats.idropped +
+ rxq->stats.rx_nombuf);
+ }
+ tmp.ipackets += rxq->stats.ipackets;
+ tmp.ibytes += rxq->stats.ibytes;
+ tmp.ierrors += rxq->stats.idropped;
+ tmp.rx_nombuf += rxq->stats.rx_nombuf;
+ }
+ for (i = 0; i != dev->data->nb_tx_queues; ++i) {
+ struct txq *txq = dev->data->tx_queues[i];
+
+ if (txq == NULL)
+ continue;
+ idx = txq->stats.idx;
+ if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+ tmp.q_opackets[idx] += txq->stats.opackets;
+ tmp.q_obytes[idx] += txq->stats.obytes;
+ tmp.q_errors[idx] += txq->stats.odropped;
+ }
+ tmp.opackets += txq->stats.opackets;
+ tmp.obytes += txq->stats.obytes;
+ tmp.oerrors += txq->stats.odropped;
+ }
+ *stats = tmp;
+ return 0;
+}
+
+/**
+ * DPDK callback to clear device statistics.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void
+mlx4_stats_reset(struct rte_eth_dev *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i != dev->data->nb_rx_queues; ++i) {
+ struct rxq *rxq = dev->data->rx_queues[i];
+
+ if (rxq)
+ rxq->stats = (struct mlx4_rxq_stats){
+ .idx = rxq->stats.idx,
+ };
+ }
+ for (i = 0; i != dev->data->nb_tx_queues; ++i) {
+ struct txq *txq = dev->data->tx_queues[i];
+
+ if (txq)
+ txq->stats = (struct mlx4_txq_stats){
+ .idx = txq->stats.idx,
+ };
+ }
+}
+
+/**
+ * DPDK callback to retrieve physical link information.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param wait_to_complete
+ * Wait for request completion (ignored).
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete)
+{
+ const struct priv *priv = dev->data->dev_private;
+ struct ethtool_cmd edata = {
+ .cmd = ETHTOOL_GSET,
+ };
+ struct ifreq ifr;
+ struct rte_eth_link dev_link;
+ int link_speed = 0;
+
+ if (priv == NULL) {
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ (void)wait_to_complete;
+ if (mlx4_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
+ WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(rte_errno));
+ return -rte_errno;
+ }
+ memset(&dev_link, 0, sizeof(dev_link));
+ dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
+ (ifr.ifr_flags & IFF_RUNNING));
+ ifr.ifr_data = (void *)&edata;
+ if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) {
+ WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
+ strerror(rte_errno));
+ return -rte_errno;
+ }
+ link_speed = ethtool_cmd_speed(&edata);
+ if (link_speed == -1)
+ dev_link.link_speed = 0;
+ else
+ dev_link.link_speed = link_speed;
+ dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
+ ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+ dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
+ ETH_LINK_SPEED_FIXED);
+ dev->data->dev_link = dev_link;
+ return 0;
+}
+
+/**
+ * DPDK callback to get flow control status.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] fc_conf
+ * Flow control output buffer.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_flow_ctrl_get(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct ifreq ifr;
+ struct ethtool_pauseparam ethpause = {
+ .cmd = ETHTOOL_GPAUSEPARAM,
+ };
+ int ret;
+
+ ifr.ifr_data = (void *)&ethpause;
+ if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) {
+ ret = rte_errno;
+ WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
+ " failed: %s",
+ strerror(rte_errno));
+ goto out;
+ }
+ fc_conf->autoneg = ethpause.autoneg;
+ if (ethpause.rx_pause && ethpause.tx_pause)
+ fc_conf->mode = RTE_FC_FULL;
+ else if (ethpause.rx_pause)
+ fc_conf->mode = RTE_FC_RX_PAUSE;
+ else if (ethpause.tx_pause)
+ fc_conf->mode = RTE_FC_TX_PAUSE;
+ else
+ fc_conf->mode = RTE_FC_NONE;
+ ret = 0;
+out:
+ assert(ret >= 0);
+ return -ret;
+}
+
+/**
+ * DPDK callback to modify flow control parameters.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[in] fc_conf
+ * Flow control parameters.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct ifreq ifr;
+ struct ethtool_pauseparam ethpause = {
+ .cmd = ETHTOOL_SPAUSEPARAM,
+ };
+ int ret;
+
+ ifr.ifr_data = (void *)&ethpause;
+ ethpause.autoneg = fc_conf->autoneg;
+ if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
+ (fc_conf->mode & RTE_FC_RX_PAUSE))
+ ethpause.rx_pause = 1;
+ else
+ ethpause.rx_pause = 0;
+ if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
+ (fc_conf->mode & RTE_FC_TX_PAUSE))
+ ethpause.tx_pause = 1;
+ else
+ ethpause.tx_pause = 0;
+ if (mlx4_ifreq(priv, SIOCETHTOOL, &ifr)) {
+ ret = rte_errno;
+ WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
+ " failed: %s",
+ strerror(rte_errno));
+ goto out;
+ }
+ ret = 0;
+out:
+ assert(ret >= 0);
+ return -ret;
+}
+
+/**
+ * DPDK callback to retrieve the received packet types that are recognized
+ * by the device.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * Pointer to an array of recognized packet types if in Rx burst mode,
+ * NULL otherwise.
+ */
+const uint32_t *
+mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev)
+{
+ static const uint32_t ptypes[] = {
+ /* refers to rxq_cq_to_pkt_type() */
+ RTE_PTYPE_L2_ETHER,
+ RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+ RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+ RTE_PTYPE_L4_FRAG,
+ RTE_PTYPE_L4_TCP,
+ RTE_PTYPE_L4_UDP,
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+ RTE_PTYPE_UNKNOWN
+ };
+
+ if (dev->rx_pkt_burst == mlx4_rx_burst)
+ return ptypes;
+ return NULL;
+}
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 925c89c5..8b87b298 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -2,7 +2,7 @@
* BSD LICENSE
*
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox.
+ * Copyright 2017 Mellanox
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,197 +31,328 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/**
+ * @file
+ * Flow API operations for mlx4 driver.
+ */
+
+#include <arpa/inet.h>
#include <assert.h>
+#include <errno.h>
+#include <stdalign.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+#include <rte_eth_ctrl.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_flow_driver.h>
#include <rte_malloc.h>
-/* Generated configuration header. */
-#include "mlx4_autoconf.h"
-
/* PMD headers. */
#include "mlx4.h"
#include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
-/** Static initializer for items. */
-#define ITEMS(...) \
+/** Static initializer for a list of subsequent item types. */
+#define NEXT_ITEM(...) \
(const enum rte_flow_item_type []){ \
__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
}
-/** Structure to generate a simple graph of layers supported by the NIC. */
-struct mlx4_flow_items {
- /** List of possible actions for these items. */
- const enum rte_flow_action_type *const actions;
- /** Bit-masks corresponding to the possibilities for the item. */
- const void *mask;
- /**
- * Default bit-masks to use when item->mask is not provided. When
- * \default_mask is also NULL, the full supported bit-mask (\mask) is
- * used instead.
- */
- const void *default_mask;
- /** Bit-masks size in bytes. */
+/** Processor structure associated with a flow item. */
+struct mlx4_flow_proc_item {
+ /** Bit-mask for fields supported by this PMD. */
+ const void *mask_support;
+ /** Bit-mask to use when @p item->mask is not provided. */
+ const void *mask_default;
+ /** Size in bytes for @p mask_support and @p mask_default. */
const unsigned int mask_sz;
- /**
- * Check support for a given item.
- *
- * @param item[in]
- * Item specification.
- * @param mask[in]
- * Bit-masks covering supported fields to compare with spec,
- * last and mask in
- * \item.
- * @param size
- * Bit-Mask size in bytes.
- *
- * @return
- * 0 on success, negative value otherwise.
- */
- int (*validate)(const struct rte_flow_item *item,
- const uint8_t *mask, unsigned int size);
- /**
- * Conversion function from rte_flow to NIC specific flow.
- *
- * @param item
- * rte_flow item to convert.
- * @param default_mask
- * Default bit-masks to use when item->mask is not provided.
- * @param data
- * Internal structure to store the conversion.
- *
- * @return
- * 0 on success, negative value otherwise.
- */
- int (*convert)(const struct rte_flow_item *item,
- const void *default_mask,
- void *data);
+ /** Merge a pattern item into a flow rule handle. */
+ int (*merge)(struct rte_flow *flow,
+ const struct rte_flow_item *item,
+ const struct mlx4_flow_proc_item *proc,
+ struct rte_flow_error *error);
/** Size in bytes of the destination structure. */
const unsigned int dst_sz;
- /** List of possible following items. */
- const enum rte_flow_item_type *const items;
+ /** List of possible subsequent items. */
+ const enum rte_flow_item_type *const next_item;
};
-struct rte_flow_drop {
- struct ibv_qp *qp; /**< Verbs queue pair. */
- struct ibv_cq *cq; /**< Verbs completion queue. */
+/** Shared resources for drop flow rules. */
+struct mlx4_drop {
+ struct ibv_qp *qp; /**< QP target. */
+ struct ibv_cq *cq; /**< CQ associated with above QP. */
+ struct priv *priv; /**< Back pointer to private data. */
+ uint32_t refcnt; /**< Reference count. */
};
-/** Valid action for this PMD. */
-static const enum rte_flow_action_type valid_actions[] = {
- RTE_FLOW_ACTION_TYPE_DROP,
- RTE_FLOW_ACTION_TYPE_QUEUE,
- RTE_FLOW_ACTION_TYPE_RSS,
- RTE_FLOW_ACTION_TYPE_END,
-};
+/**
+ * Convert DPDK RSS hash fields to their Verbs equivalent.
+ *
+ * @param rss_hf
+ * Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ *
+ * @return
+ * A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1
+ * otherwise and rte_errno is set.
+ */
+static uint64_t
+mlx4_conv_rss_hf(uint64_t rss_hf)
+{
+ enum { IPV4, IPV6, TCP, UDP, };
+ const uint64_t in[] = {
+ [IPV4] = (ETH_RSS_IPV4 |
+ ETH_RSS_FRAG_IPV4 |
+ ETH_RSS_NONFRAG_IPV4_TCP |
+ ETH_RSS_NONFRAG_IPV4_UDP |
+ ETH_RSS_NONFRAG_IPV4_OTHER),
+ [IPV6] = (ETH_RSS_IPV6 |
+ ETH_RSS_FRAG_IPV6 |
+ ETH_RSS_NONFRAG_IPV6_TCP |
+ ETH_RSS_NONFRAG_IPV6_UDP |
+ ETH_RSS_NONFRAG_IPV6_OTHER |
+ ETH_RSS_IPV6_EX |
+ ETH_RSS_IPV6_TCP_EX |
+ ETH_RSS_IPV6_UDP_EX),
+ [TCP] = (ETH_RSS_NONFRAG_IPV4_TCP |
+ ETH_RSS_NONFRAG_IPV6_TCP |
+ ETH_RSS_IPV6_TCP_EX),
+ /*
+ * UDP support is temporarily disabled due to an
+ * implementation issue in the kernel.
+ */
+ [UDP] = 0,
+ };
+ const uint64_t out[RTE_DIM(in)] = {
+ [IPV4] = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
+ [IPV6] = IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6,
+ [TCP] = IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP,
+ [UDP] = IBV_RX_HASH_SRC_PORT_UDP | IBV_RX_HASH_DST_PORT_UDP,
+ };
+ uint64_t seen = 0;
+ uint64_t conv = 0;
+ unsigned int i;
+
+ for (i = 0; i != RTE_DIM(in); ++i)
+ if (rss_hf & in[i]) {
+ seen |= rss_hf & in[i];
+ conv |= out[i];
+ }
+ if (!(rss_hf & ~seen))
+ return conv;
+ rte_errno = ENOTSUP;
+ return (uint64_t)-1;
+}
/**
- * Convert Ethernet item to Verbs specification.
+ * Merge Ethernet pattern item into flow rule handle.
*
- * @param item[in]
- * Item specification.
- * @param default_mask[in]
- * Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- * User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - No support for partial masks, except in the specific case of matching
+ * all multicast traffic (@p spec->dst and @p mask->dst equal to
+ * 01:00:00:00:00:00).
+ * - Not providing @p item->spec or providing an empty @p mask->dst is
+ * *only* supported if the rule doesn't specify additional matching
+ * criteria (i.e. rule is promiscuous-like).
+ *
+ * @param[in, out] flow
+ * Flow rule handle to update.
+ * @param[in] item
+ * Pattern item to merge.
+ * @param[in] proc
+ * Associated item-processing object.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
-mlx4_flow_create_eth(const struct rte_flow_item *item,
- const void *default_mask,
- void *data)
+mlx4_flow_merge_eth(struct rte_flow *flow,
+ const struct rte_flow_item *item,
+ const struct mlx4_flow_proc_item *proc,
+ struct rte_flow_error *error)
{
const struct rte_flow_item_eth *spec = item->spec;
- const struct rte_flow_item_eth *mask = item->mask;
- struct mlx4_flow *flow = (struct mlx4_flow *)data;
+ const struct rte_flow_item_eth *mask =
+ spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
struct ibv_flow_spec_eth *eth;
- const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+ const char *msg;
unsigned int i;
+ if (!mask) {
+ flow->promisc = 1;
+ } else {
+ uint32_t sum_dst = 0;
+ uint32_t sum_src = 0;
+
+ for (i = 0; i != sizeof(mask->dst.addr_bytes); ++i) {
+ sum_dst += mask->dst.addr_bytes[i];
+ sum_src += mask->src.addr_bytes[i];
+ }
+ if (sum_src) {
+ msg = "mlx4 does not support source MAC matching";
+ goto error;
+ } else if (!sum_dst) {
+ flow->promisc = 1;
+ } else if (sum_dst == 1 && mask->dst.addr_bytes[0] == 1) {
+ if (!(spec->dst.addr_bytes[0] & 1)) {
+ msg = "mlx4 does not support the explicit"
+ " exclusion of all multicast traffic";
+ goto error;
+ }
+ flow->allmulti = 1;
+ } else if (sum_dst != (UINT8_C(0xff) * ETHER_ADDR_LEN)) {
+ msg = "mlx4 does not support matching partial"
+ " Ethernet fields";
+ goto error;
+ }
+ }
+ if (!flow->ibv_attr)
+ return 0;
+ if (flow->promisc) {
+ flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT;
+ return 0;
+ }
+ if (flow->allmulti) {
+ flow->ibv_attr->type = IBV_FLOW_ATTR_MC_DEFAULT;
+ return 0;
+ }
++flow->ibv_attr->num_of_specs;
- flow->ibv_attr->priority = 2;
- eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
+ eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
*eth = (struct ibv_flow_spec_eth) {
.type = IBV_FLOW_SPEC_ETH,
- .size = eth_size,
+ .size = sizeof(*eth),
};
- if (!spec) {
- flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT;
- return 0;
- }
- if (!mask)
- mask = default_mask;
memcpy(eth->val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN);
- memcpy(eth->val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN);
memcpy(eth->mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN);
- memcpy(eth->mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN);
/* Remove unwanted bits from values. */
for (i = 0; i < ETHER_ADDR_LEN; ++i) {
eth->val.dst_mac[i] &= eth->mask.dst_mac[i];
- eth->val.src_mac[i] &= eth->mask.src_mac[i];
}
return 0;
+error:
+ return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item, msg);
}
/**
- * Convert VLAN item to Verbs specification.
+ * Merge VLAN pattern item into flow rule handle.
*
- * @param item[in]
- * Item specification.
- * @param default_mask[in]
- * Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- * User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - Matching *all* VLAN traffic by omitting @p item->spec or providing an
+ * empty @p item->mask would also include non-VLAN traffic. Doing so is
+ * therefore unsupported.
+ * - No support for partial masks.
+ *
+ * @param[in, out] flow
+ * Flow rule handle to update.
+ * @param[in] item
+ * Pattern item to merge.
+ * @param[in] proc
+ * Associated item-processing object.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
-mlx4_flow_create_vlan(const struct rte_flow_item *item,
- const void *default_mask,
- void *data)
+mlx4_flow_merge_vlan(struct rte_flow *flow,
+ const struct rte_flow_item *item,
+ const struct mlx4_flow_proc_item *proc,
+ struct rte_flow_error *error)
{
const struct rte_flow_item_vlan *spec = item->spec;
- const struct rte_flow_item_vlan *mask = item->mask;
- struct mlx4_flow *flow = (struct mlx4_flow *)data;
+ const struct rte_flow_item_vlan *mask =
+ spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
struct ibv_flow_spec_eth *eth;
- const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+ const char *msg;
- eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset - eth_size);
- if (!spec)
+ if (!mask || !mask->tci) {
+ msg = "mlx4 cannot match all VLAN traffic while excluding"
+ " non-VLAN traffic, TCI VID must be specified";
+ goto error;
+ }
+ if (mask->tci != RTE_BE16(0x0fff)) {
+ msg = "mlx4 does not support partial TCI VID matching";
+ goto error;
+ }
+ if (!flow->ibv_attr)
return 0;
- if (!mask)
- mask = default_mask;
+ eth = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size -
+ sizeof(*eth));
eth->val.vlan_tag = spec->tci;
eth->mask.vlan_tag = mask->tci;
eth->val.vlan_tag &= eth->mask.vlan_tag;
return 0;
+error:
+ return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item, msg);
}
/**
- * Convert IPv4 item to Verbs specification.
+ * Merge IPv4 pattern item into flow rule handle.
*
- * @param item[in]
- * Item specification.
- * @param default_mask[in]
- * Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- * User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - No support for partial masks.
+ *
+ * @param[in, out] flow
+ * Flow rule handle to update.
+ * @param[in] item
+ * Pattern item to merge.
+ * @param[in] proc
+ * Associated item-processing object.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
-mlx4_flow_create_ipv4(const struct rte_flow_item *item,
- const void *default_mask,
- void *data)
+mlx4_flow_merge_ipv4(struct rte_flow *flow,
+ const struct rte_flow_item *item,
+ const struct mlx4_flow_proc_item *proc,
+ struct rte_flow_error *error)
{
const struct rte_flow_item_ipv4 *spec = item->spec;
- const struct rte_flow_item_ipv4 *mask = item->mask;
- struct mlx4_flow *flow = (struct mlx4_flow *)data;
+ const struct rte_flow_item_ipv4 *mask =
+ spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
struct ibv_flow_spec_ipv4 *ipv4;
- unsigned int ipv4_size = sizeof(struct ibv_flow_spec_ipv4);
+ const char *msg;
+ if (mask &&
+ ((uint32_t)(mask->hdr.src_addr + 1) > UINT32_C(1) ||
+ (uint32_t)(mask->hdr.dst_addr + 1) > UINT32_C(1))) {
+ msg = "mlx4 does not support matching partial IPv4 fields";
+ goto error;
+ }
+ if (!flow->ibv_attr)
+ return 0;
++flow->ibv_attr->num_of_specs;
- flow->ibv_attr->priority = 1;
- ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
+ ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
*ipv4 = (struct ibv_flow_spec_ipv4) {
.type = IBV_FLOW_SPEC_IPV4,
- .size = ipv4_size,
+ .size = sizeof(*ipv4),
};
if (!spec)
return 0;
@@ -229,8 +360,6 @@ mlx4_flow_create_ipv4(const struct rte_flow_item *item,
.src_ip = spec->hdr.src_addr,
.dst_ip = spec->hdr.dst_addr,
};
- if (!mask)
- mask = default_mask;
ipv4->mask = (struct ibv_flow_ipv4_filter) {
.src_ip = mask->hdr.src_addr,
.dst_ip = mask->hdr.dst_addr,
@@ -239,528 +368,504 @@ mlx4_flow_create_ipv4(const struct rte_flow_item *item,
ipv4->val.src_ip &= ipv4->mask.src_ip;
ipv4->val.dst_ip &= ipv4->mask.dst_ip;
return 0;
+error:
+ return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item, msg);
}
/**
- * Convert UDP item to Verbs specification.
+ * Merge UDP pattern item into flow rule handle.
*
- * @param item[in]
- * Item specification.
- * @param default_mask[in]
- * Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- * User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - No support for partial masks.
+ *
+ * @param[in, out] flow
+ * Flow rule handle to update.
+ * @param[in] item
+ * Pattern item to merge.
+ * @param[in] proc
+ * Associated item-processing object.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
-mlx4_flow_create_udp(const struct rte_flow_item *item,
- const void *default_mask,
- void *data)
+mlx4_flow_merge_udp(struct rte_flow *flow,
+ const struct rte_flow_item *item,
+ const struct mlx4_flow_proc_item *proc,
+ struct rte_flow_error *error)
{
const struct rte_flow_item_udp *spec = item->spec;
- const struct rte_flow_item_udp *mask = item->mask;
- struct mlx4_flow *flow = (struct mlx4_flow *)data;
+ const struct rte_flow_item_udp *mask =
+ spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
struct ibv_flow_spec_tcp_udp *udp;
- unsigned int udp_size = sizeof(struct ibv_flow_spec_tcp_udp);
+ const char *msg;
+ if (mask &&
+ ((uint16_t)(mask->hdr.src_port + 1) > UINT16_C(1) ||
+ (uint16_t)(mask->hdr.dst_port + 1) > UINT16_C(1))) {
+ msg = "mlx4 does not support matching partial UDP fields";
+ goto error;
+ }
+ if (!flow->ibv_attr)
+ return 0;
++flow->ibv_attr->num_of_specs;
- flow->ibv_attr->priority = 0;
- udp = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
+ udp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
*udp = (struct ibv_flow_spec_tcp_udp) {
.type = IBV_FLOW_SPEC_UDP,
- .size = udp_size,
+ .size = sizeof(*udp),
};
if (!spec)
return 0;
udp->val.dst_port = spec->hdr.dst_port;
udp->val.src_port = spec->hdr.src_port;
- if (!mask)
- mask = default_mask;
udp->mask.dst_port = mask->hdr.dst_port;
udp->mask.src_port = mask->hdr.src_port;
/* Remove unwanted bits from values. */
udp->val.src_port &= udp->mask.src_port;
udp->val.dst_port &= udp->mask.dst_port;
return 0;
+error:
+ return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item, msg);
}
/**
- * Convert TCP item to Verbs specification.
+ * Merge TCP pattern item into flow rule handle.
*
- * @param item[in]
- * Item specification.
- * @param default_mask[in]
- * Default bit-masks to use when item->mask is not provided.
- * @param data[in, out]
- * User structure.
+ * Additional mlx4-specific constraints on supported fields:
+ *
+ * - No support for partial masks.
+ *
+ * @param[in, out] flow
+ * Flow rule handle to update.
+ * @param[in] item
+ * Pattern item to merge.
+ * @param[in] proc
+ * Associated item-processing object.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
-mlx4_flow_create_tcp(const struct rte_flow_item *item,
- const void *default_mask,
- void *data)
+mlx4_flow_merge_tcp(struct rte_flow *flow,
+ const struct rte_flow_item *item,
+ const struct mlx4_flow_proc_item *proc,
+ struct rte_flow_error *error)
{
const struct rte_flow_item_tcp *spec = item->spec;
- const struct rte_flow_item_tcp *mask = item->mask;
- struct mlx4_flow *flow = (struct mlx4_flow *)data;
+ const struct rte_flow_item_tcp *mask =
+ spec ? (item->mask ? item->mask : proc->mask_default) : NULL;
struct ibv_flow_spec_tcp_udp *tcp;
- unsigned int tcp_size = sizeof(struct ibv_flow_spec_tcp_udp);
+ const char *msg;
+ if (mask &&
+ ((uint16_t)(mask->hdr.src_port + 1) > UINT16_C(1) ||
+ (uint16_t)(mask->hdr.dst_port + 1) > UINT16_C(1))) {
+ msg = "mlx4 does not support matching partial TCP fields";
+ goto error;
+ }
+ if (!flow->ibv_attr)
+ return 0;
++flow->ibv_attr->num_of_specs;
- flow->ibv_attr->priority = 0;
- tcp = (void *)((uintptr_t)flow->ibv_attr + flow->offset);
+ tcp = (void *)((uintptr_t)flow->ibv_attr + flow->ibv_attr_size);
*tcp = (struct ibv_flow_spec_tcp_udp) {
.type = IBV_FLOW_SPEC_TCP,
- .size = tcp_size,
+ .size = sizeof(*tcp),
};
if (!spec)
return 0;
tcp->val.dst_port = spec->hdr.dst_port;
tcp->val.src_port = spec->hdr.src_port;
- if (!mask)
- mask = default_mask;
tcp->mask.dst_port = mask->hdr.dst_port;
tcp->mask.src_port = mask->hdr.src_port;
/* Remove unwanted bits from values. */
tcp->val.src_port &= tcp->mask.src_port;
tcp->val.dst_port &= tcp->mask.dst_port;
return 0;
+error:
+ return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item, msg);
}
/**
- * Check support for a given item.
+ * Perform basic sanity checks on a pattern item.
*
- * @param item[in]
+ * @param[in] item
* Item specification.
- * @param mask[in]
- * Bit-masks covering supported fields to compare with spec, last and mask in
- * \item.
- * @param size
- * Bit-Mask size in bytes.
+ * @param[in] proc
+ * Associated item-processing object.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
*
* @return
- * 0 on success, negative value otherwise.
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
-mlx4_flow_item_validate(const struct rte_flow_item *item,
- const uint8_t *mask, unsigned int size)
+mlx4_flow_item_check(const struct rte_flow_item *item,
+ const struct mlx4_flow_proc_item *proc,
+ struct rte_flow_error *error)
{
- int ret = 0;
+ const uint8_t *mask;
+ unsigned int i;
+ /* item->last and item->mask cannot exist without item->spec. */
if (!item->spec && (item->mask || item->last))
- return -1;
- if (item->spec && !item->mask) {
- unsigned int i;
- const uint8_t *spec = item->spec;
-
- for (i = 0; i < size; ++i)
- if ((spec[i] | mask[i]) != mask[i])
- return -1;
- }
- if (item->last && !item->mask) {
- unsigned int i;
- const uint8_t *spec = item->last;
-
- for (i = 0; i < size; ++i)
- if ((spec[i] | mask[i]) != mask[i])
- return -1;
- }
- if (item->spec && item->last) {
- uint8_t spec[size];
- uint8_t last[size];
- const uint8_t *apply = mask;
- unsigned int i;
-
- if (item->mask)
- apply = item->mask;
- for (i = 0; i < size; ++i) {
- spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
- last[i] = ((const uint8_t *)item->last)[i] & apply[i];
- }
- ret = memcmp(spec, last, size);
- }
- return ret;
-}
-
-static int
-mlx4_flow_validate_eth(const struct rte_flow_item *item,
- const uint8_t *mask, unsigned int size)
-{
- if (item->mask) {
- const struct rte_flow_item_eth *mask = item->mask;
-
- if (mask->dst.addr_bytes[0] != 0xff ||
- mask->dst.addr_bytes[1] != 0xff ||
- mask->dst.addr_bytes[2] != 0xff ||
- mask->dst.addr_bytes[3] != 0xff ||
- mask->dst.addr_bytes[4] != 0xff ||
- mask->dst.addr_bytes[5] != 0xff)
- return -1;
- }
- return mlx4_flow_item_validate(item, mask, size);
-}
-
-static int
-mlx4_flow_validate_vlan(const struct rte_flow_item *item,
- const uint8_t *mask, unsigned int size)
-{
- if (item->mask) {
- const struct rte_flow_item_vlan *mask = item->mask;
-
- if (mask->tci != 0 &&
- ntohs(mask->tci) != 0x0fff)
- return -1;
- }
- return mlx4_flow_item_validate(item, mask, size);
-}
-
-static int
-mlx4_flow_validate_ipv4(const struct rte_flow_item *item,
- const uint8_t *mask, unsigned int size)
-{
- if (item->mask) {
- const struct rte_flow_item_ipv4 *mask = item->mask;
-
- if (mask->hdr.src_addr != 0 &&
- mask->hdr.src_addr != 0xffffffff)
- return -1;
- if (mask->hdr.dst_addr != 0 &&
- mask->hdr.dst_addr != 0xffffffff)
- return -1;
- }
- return mlx4_flow_item_validate(item, mask, size);
-}
-
-static int
-mlx4_flow_validate_udp(const struct rte_flow_item *item,
- const uint8_t *mask, unsigned int size)
-{
- if (item->mask) {
- const struct rte_flow_item_udp *mask = item->mask;
-
- if (mask->hdr.src_port != 0 &&
- mask->hdr.src_port != 0xffff)
- return -1;
- if (mask->hdr.dst_port != 0 &&
- mask->hdr.dst_port != 0xffff)
- return -1;
- }
- return mlx4_flow_item_validate(item, mask, size);
-}
-
-static int
-mlx4_flow_validate_tcp(const struct rte_flow_item *item,
- const uint8_t *mask, unsigned int size)
-{
- if (item->mask) {
- const struct rte_flow_item_tcp *mask = item->mask;
-
- if (mask->hdr.src_port != 0 &&
- mask->hdr.src_port != 0xffff)
- return -1;
- if (mask->hdr.dst_port != 0 &&
- mask->hdr.dst_port != 0xffff)
- return -1;
+ return rte_flow_error_set
+ (error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "\"mask\" or \"last\" field provided without a"
+ " corresponding \"spec\"");
+ /* No spec, no mask, no problem. */
+ if (!item->spec)
+ return 0;
+ mask = item->mask ?
+ (const uint8_t *)item->mask :
+ (const uint8_t *)proc->mask_default;
+ assert(mask);
+ /*
+ * Single-pass check to make sure that:
+ * - Mask is supported, no bits are set outside proc->mask_support.
+ * - Both item->spec and item->last are included in mask.
+ */
+ for (i = 0; i != proc->mask_sz; ++i) {
+ if (!mask[i])
+ continue;
+ if ((mask[i] | ((const uint8_t *)proc->mask_support)[i]) !=
+ ((const uint8_t *)proc->mask_support)[i])
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item, "unsupported field found in \"mask\"");
+ if (item->last &&
+ (((const uint8_t *)item->spec)[i] & mask[i]) !=
+ (((const uint8_t *)item->last)[i] & mask[i]))
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "range between \"spec\" and \"last\""
+ " is larger than \"mask\"");
}
- return mlx4_flow_item_validate(item, mask, size);
+ return 0;
}
/** Graph of supported items and associated actions. */
-static const struct mlx4_flow_items mlx4_flow_items[] = {
+static const struct mlx4_flow_proc_item mlx4_flow_proc_item_list[] = {
[RTE_FLOW_ITEM_TYPE_END] = {
- .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
+ .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_ETH),
},
[RTE_FLOW_ITEM_TYPE_ETH] = {
- .items = ITEMS(RTE_FLOW_ITEM_TYPE_VLAN,
- RTE_FLOW_ITEM_TYPE_IPV4),
- .actions = valid_actions,
- .mask = &(const struct rte_flow_item_eth){
+ .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_VLAN,
+ RTE_FLOW_ITEM_TYPE_IPV4),
+ .mask_support = &(const struct rte_flow_item_eth){
+ /* Only destination MAC can be matched. */
.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
- .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
},
- .default_mask = &rte_flow_item_eth_mask,
+ .mask_default = &rte_flow_item_eth_mask,
.mask_sz = sizeof(struct rte_flow_item_eth),
- .validate = mlx4_flow_validate_eth,
- .convert = mlx4_flow_create_eth,
+ .merge = mlx4_flow_merge_eth,
.dst_sz = sizeof(struct ibv_flow_spec_eth),
},
[RTE_FLOW_ITEM_TYPE_VLAN] = {
- .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4),
- .actions = valid_actions,
- .mask = &(const struct rte_flow_item_vlan){
- /* rte_flow_item_vlan_mask is invalid for mlx4. */
-#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
- .tci = 0x0fff,
-#else
- .tci = 0xff0f,
-#endif
+ .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_IPV4),
+ .mask_support = &(const struct rte_flow_item_vlan){
+ /* Only TCI VID matching is supported. */
+ .tci = RTE_BE16(0x0fff),
},
+ .mask_default = &rte_flow_item_vlan_mask,
.mask_sz = sizeof(struct rte_flow_item_vlan),
- .validate = mlx4_flow_validate_vlan,
- .convert = mlx4_flow_create_vlan,
+ .merge = mlx4_flow_merge_vlan,
.dst_sz = 0,
},
[RTE_FLOW_ITEM_TYPE_IPV4] = {
- .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
- RTE_FLOW_ITEM_TYPE_TCP),
- .actions = valid_actions,
- .mask = &(const struct rte_flow_item_ipv4){
+ .next_item = NEXT_ITEM(RTE_FLOW_ITEM_TYPE_UDP,
+ RTE_FLOW_ITEM_TYPE_TCP),
+ .mask_support = &(const struct rte_flow_item_ipv4){
.hdr = {
- .src_addr = -1,
- .dst_addr = -1,
+ .src_addr = RTE_BE32(0xffffffff),
+ .dst_addr = RTE_BE32(0xffffffff),
},
},
- .default_mask = &rte_flow_item_ipv4_mask,
+ .mask_default = &rte_flow_item_ipv4_mask,
.mask_sz = sizeof(struct rte_flow_item_ipv4),
- .validate = mlx4_flow_validate_ipv4,
- .convert = mlx4_flow_create_ipv4,
+ .merge = mlx4_flow_merge_ipv4,
.dst_sz = sizeof(struct ibv_flow_spec_ipv4),
},
[RTE_FLOW_ITEM_TYPE_UDP] = {
- .actions = valid_actions,
- .mask = &(const struct rte_flow_item_udp){
+ .mask_support = &(const struct rte_flow_item_udp){
.hdr = {
- .src_port = -1,
- .dst_port = -1,
+ .src_port = RTE_BE16(0xffff),
+ .dst_port = RTE_BE16(0xffff),
},
},
- .default_mask = &rte_flow_item_udp_mask,
+ .mask_default = &rte_flow_item_udp_mask,
.mask_sz = sizeof(struct rte_flow_item_udp),
- .validate = mlx4_flow_validate_udp,
- .convert = mlx4_flow_create_udp,
+ .merge = mlx4_flow_merge_udp,
.dst_sz = sizeof(struct ibv_flow_spec_tcp_udp),
},
[RTE_FLOW_ITEM_TYPE_TCP] = {
- .actions = valid_actions,
- .mask = &(const struct rte_flow_item_tcp){
+ .mask_support = &(const struct rte_flow_item_tcp){
.hdr = {
- .src_port = -1,
- .dst_port = -1,
+ .src_port = RTE_BE16(0xffff),
+ .dst_port = RTE_BE16(0xffff),
},
},
- .default_mask = &rte_flow_item_tcp_mask,
+ .mask_default = &rte_flow_item_tcp_mask,
.mask_sz = sizeof(struct rte_flow_item_tcp),
- .validate = mlx4_flow_validate_tcp,
- .convert = mlx4_flow_create_tcp,
+ .merge = mlx4_flow_merge_tcp,
.dst_sz = sizeof(struct ibv_flow_spec_tcp_udp),
},
};
/**
- * Validate a flow supported by the NIC.
+ * Make sure a flow rule is supported and initialize associated structure.
*
* @param priv
* Pointer to private structure.
* @param[in] attr
* Flow rule attributes.
- * @param[in] items
+ * @param[in] pattern
* Pattern specification (list terminated by the END pattern item).
* @param[in] actions
* Associated actions (list terminated by the END action).
* @param[out] error
* Perform verbose error reporting if not NULL.
- * @param[in, out] flow
- * Flow structure to update.
+ * @param[in, out] addr
+ * Buffer where the resulting flow rule handle pointer must be stored.
+ * If NULL, stop processing after validation stage.
*
* @return
* 0 on success, a negative errno value otherwise and rte_errno is set.
*/
static int
-priv_flow_validate(struct priv *priv,
- const struct rte_flow_attr *attr,
- const struct rte_flow_item items[],
- const struct rte_flow_action actions[],
- struct rte_flow_error *error,
- struct mlx4_flow *flow)
+mlx4_flow_prepare(struct priv *priv,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item pattern[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error,
+ struct rte_flow **addr)
{
- const struct mlx4_flow_items *cur_item = mlx4_flow_items;
- struct mlx4_flow_action action = {
- .queue = 0,
- .drop = 0,
- };
-
- (void)priv;
- if (attr->group) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
- NULL,
- "groups are not supported");
- return -rte_errno;
- }
- if (attr->priority) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
- NULL,
- "priorities are not supported");
- return -rte_errno;
- }
- if (attr->egress) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
- NULL,
- "egress is not supported");
- return -rte_errno;
- }
- if (!attr->ingress) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
- NULL,
- "only ingress is supported");
- return -rte_errno;
- }
- /* Go over items list. */
- for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
- const struct mlx4_flow_items *token = NULL;
+ const struct rte_flow_item *item;
+ const struct rte_flow_action *action;
+ const struct mlx4_flow_proc_item *proc;
+ struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
+ struct rte_flow *flow = &temp;
+ const char *msg = NULL;
+
+ if (attr->group)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+ NULL, "groups are not supported");
+ if (attr->priority > MLX4_FLOW_PRIORITY_LAST)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+ NULL, "maximum priority level is "
+ MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST));
+ if (attr->egress)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+ NULL, "egress is not supported");
+ if (!attr->ingress)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+ NULL, "only ingress is supported");
+fill:
+ proc = mlx4_flow_proc_item_list;
+ /* Go over pattern. */
+ for (item = pattern; item->type; ++item) {
+ const struct mlx4_flow_proc_item *next = NULL;
unsigned int i;
int err;
- if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
+ if (item->type == RTE_FLOW_ITEM_TYPE_VOID)
+ continue;
+ if (item->type == MLX4_FLOW_ITEM_TYPE_INTERNAL) {
+ flow->internal = 1;
continue;
- /*
- * The nic can support patterns with NULL eth spec only
- * if eth is a single item in a rule.
- */
- if (!items->spec &&
- items->type == RTE_FLOW_ITEM_TYPE_ETH) {
- const struct rte_flow_item *next = items + 1;
-
- if (next->type != RTE_FLOW_ITEM_TYPE_END) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ITEM,
- items,
- "the rule requires"
- " an Ethernet spec");
- return -rte_errno;
- }
}
- for (i = 0;
- cur_item->items &&
- cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
- ++i) {
- if (cur_item->items[i] == items->type) {
- token = &mlx4_flow_items[items->type];
+ if (flow->promisc || flow->allmulti) {
+ msg = "mlx4 does not support additional matching"
+ " criteria combined with indiscriminate"
+ " matching on Ethernet headers";
+ goto exit_item_not_supported;
+ }
+ for (i = 0; proc->next_item && proc->next_item[i]; ++i) {
+ if (proc->next_item[i] == item->type) {
+ next = &mlx4_flow_proc_item_list[item->type];
break;
}
}
- if (!token)
- goto exit_item_not_supported;
- cur_item = token;
- err = cur_item->validate(items,
- (const uint8_t *)cur_item->mask,
- cur_item->mask_sz);
- if (err)
+ if (!next)
goto exit_item_not_supported;
- if (flow->ibv_attr && cur_item->convert) {
- err = cur_item->convert(items,
- (cur_item->default_mask ?
- cur_item->default_mask :
- cur_item->mask),
- flow);
+ proc = next;
+ /*
+ * Perform basic sanity checks only once, while handle is
+ * not allocated.
+ */
+ if (flow == &temp) {
+ err = mlx4_flow_item_check(item, proc, error);
if (err)
- goto exit_item_not_supported;
+ return err;
}
- flow->offset += cur_item->dst_sz;
+ if (proc->merge) {
+ err = proc->merge(flow, item, proc, error);
+ if (err)
+ return err;
+ }
+ flow->ibv_attr_size += proc->dst_sz;
}
- /* Go over actions list */
- for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
- if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
- continue;
- } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
- action.drop = 1;
- } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
- const struct rte_flow_action_queue *queue =
- (const struct rte_flow_action_queue *)
- actions->conf;
+ /* Go over actions list. */
+ for (action = actions; action->type; ++action) {
+ switch (action->type) {
+ const struct rte_flow_action_queue *queue;
+ const struct rte_flow_action_rss *rss;
+ const struct rte_eth_rss_conf *rss_conf;
+ unsigned int i;
- if (!queue || (queue->index > (priv->rxqs_n - 1)))
+ case RTE_FLOW_ACTION_TYPE_VOID:
+ continue;
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ flow->drop = 1;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ if (flow->rss)
+ break;
+ queue = action->conf;
+ if (queue->index >= priv->dev->data->nb_rx_queues) {
+ msg = "queue target index beyond number of"
+ " configured Rx queues";
goto exit_action_not_supported;
- action.queue = 1;
- action.queues_n = 1;
- action.queues[0] = queue->index;
- } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
- int i;
- int ierr;
- const struct rte_flow_action_rss *rss =
- (const struct rte_flow_action_rss *)
- actions->conf;
-
- if (!priv->hw_rss) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ACTION,
- actions,
- "RSS cannot be used with "
- "the current configuration");
- return -rte_errno;
}
- if (!priv->isolated) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ACTION,
- actions,
- "RSS cannot be used without "
- "isolated mode");
- return -rte_errno;
+ flow->rss = mlx4_rss_get
+ (priv, 0, mlx4_rss_hash_key_default, 1,
+ &queue->index);
+ if (!flow->rss) {
+ msg = "not enough resources for additional"
+ " single-queue RSS context";
+ goto exit_action_not_supported;
+ }
+ break;
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ if (flow->rss)
+ break;
+ rss = action->conf;
+ /* Default RSS configuration if none is provided. */
+ rss_conf =
+ rss->rss_conf ?
+ rss->rss_conf :
+ &(struct rte_eth_rss_conf){
+ .rss_key = mlx4_rss_hash_key_default,
+ .rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
+ .rss_hf = (ETH_RSS_IPV4 |
+ ETH_RSS_NONFRAG_IPV4_TCP |
+ ETH_RSS_IPV6 |
+ ETH_RSS_NONFRAG_IPV6_TCP),
+ };
+ /* Sanity checks. */
+ for (i = 0; i < rss->num; ++i)
+ if (rss->queue[i] >=
+ priv->dev->data->nb_rx_queues)
+ break;
+ if (i != rss->num) {
+ msg = "queue index target beyond number of"
+ " configured Rx queues";
+ goto exit_action_not_supported;
}
if (!rte_is_power_of_2(rss->num)) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ACTION,
- actions,
- "the number of queues "
- "should be power of two");
- return -rte_errno;
+ msg = "for RSS, mlx4 requires the number of"
+ " queues to be a power of two";
+ goto exit_action_not_supported;
}
- if (priv->max_rss_tbl_sz < rss->num) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ACTION,
- actions,
- "the number of queues "
- "is too large");
- return -rte_errno;
+ if (rss_conf->rss_key_len !=
+ sizeof(flow->rss->key)) {
+ msg = "mlx4 supports exactly one RSS hash key"
+ " length: "
+ MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
+ goto exit_action_not_supported;
}
- /* checking indexes array */
- ierr = 0;
- for (i = 0; i < rss->num; ++i) {
- int j;
- if (rss->queue[i] >= priv->rxqs_n)
- ierr = 1;
- /*
- * Prevent the user from specifying
- * the same queue twice in the RSS array.
- */
- for (j = i + 1; j < rss->num && !ierr; ++j)
- if (rss->queue[j] == rss->queue[i])
- ierr = 1;
- if (ierr) {
- rte_flow_error_set(
- error,
- ENOTSUP,
- RTE_FLOW_ERROR_TYPE_HANDLE,
- NULL,
- "RSS action only supports "
- "unique queue indices "
- "in a list");
- return -rte_errno;
- }
+ for (i = 1; i < rss->num; ++i)
+ if (rss->queue[i] - rss->queue[i - 1] != 1)
+ break;
+ if (i != rss->num) {
+ msg = "mlx4 requires RSS contexts to use"
+ " consecutive queue indices only";
+ goto exit_action_not_supported;
}
- action.queue = 1;
- action.queues_n = rss->num;
- for (i = 0; i < rss->num; ++i)
- action.queues[i] = rss->queue[i];
- } else {
+ if (rss->queue[0] % rss->num) {
+ msg = "mlx4 requires the first queue of a RSS"
+ " context to be aligned on a multiple"
+ " of the context size";
+ goto exit_action_not_supported;
+ }
+ flow->rss = mlx4_rss_get
+ (priv, mlx4_conv_rss_hf(rss_conf->rss_hf),
+ rss_conf->rss_key, rss->num, rss->queue);
+ if (!flow->rss) {
+ msg = "either invalid parameters or not enough"
+ " resources for additional multi-queue"
+ " RSS context";
+ goto exit_action_not_supported;
+ }
+ break;
+ default:
goto exit_action_not_supported;
}
}
- if (!action.queue && !action.drop) {
- rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
- NULL, "no valid action");
- return -rte_errno;
+ if (!flow->rss && !flow->drop)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "no valid action");
+ /* Validation ends here. */
+ if (!addr) {
+ if (flow->rss)
+ mlx4_rss_put(flow->rss);
+ return 0;
}
+ if (flow == &temp) {
+ /* Allocate proper handle based on collected data. */
+ const struct mlx4_malloc_vec vec[] = {
+ {
+ .align = alignof(struct rte_flow),
+ .size = sizeof(*flow),
+ .addr = (void **)&flow,
+ },
+ {
+ .align = alignof(struct ibv_flow_attr),
+ .size = temp.ibv_attr_size,
+ .addr = (void **)&temp.ibv_attr,
+ },
+ };
+
+ if (!mlx4_zmallocv(__func__, vec, RTE_DIM(vec)))
+ return rte_flow_error_set
+ (error, -rte_errno,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "flow rule handle allocation failure");
+ /* Most fields will be updated by second pass. */
+ *flow = (struct rte_flow){
+ .ibv_attr = temp.ibv_attr,
+ .ibv_attr_size = sizeof(*flow->ibv_attr),
+ .rss = temp.rss,
+ };
+ *flow->ibv_attr = (struct ibv_flow_attr){
+ .type = IBV_FLOW_ATTR_NORMAL,
+ .size = sizeof(*flow->ibv_attr),
+ .priority = attr->priority,
+ .port = priv->port,
+ };
+ goto fill;
+ }
+ *addr = flow;
return 0;
exit_item_not_supported:
- rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
- items, "item not supported");
- return -rte_errno;
+ return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item, msg ? msg : "item not supported");
exit_action_not_supported:
- rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
- actions, "action not supported");
- return -rte_errno;
+ return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+ action, msg ? msg : "action not supported");
}
/**
@@ -769,552 +874,691 @@ exit_action_not_supported:
* @see rte_flow_validate()
* @see rte_flow_ops
*/
-int
+static int
mlx4_flow_validate(struct rte_eth_dev *dev,
const struct rte_flow_attr *attr,
- const struct rte_flow_item items[],
+ const struct rte_flow_item pattern[],
const struct rte_flow_action actions[],
struct rte_flow_error *error)
{
struct priv *priv = dev->data->dev_private;
- int ret;
- struct mlx4_flow flow = { .offset = sizeof(struct ibv_flow_attr) };
-
- priv_lock(priv);
- ret = priv_flow_validate(priv, attr, items, actions, error, &flow);
- priv_unlock(priv);
- return ret;
-}
-
-/**
- * Destroy a drop queue.
- *
- * @param priv
- * Pointer to private structure.
- */
-static void
-mlx4_flow_destroy_drop_queue(struct priv *priv)
-{
- if (priv->flow_drop_queue) {
- struct rte_flow_drop *fdq = priv->flow_drop_queue;
- priv->flow_drop_queue = NULL;
- claim_zero(ibv_destroy_qp(fdq->qp));
- claim_zero(ibv_destroy_cq(fdq->cq));
- rte_free(fdq);
- }
+ return mlx4_flow_prepare(priv, attr, pattern, actions, error, NULL);
}
/**
- * Create a single drop queue for all drop flows.
+ * Get a drop flow rule resources instance.
*
* @param priv
* Pointer to private structure.
*
* @return
- * 0 on success, negative value otherwise.
+ * Pointer to drop flow resources on success, NULL otherwise and rte_errno
+ * is set.
*/
-static int
-mlx4_flow_create_drop_queue(struct priv *priv)
+static struct mlx4_drop *
+mlx4_drop_get(struct priv *priv)
{
- struct ibv_qp *qp;
- struct ibv_cq *cq;
- struct rte_flow_drop *fdq;
+ struct mlx4_drop *drop = priv->drop;
- fdq = rte_calloc(__func__, 1, sizeof(*fdq), 0);
- if (!fdq) {
- ERROR("Cannot allocate memory for drop struct");
- goto err;
- }
- cq = ibv_exp_create_cq(priv->ctx, 1, NULL, NULL, 0,
- &(struct ibv_exp_cq_init_attr){
- .comp_mask = 0,
- });
- if (!cq) {
- ERROR("Cannot create drop CQ");
- goto err_create_cq;
- }
- qp = ibv_exp_create_qp(priv->ctx,
- &(struct ibv_exp_qp_init_attr){
- .send_cq = cq,
- .recv_cq = cq,
- .cap = {
- .max_recv_wr = 1,
- .max_recv_sge = 1,
- },
- .qp_type = IBV_QPT_RAW_PACKET,
- .comp_mask =
- IBV_EXP_QP_INIT_ATTR_PD |
- IBV_EXP_QP_INIT_ATTR_PORT,
- .pd = priv->pd,
- .port_num = priv->port,
- });
- if (!qp) {
- ERROR("Cannot create drop QP");
- goto err_create_qp;
+ if (drop) {
+ assert(drop->refcnt);
+ assert(drop->priv == priv);
+ ++drop->refcnt;
+ return drop;
}
- *fdq = (struct rte_flow_drop){
- .qp = qp,
- .cq = cq,
+ drop = rte_malloc(__func__, sizeof(*drop), 0);
+ if (!drop)
+ goto error;
+ *drop = (struct mlx4_drop){
+ .priv = priv,
+ .refcnt = 1,
};
- priv->flow_drop_queue = fdq;
- return 0;
-err_create_qp:
- claim_zero(ibv_destroy_cq(cq));
-err_create_cq:
- rte_free(fdq);
-err:
- return -1;
+ drop->cq = ibv_create_cq(priv->ctx, 1, NULL, NULL, 0);
+ if (!drop->cq)
+ goto error;
+ drop->qp = ibv_create_qp(priv->pd,
+ &(struct ibv_qp_init_attr){
+ .send_cq = drop->cq,
+ .recv_cq = drop->cq,
+ .qp_type = IBV_QPT_RAW_PACKET,
+ });
+ if (!drop->qp)
+ goto error;
+ priv->drop = drop;
+ return drop;
+error:
+ if (drop->qp)
+ claim_zero(ibv_destroy_qp(drop->qp));
+ if (drop->cq)
+ claim_zero(ibv_destroy_cq(drop->cq));
+ if (drop)
+ rte_free(drop);
+ rte_errno = ENOMEM;
+ return NULL;
}
/**
- * Get RSS parent rxq structure for given queues.
+ * Give back a drop flow rule resources instance.
*
- * Creates a new or returns an existed one.
- *
- * @param priv
- * Pointer to private structure.
- * @param queues
- * queues indices array, NULL in default RSS case.
- * @param children_n
- * the size of queues array.
- *
- * @return
- * Pointer to a parent rxq structure, NULL on failure.
+ * @param drop
+ * Pointer to drop flow rule resources.
*/
-static struct rxq *
-priv_parent_get(struct priv *priv,
- uint16_t queues[],
- uint16_t children_n,
- struct rte_flow_error *error)
+static void
+mlx4_drop_put(struct mlx4_drop *drop)
{
- unsigned int i;
- struct rxq *parent;
-
- for (parent = LIST_FIRST(&priv->parents);
- parent;
- parent = LIST_NEXT(parent, next)) {
- unsigned int same = 0;
- unsigned int overlap = 0;
-
- /*
- * Find out whether an appropriate parent queue already exists
- * and can be reused, otherwise make sure there are no overlaps.
- */
- for (i = 0; i < children_n; ++i) {
- unsigned int j;
-
- for (j = 0; j < parent->rss.queues_n; ++j) {
- if (parent->rss.queues[j] != queues[i])
- continue;
- ++overlap;
- if (i == j)
- ++same;
- }
- }
- if (same == children_n &&
- children_n == parent->rss.queues_n)
- return parent;
- else if (overlap)
- goto error;
- }
- /* Exclude the cases when some QPs were created without RSS */
- for (i = 0; i < children_n; ++i) {
- struct rxq *rxq = (*priv->rxqs)[queues[i]];
- if (rxq->qp)
- goto error;
- }
- parent = priv_parent_create(priv, queues, children_n);
- if (!parent) {
- rte_flow_error_set(error,
- ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
- NULL, "flow rule creation failure");
- return NULL;
- }
- return parent;
-
-error:
- rte_flow_error_set(error,
- EEXIST,
- RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
- NULL,
- "sharing a queue between several"
- " RSS groups is not supported");
- return NULL;
+ assert(drop->refcnt);
+ if (--drop->refcnt)
+ return;
+ drop->priv->drop = NULL;
+ claim_zero(ibv_destroy_qp(drop->qp));
+ claim_zero(ibv_destroy_cq(drop->cq));
+ rte_free(drop);
}
/**
- * Complete flow rule creation.
+ * Toggle a configured flow rule.
*
* @param priv
* Pointer to private structure.
- * @param ibv_attr
- * Verbs flow attributes.
- * @param action
- * Target action structure.
+ * @param flow
+ * Flow rule handle to toggle.
+ * @param enable
+ * Whether associated Verbs flow must be created or removed.
* @param[out] error
* Perform verbose error reporting if not NULL.
*
* @return
- * A flow if the rule could be created.
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
*/
-static struct rte_flow *
-priv_flow_create_action_queue(struct priv *priv,
- struct ibv_flow_attr *ibv_attr,
- struct mlx4_flow_action *action,
- struct rte_flow_error *error)
+static int
+mlx4_flow_toggle(struct priv *priv,
+ struct rte_flow *flow,
+ int enable,
+ struct rte_flow_error *error)
{
- struct ibv_qp *qp;
- struct rte_flow *rte_flow;
- struct rxq *rxq_parent = NULL;
+ struct ibv_qp *qp = NULL;
+ const char *msg;
+ int err;
- assert(priv->pd);
- assert(priv->ctx);
- rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow), 0);
- if (!rte_flow) {
- rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
- NULL, "cannot allocate flow memory");
- return NULL;
+ if (!enable) {
+ if (!flow->ibv_flow)
+ return 0;
+ claim_zero(ibv_destroy_flow(flow->ibv_flow));
+ flow->ibv_flow = NULL;
+ if (flow->drop)
+ mlx4_drop_put(priv->drop);
+ else if (flow->rss)
+ mlx4_rss_detach(flow->rss);
+ return 0;
}
- if (action->drop) {
- qp = priv->flow_drop_queue ? priv->flow_drop_queue->qp : NULL;
- } else {
- int ret;
+ assert(flow->ibv_attr);
+ if (!flow->internal &&
+ !priv->isolated &&
+ flow->ibv_attr->priority == MLX4_FLOW_PRIORITY_LAST) {
+ if (flow->ibv_flow) {
+ claim_zero(ibv_destroy_flow(flow->ibv_flow));
+ flow->ibv_flow = NULL;
+ if (flow->drop)
+ mlx4_drop_put(priv->drop);
+ else if (flow->rss)
+ mlx4_rss_detach(flow->rss);
+ }
+ err = EACCES;
+ msg = ("priority level "
+ MLX4_STR_EXPAND(MLX4_FLOW_PRIORITY_LAST)
+ " is reserved when not in isolated mode");
+ goto error;
+ }
+ if (flow->rss) {
+ struct mlx4_rss *rss = flow->rss;
+ int missing = 0;
unsigned int i;
- struct rxq *rxq = NULL;
- if (action->queues_n > 1) {
- rxq_parent = priv_parent_get(priv, action->queues,
- action->queues_n, error);
- if (!rxq_parent)
- goto error;
+ /* Stop at the first nonexistent target queue. */
+ for (i = 0; i != rss->queues; ++i)
+ if (rss->queue_id[i] >=
+ priv->dev->data->nb_rx_queues ||
+ !priv->dev->data->rx_queues[rss->queue_id[i]]) {
+ missing = 1;
+ break;
+ }
+ if (flow->ibv_flow) {
+ if (missing ^ !flow->drop)
+ return 0;
+ /* Verbs flow needs updating. */
+ claim_zero(ibv_destroy_flow(flow->ibv_flow));
+ flow->ibv_flow = NULL;
+ if (flow->drop)
+ mlx4_drop_put(priv->drop);
+ else
+ mlx4_rss_detach(rss);
}
- for (i = 0; i < action->queues_n; ++i) {
- rxq = (*priv->rxqs)[action->queues[i]];
- /*
- * In case of isolated mode we postpone
- * ibv receive queue creation till the first
- * rte_flow rule will be applied on that queue.
- */
- if (!rxq->qp) {
- assert(priv->isolated);
- ret = rxq_create_qp(rxq, rxq->elts_n,
- 0, 0, rxq_parent);
- if (ret) {
- rte_flow_error_set(
- error,
- ENOMEM,
- RTE_FLOW_ERROR_TYPE_HANDLE,
- NULL,
- "flow rule creation failure");
- goto error;
- }
+ if (!missing) {
+ err = mlx4_rss_attach(rss);
+ if (err) {
+ err = -err;
+ msg = "cannot create indirection table or hash"
+ " QP to associate flow rule with";
+ goto error;
}
+ qp = rss->qp;
}
- qp = action->queues_n > 1 ? rxq_parent->qp : rxq->qp;
- rte_flow->qp = qp;
+ /* A missing target queue drops traffic implicitly. */
+ flow->drop = missing;
}
- rte_flow->ibv_attr = ibv_attr;
- if (!priv->started)
- return rte_flow;
- rte_flow->ibv_flow = ibv_create_flow(qp, rte_flow->ibv_attr);
- if (!rte_flow->ibv_flow) {
- rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
- NULL, "flow rule creation failure");
- goto error;
+ if (flow->drop) {
+ mlx4_drop_get(priv);
+ if (!priv->drop) {
+ err = rte_errno;
+ msg = "resources for drop flow rule cannot be created";
+ goto error;
+ }
+ qp = priv->drop->qp;
}
- return rte_flow;
-
+ assert(qp);
+ if (flow->ibv_flow)
+ return 0;
+ flow->ibv_flow = ibv_create_flow(qp, flow->ibv_attr);
+ if (flow->ibv_flow)
+ return 0;
+ if (flow->drop)
+ mlx4_drop_put(priv->drop);
+ else if (flow->rss)
+ mlx4_rss_detach(flow->rss);
+ err = errno;
+ msg = "flow rule rejected by device";
error:
- if (rxq_parent)
- rxq_parent_cleanup(rxq_parent);
- rte_free(rte_flow);
- return NULL;
+ return rte_flow_error_set
+ (error, err, RTE_FLOW_ERROR_TYPE_HANDLE, flow, msg);
}
/**
- * Convert a flow.
- *
- * @param priv
- * Pointer to private structure.
- * @param[in] attr
- * Flow rule attributes.
- * @param[in] items
- * Pattern specification (list terminated by the END pattern item).
- * @param[in] actions
- * Associated actions (list terminated by the END action).
- * @param[out] error
- * Perform verbose error reporting if not NULL.
+ * Create a flow.
*
- * @return
- * A flow on success, NULL otherwise.
+ * @see rte_flow_create()
+ * @see rte_flow_ops
*/
static struct rte_flow *
-priv_flow_create(struct priv *priv,
+mlx4_flow_create(struct rte_eth_dev *dev,
const struct rte_flow_attr *attr,
- const struct rte_flow_item items[],
+ const struct rte_flow_item pattern[],
const struct rte_flow_action actions[],
struct rte_flow_error *error)
{
- struct rte_flow *rte_flow;
- struct mlx4_flow_action action;
- struct mlx4_flow flow = { .offset = sizeof(struct ibv_flow_attr), };
+ struct priv *priv = dev->data->dev_private;
+ struct rte_flow *flow;
int err;
- err = priv_flow_validate(priv, attr, items, actions, error, &flow);
+ err = mlx4_flow_prepare(priv, attr, pattern, actions, error, &flow);
if (err)
return NULL;
- flow.ibv_attr = rte_malloc(__func__, flow.offset, 0);
- if (!flow.ibv_attr) {
- rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
- NULL, "cannot allocate ibv_attr memory");
- return NULL;
- }
- flow.offset = sizeof(struct ibv_flow_attr);
- *flow.ibv_attr = (struct ibv_flow_attr){
- .comp_mask = 0,
- .type = IBV_FLOW_ATTR_NORMAL,
- .size = sizeof(struct ibv_flow_attr),
- .priority = attr->priority,
- .num_of_specs = 0,
- .port = priv->port,
- .flags = 0,
- };
- claim_zero(priv_flow_validate(priv, attr, items, actions,
- error, &flow));
- action = (struct mlx4_flow_action){
- .queue = 0,
- .drop = 0,
- };
- for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
- if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
- continue;
- } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
- action.queue = 1;
- action.queues_n = 1;
- action.queues[0] =
- ((const struct rte_flow_action_queue *)
- actions->conf)->index;
- } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
- action.drop = 1;
- } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
- unsigned int i;
- const struct rte_flow_action_rss *rss =
- (const struct rte_flow_action_rss *)
- actions->conf;
+ err = mlx4_flow_toggle(priv, flow, priv->started, error);
+ if (!err) {
+ struct rte_flow *curr = LIST_FIRST(&priv->flows);
- action.queue = 1;
- action.queues_n = rss->num;
- for (i = 0; i < rss->num; ++i)
- action.queues[i] = rss->queue[i];
+ /* New rules are inserted after internal ones. */
+ if (!curr || !curr->internal) {
+ LIST_INSERT_HEAD(&priv->flows, flow, next);
} else {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_ACTION,
- actions, "unsupported action");
- goto exit;
+ while (LIST_NEXT(curr, next) &&
+ LIST_NEXT(curr, next)->internal)
+ curr = LIST_NEXT(curr, next);
+ LIST_INSERT_AFTER(curr, flow, next);
}
+ return flow;
}
- rte_flow = priv_flow_create_action_queue(priv, flow.ibv_attr,
- &action, error);
- if (rte_flow)
- return rte_flow;
-exit:
- rte_free(flow.ibv_attr);
+ if (flow->rss)
+ mlx4_rss_put(flow->rss);
+ rte_flow_error_set(error, -err, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ error->message);
+ rte_free(flow);
return NULL;
}
/**
- * Create a flow.
+ * Configure isolated mode.
*
- * @see rte_flow_create()
- * @see rte_flow_ops
- */
-struct rte_flow *
-mlx4_flow_create(struct rte_eth_dev *dev,
- const struct rte_flow_attr *attr,
- const struct rte_flow_item items[],
- const struct rte_flow_action actions[],
- struct rte_flow_error *error)
-{
- struct priv *priv = dev->data->dev_private;
- struct rte_flow *flow;
-
- priv_lock(priv);
- flow = priv_flow_create(priv, attr, items, actions, error);
- if (flow) {
- LIST_INSERT_HEAD(&priv->flows, flow, next);
- DEBUG("Flow created %p", (void *)flow);
- }
- priv_unlock(priv);
- return flow;
-}
-
-/**
* @see rte_flow_isolate()
- *
- * Must be done before calling dev_configure().
- *
- * @param dev
- * Pointer to the ethernet device structure.
- * @param enable
- * Nonzero to enter isolated mode, attempt to leave it otherwise.
- * @param[out] error
- * Perform verbose error reporting if not NULL. PMDs initialize this
- * structure in case of error only.
- *
- * @return
- * 0 on success, a negative value on error.
+ * @see rte_flow_ops
*/
-int
+static int
mlx4_flow_isolate(struct rte_eth_dev *dev,
int enable,
struct rte_flow_error *error)
{
struct priv *priv = dev->data->dev_private;
- priv_lock(priv);
- if (priv->rxqs) {
- rte_flow_error_set(error, ENOTSUP,
- RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
- NULL, "isolated mode must be set"
- " before configuring the device");
- priv_unlock(priv);
+ if (!!enable == !!priv->isolated)
+ return 0;
+ priv->isolated = !!enable;
+ if (mlx4_flow_sync(priv, error)) {
+ priv->isolated = !enable;
return -rte_errno;
}
- priv->isolated = !!enable;
- priv_unlock(priv);
return 0;
}
/**
- * Destroy a flow.
+ * Destroy a flow rule.
*
- * @param priv
- * Pointer to private structure.
- * @param[in] flow
- * Flow to destroy.
+ * @see rte_flow_destroy()
+ * @see rte_flow_ops
*/
-static void
-priv_flow_destroy(struct priv *priv, struct rte_flow *flow)
+static int
+mlx4_flow_destroy(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ struct rte_flow_error *error)
{
- (void)priv;
+ struct priv *priv = dev->data->dev_private;
+ int err = mlx4_flow_toggle(priv, flow, 0, error);
+
+ if (err)
+ return err;
LIST_REMOVE(flow, next);
- if (flow->ibv_flow)
- claim_zero(ibv_destroy_flow(flow->ibv_flow));
- rte_free(flow->ibv_attr);
- DEBUG("Flow destroyed %p", (void *)flow);
+ if (flow->rss)
+ mlx4_rss_put(flow->rss);
rte_free(flow);
+ return 0;
}
/**
- * Destroy a flow.
+ * Destroy user-configured flow rules.
*
- * @see rte_flow_destroy()
+ * This function skips internal flows rules.
+ *
+ * @see rte_flow_flush()
* @see rte_flow_ops
*/
-int
-mlx4_flow_destroy(struct rte_eth_dev *dev,
- struct rte_flow *flow,
- struct rte_flow_error *error)
+static int
+mlx4_flow_flush(struct rte_eth_dev *dev,
+ struct rte_flow_error *error)
{
struct priv *priv = dev->data->dev_private;
+ struct rte_flow *flow = LIST_FIRST(&priv->flows);
+
+ while (flow) {
+ struct rte_flow *next = LIST_NEXT(flow, next);
- (void)error;
- priv_lock(priv);
- priv_flow_destroy(priv, flow);
- priv_unlock(priv);
+ if (!flow->internal)
+ mlx4_flow_destroy(dev, flow, error);
+ flow = next;
+ }
return 0;
}
/**
- * Destroy all flows.
+ * Helper function to determine the next configured VLAN filter.
*
* @param priv
* Pointer to private structure.
+ * @param vlan
+ * VLAN ID to use as a starting point.
+ *
+ * @return
+ * Next configured VLAN ID or a high value (>= 4096) if there is none.
*/
-static void
-priv_flow_flush(struct priv *priv)
+static uint16_t
+mlx4_flow_internal_next_vlan(struct priv *priv, uint16_t vlan)
+{
+ while (vlan < 4096) {
+ if (priv->dev->data->vlan_filter_conf.ids[vlan / 64] &
+ (UINT64_C(1) << (vlan % 64)))
+ return vlan;
+ ++vlan;
+ }
+ return vlan;
+}
+
+/**
+ * Generate internal flow rules.
+ *
+ * Various flow rules are created depending on the mode the device is in:
+ *
+ * 1. Promiscuous: port MAC + catch-all (VLAN filtering is ignored).
+ * 2. All multicast: port MAC/VLAN + catch-all multicast.
+ * 3. Otherwise: port MAC/VLAN + broadcast MAC/VLAN.
+ *
+ * About MAC flow rules:
+ *
+ * - MAC flow rules are generated from @p dev->data->mac_addrs
+ * (@p priv->mac array).
+ * - An additional flow rule for Ethernet broadcasts is also generated.
+ * - All these are per-VLAN if @p dev->data->dev_conf.rxmode.hw_vlan_filter
+ * is enabled and VLAN filters are configured.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
{
- while (!LIST_EMPTY(&priv->flows)) {
- struct rte_flow *flow;
+ struct rte_flow_attr attr = {
+ .priority = MLX4_FLOW_PRIORITY_LAST,
+ .ingress = 1,
+ };
+ struct rte_flow_item_eth eth_spec;
+ const struct rte_flow_item_eth eth_mask = {
+ .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+ };
+ const struct rte_flow_item_eth eth_allmulti = {
+ .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
+ };
+ struct rte_flow_item_vlan vlan_spec;
+ const struct rte_flow_item_vlan vlan_mask = {
+ .tci = RTE_BE16(0x0fff),
+ };
+ struct rte_flow_item pattern[] = {
+ {
+ .type = MLX4_FLOW_ITEM_TYPE_INTERNAL,
+ },
+ {
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ .spec = &eth_spec,
+ .mask = &eth_mask,
+ },
+ {
+ /* Replaced with VLAN if filtering is enabled. */
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ {
+ .type = RTE_FLOW_ITEM_TYPE_END,
+ },
+ };
+ /*
+ * Round number of queues down to their previous power of 2 to
+ * comply with RSS context limitations. Extra queues silently do not
+ * get RSS by default.
+ */
+ uint32_t queues =
+ rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
+ alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
+ [offsetof(struct rte_flow_action_rss, queue) +
+ sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
+ struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+ struct rte_flow_action actions[] = {
+ {
+ .type = RTE_FLOW_ACTION_TYPE_RSS,
+ .conf = rss_conf,
+ },
+ {
+ .type = RTE_FLOW_ACTION_TYPE_END,
+ },
+ };
+ struct ether_addr *rule_mac = &eth_spec.dst;
+ rte_be16_t *rule_vlan =
+ priv->dev->data->dev_conf.rxmode.hw_vlan_filter &&
+ !priv->dev->data->promiscuous ?
+ &vlan_spec.tci :
+ NULL;
+ int broadcast =
+ !priv->dev->data->promiscuous &&
+ !priv->dev->data->all_multicast;
+ uint16_t vlan = 0;
+ struct rte_flow *flow;
+ unsigned int i;
+ int err = 0;
- flow = LIST_FIRST(&priv->flows);
- priv_flow_destroy(priv, flow);
+ /* Nothing to be done if there are no Rx queues. */
+ if (!queues)
+ goto error;
+ /* Prepare default RSS configuration. */
+ *rss_conf = (struct rte_flow_action_rss){
+ .rss_conf = NULL, /* Rely on default fallback settings. */
+ .num = queues,
+ };
+ for (i = 0; i != queues; ++i)
+ rss_conf->queue[i] = i;
+ /*
+ * Set up VLAN item if filtering is enabled and at least one VLAN
+ * filter is configured.
+ */
+ if (rule_vlan) {
+ vlan = mlx4_flow_internal_next_vlan(priv, 0);
+ if (vlan < 4096) {
+ pattern[2] = (struct rte_flow_item){
+ .type = RTE_FLOW_ITEM_TYPE_VLAN,
+ .spec = &vlan_spec,
+ .mask = &vlan_mask,
+ };
+next_vlan:
+ *rule_vlan = rte_cpu_to_be_16(vlan);
+ } else {
+ rule_vlan = NULL;
+ }
}
+ for (i = 0; i != RTE_DIM(priv->mac) + broadcast; ++i) {
+ const struct ether_addr *mac;
+
+ /* Broadcasts are handled by an extra iteration. */
+ if (i < RTE_DIM(priv->mac))
+ mac = &priv->mac[i];
+ else
+ mac = &eth_mask.dst;
+ if (is_zero_ether_addr(mac))
+ continue;
+ /* Check if MAC flow rule is already present. */
+ for (flow = LIST_FIRST(&priv->flows);
+ flow && flow->internal;
+ flow = LIST_NEXT(flow, next)) {
+ const struct ibv_flow_spec_eth *eth =
+ (const void *)((uintptr_t)flow->ibv_attr +
+ sizeof(*flow->ibv_attr));
+ unsigned int j;
+
+ if (!flow->mac)
+ continue;
+ assert(flow->ibv_attr->type == IBV_FLOW_ATTR_NORMAL);
+ assert(flow->ibv_attr->num_of_specs == 1);
+ assert(eth->type == IBV_FLOW_SPEC_ETH);
+ assert(flow->rss);
+ if (rule_vlan &&
+ (eth->val.vlan_tag != *rule_vlan ||
+ eth->mask.vlan_tag != RTE_BE16(0x0fff)))
+ continue;
+ if (!rule_vlan && eth->mask.vlan_tag)
+ continue;
+ for (j = 0; j != sizeof(mac->addr_bytes); ++j)
+ if (eth->val.dst_mac[j] != mac->addr_bytes[j] ||
+ eth->mask.dst_mac[j] != UINT8_C(0xff) ||
+ eth->val.src_mac[j] != UINT8_C(0x00) ||
+ eth->mask.src_mac[j] != UINT8_C(0x00))
+ break;
+ if (j != sizeof(mac->addr_bytes))
+ continue;
+ if (flow->rss->queues != queues ||
+ memcmp(flow->rss->queue_id, rss_conf->queue,
+ queues * sizeof(flow->rss->queue_id[0])))
+ continue;
+ break;
+ }
+ if (!flow || !flow->internal) {
+ /* Not found, create a new flow rule. */
+ memcpy(rule_mac, mac, sizeof(*mac));
+ flow = mlx4_flow_create(priv->dev, &attr, pattern,
+ actions, error);
+ if (!flow) {
+ err = -rte_errno;
+ goto error;
+ }
+ }
+ flow->select = 1;
+ flow->mac = 1;
+ }
+ if (rule_vlan) {
+ vlan = mlx4_flow_internal_next_vlan(priv, vlan + 1);
+ if (vlan < 4096)
+ goto next_vlan;
+ }
+ /* Take care of promiscuous and all multicast flow rules. */
+ if (!broadcast) {
+ for (flow = LIST_FIRST(&priv->flows);
+ flow && flow->internal;
+ flow = LIST_NEXT(flow, next)) {
+ if (priv->dev->data->promiscuous) {
+ if (flow->promisc)
+ break;
+ } else {
+ assert(priv->dev->data->all_multicast);
+ if (flow->allmulti)
+ break;
+ }
+ }
+ if (flow && flow->internal) {
+ assert(flow->rss);
+ if (flow->rss->queues != queues ||
+ memcmp(flow->rss->queue_id, rss_conf->queue,
+ queues * sizeof(flow->rss->queue_id[0])))
+ flow = NULL;
+ }
+ if (!flow || !flow->internal) {
+ /* Not found, create a new flow rule. */
+ if (priv->dev->data->promiscuous) {
+ pattern[1].spec = NULL;
+ pattern[1].mask = NULL;
+ } else {
+ assert(priv->dev->data->all_multicast);
+ pattern[1].spec = &eth_allmulti;
+ pattern[1].mask = &eth_allmulti;
+ }
+ pattern[2] = pattern[3];
+ flow = mlx4_flow_create(priv->dev, &attr, pattern,
+ actions, error);
+ if (!flow) {
+ err = -rte_errno;
+ goto error;
+ }
+ }
+ assert(flow->promisc || flow->allmulti);
+ flow->select = 1;
+ }
+error:
+ /* Clear selection and clean up stale internal flow rules. */
+ flow = LIST_FIRST(&priv->flows);
+ while (flow && flow->internal) {
+ struct rte_flow *next = LIST_NEXT(flow, next);
+
+ if (!flow->select)
+ claim_zero(mlx4_flow_destroy(priv->dev, flow, error));
+ else
+ flow->select = 0;
+ flow = next;
+ }
+ return err;
}
/**
- * Destroy all flows.
+ * Synchronize flow rules.
*
- * @see rte_flow_flush()
- * @see rte_flow_ops
+ * This function synchronizes flow rules with the state of the device by
+ * taking into account isolated mode and whether target queues are
+ * configured.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
*/
int
-mlx4_flow_flush(struct rte_eth_dev *dev,
- struct rte_flow_error *error)
+mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error)
{
- struct priv *priv = dev->data->dev_private;
+ struct rte_flow *flow;
+ int ret;
- (void)error;
- priv_lock(priv);
- priv_flow_flush(priv);
- priv_unlock(priv);
+ /* Internal flow rules are guaranteed to come first in the list. */
+ if (priv->isolated) {
+ /*
+ * Get rid of them in isolated mode, stop at the first
+ * non-internal rule found.
+ */
+ for (flow = LIST_FIRST(&priv->flows);
+ flow && flow->internal;
+ flow = LIST_FIRST(&priv->flows))
+ claim_zero(mlx4_flow_destroy(priv->dev, flow, error));
+ } else {
+ /* Refresh internal rules. */
+ ret = mlx4_flow_internal(priv, error);
+ if (ret)
+ return ret;
+ }
+ /* Toggle the remaining flow rules . */
+ LIST_FOREACH(flow, &priv->flows, next) {
+ ret = mlx4_flow_toggle(priv, flow, priv->started, error);
+ if (ret)
+ return ret;
+ }
+ if (!priv->started)
+ assert(!priv->drop);
return 0;
}
/**
- * Remove all flows.
+ * Clean up all flow rules.
*
- * Called by dev_stop() to remove all flows.
+ * Unlike mlx4_flow_flush(), this function takes care of all remaining flow
+ * rules regardless of whether they are internal or user-configured.
*
* @param priv
* Pointer to private structure.
*/
void
-mlx4_priv_flow_stop(struct priv *priv)
+mlx4_flow_clean(struct priv *priv)
{
struct rte_flow *flow;
- for (flow = LIST_FIRST(&priv->flows);
- flow;
- flow = LIST_NEXT(flow, next)) {
- claim_zero(ibv_destroy_flow(flow->ibv_flow));
- flow->ibv_flow = NULL;
- DEBUG("Flow %p removed", (void *)flow);
- }
- mlx4_flow_destroy_drop_queue(priv);
+ while ((flow = LIST_FIRST(&priv->flows)))
+ mlx4_flow_destroy(priv->dev, flow, NULL);
+ assert(LIST_EMPTY(&priv->rss));
}
+static const struct rte_flow_ops mlx4_flow_ops = {
+ .validate = mlx4_flow_validate,
+ .create = mlx4_flow_create,
+ .destroy = mlx4_flow_destroy,
+ .flush = mlx4_flow_flush,
+ .isolate = mlx4_flow_isolate,
+};
+
/**
- * Add all flows.
+ * Manage filter operations.
*
- * @param priv
- * Pointer to private structure.
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param filter_type
+ * Filter type.
+ * @param filter_op
+ * Operation to perform.
+ * @param arg
+ * Pointer to operation-specific structure.
*
* @return
- * 0 on success, a errno value otherwise and rte_errno is set.
+ * 0 on success, negative errno value otherwise and rte_errno is set.
*/
int
-mlx4_priv_flow_start(struct priv *priv)
+mlx4_filter_ctrl(struct rte_eth_dev *dev,
+ enum rte_filter_type filter_type,
+ enum rte_filter_op filter_op,
+ void *arg)
{
- int ret;
- struct ibv_qp *qp;
- struct rte_flow *flow;
-
- ret = mlx4_flow_create_drop_queue(priv);
- if (ret)
- return -1;
- for (flow = LIST_FIRST(&priv->flows);
- flow;
- flow = LIST_NEXT(flow, next)) {
- qp = flow->qp ? flow->qp : priv->flow_drop_queue->qp;
- flow->ibv_flow = ibv_create_flow(qp, flow->ibv_attr);
- if (!flow->ibv_flow) {
- DEBUG("Flow %p cannot be applied", (void *)flow);
- rte_errno = EINVAL;
- return rte_errno;
- }
- DEBUG("Flow %p applied", (void *)flow);
+ switch (filter_type) {
+ case RTE_ETH_FILTER_GENERIC:
+ if (filter_op != RTE_ETH_FILTER_GET)
+ break;
+ *(const void **)arg = &mlx4_flow_ops;
+ return 0;
+ default:
+ ERROR("%p: filter type (%d) not supported",
+ (void *)dev, filter_type);
+ break;
}
- return 0;
+ rte_errno = ENOTSUP;
+ return -rte_errno;
}
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index beabcf2d..651fd37b 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -2,7 +2,7 @@
* BSD LICENSE
*
* Copyright 2017 6WIND S.A.
- * Copyright 2017 Mellanox.
+ * Copyright 2017 Mellanox
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -34,12 +34,10 @@
#ifndef RTE_PMD_MLX4_FLOW_H_
#define RTE_PMD_MLX4_FLOW_H_
-#include <stddef.h>
#include <stdint.h>
#include <sys/queue.h>
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
@@ -48,61 +46,40 @@
#pragma GCC diagnostic error "-Wpedantic"
#endif
+#include <rte_eth_ctrl.h>
+#include <rte_ethdev.h>
#include <rte_flow.h>
#include <rte_flow_driver.h>
#include <rte_byteorder.h>
-#include "mlx4.h"
+/** Last and lowest priority level for a flow rule. */
+#define MLX4_FLOW_PRIORITY_LAST UINT32_C(0xfff)
+/** Meta pattern item used to distinguish internal rules. */
+#define MLX4_FLOW_ITEM_TYPE_INTERNAL ((enum rte_flow_item_type)-1)
+
+/** PMD-specific (mlx4) definition of a flow rule handle. */
struct rte_flow {
LIST_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
struct ibv_flow *ibv_flow; /**< Verbs flow. */
struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */
- struct ibv_qp *qp; /**< Verbs queue pair. */
+ uint32_t ibv_attr_size; /**< Size of Verbs attributes. */
+ uint32_t select:1; /**< Used by operations on the linked list. */
+ uint32_t internal:1; /**< Internal flow rule outside isolated mode. */
+ uint32_t mac:1; /**< Rule associated with a configured MAC address. */
+ uint32_t promisc:1; /**< This rule matches everything. */
+ uint32_t allmulti:1; /**< This rule matches all multicast traffic. */
+ uint32_t drop:1; /**< This rule drops packets. */
+ struct mlx4_rss *rss; /**< Rx target. */
};
-int
-mlx4_flow_validate(struct rte_eth_dev *dev,
- const struct rte_flow_attr *attr,
- const struct rte_flow_item items[],
- const struct rte_flow_action actions[],
- struct rte_flow_error *error);
-
-struct rte_flow *
-mlx4_flow_create(struct rte_eth_dev *dev,
- const struct rte_flow_attr *attr,
- const struct rte_flow_item items[],
- const struct rte_flow_action actions[],
- struct rte_flow_error *error);
-
-int
-mlx4_flow_destroy(struct rte_eth_dev *dev,
- struct rte_flow *flow,
- struct rte_flow_error *error);
-
-int
-mlx4_flow_flush(struct rte_eth_dev *dev,
- struct rte_flow_error *error);
-
-/** Structure to pass to the conversion function. */
-struct mlx4_flow {
- struct ibv_flow_attr *ibv_attr; /**< Verbs attribute. */
- unsigned int offset; /**< Offset in bytes in the ibv_attr buffer. */
-};
-
-int
-mlx4_flow_isolate(struct rte_eth_dev *dev,
- int enable,
- struct rte_flow_error *error);
-
-struct mlx4_flow_action {
- uint32_t drop:1; /**< Target is a drop queue. */
- uint32_t queue:1; /**< Target is a receive queue. */
- uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queue indices to use. */
- uint16_t queues_n; /**< Number of entries in queue[] */
-};
+/* mlx4_flow.c */
-int mlx4_priv_flow_start(struct priv *priv);
-void mlx4_priv_flow_stop(struct priv *priv);
+int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
+void mlx4_flow_clean(struct priv *priv);
+int mlx4_filter_ctrl(struct rte_eth_dev *dev,
+ enum rte_filter_type filter_type,
+ enum rte_filter_op filter_op,
+ void *arg);
#endif /* RTE_PMD_MLX4_FLOW_H_ */
diff --git a/drivers/net/mlx4/mlx4_intr.c b/drivers/net/mlx4/mlx4_intr.c
new file mode 100644
index 00000000..b17d109a
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_intr.c
@@ -0,0 +1,397 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Interrupts handling for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_alarm.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_io.h>
+#include <rte_interrupts.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+static int mlx4_link_status_check(struct priv *priv);
+
+/**
+ * Clean up Rx interrupts handler.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+static void
+mlx4_rx_intr_vec_disable(struct priv *priv)
+{
+ struct rte_intr_handle *intr_handle = &priv->intr_handle;
+
+ rte_intr_free_epoll_fd(intr_handle);
+ free(intr_handle->intr_vec);
+ intr_handle->nb_efd = 0;
+ intr_handle->intr_vec = NULL;
+}
+
+/**
+ * Allocate queue vector and fill epoll fd list for Rx interrupts.
+ *
+ * @param priv
+ * Pointer to private structure.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_rx_intr_vec_enable(struct priv *priv)
+{
+ unsigned int i;
+ unsigned int rxqs_n = priv->dev->data->nb_rx_queues;
+ unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+ unsigned int count = 0;
+ struct rte_intr_handle *intr_handle = &priv->intr_handle;
+
+ mlx4_rx_intr_vec_disable(priv);
+ intr_handle->intr_vec = malloc(sizeof(intr_handle->intr_vec[rxqs_n]));
+ if (intr_handle->intr_vec == NULL) {
+ rte_errno = ENOMEM;
+ ERROR("failed to allocate memory for interrupt vector,"
+ " Rx interrupts will not be supported");
+ return -rte_errno;
+ }
+ for (i = 0; i != n; ++i) {
+ struct rxq *rxq = priv->dev->data->rx_queues[i];
+
+ /* Skip queues that cannot request interrupts. */
+ if (!rxq || !rxq->channel) {
+ /* Use invalid intr_vec[] index to disable entry. */
+ intr_handle->intr_vec[i] =
+ RTE_INTR_VEC_RXTX_OFFSET +
+ RTE_MAX_RXTX_INTR_VEC_ID;
+ continue;
+ }
+ if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
+ rte_errno = E2BIG;
+ ERROR("too many Rx queues for interrupt vector size"
+ " (%d), Rx interrupts cannot be enabled",
+ RTE_MAX_RXTX_INTR_VEC_ID);
+ mlx4_rx_intr_vec_disable(priv);
+ return -rte_errno;
+ }
+ intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
+ intr_handle->efds[count] = rxq->channel->fd;
+ count++;
+ }
+ if (!count)
+ mlx4_rx_intr_vec_disable(priv);
+ else
+ intr_handle->nb_efd = count;
+ return 0;
+}
+
+/**
+ * Process scheduled link status check.
+ *
+ * If LSC interrupts are requested, process related callback.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+static void
+mlx4_link_status_alarm(struct priv *priv)
+{
+ const struct rte_intr_conf *const intr_conf =
+ &priv->dev->data->dev_conf.intr_conf;
+
+ assert(priv->intr_alarm == 1);
+ priv->intr_alarm = 0;
+ if (intr_conf->lsc && !mlx4_link_status_check(priv))
+ _rte_eth_dev_callback_process(priv->dev,
+ RTE_ETH_EVENT_INTR_LSC,
+ NULL, NULL);
+}
+
+/**
+ * Check link status.
+ *
+ * In case of inconsistency, another check is scheduled.
+ *
+ * @param priv
+ * Pointer to private structure.
+ *
+ * @return
+ * 0 on success (link status is consistent), negative errno value
+ * otherwise and rte_errno is set.
+ */
+static int
+mlx4_link_status_check(struct priv *priv)
+{
+ struct rte_eth_link *link = &priv->dev->data->dev_link;
+ int ret = mlx4_link_update(priv->dev, 0);
+
+ if (ret)
+ return ret;
+ if ((!link->link_speed && link->link_status) ||
+ (link->link_speed && !link->link_status)) {
+ if (!priv->intr_alarm) {
+ /* Inconsistent status, check again later. */
+ ret = rte_eal_alarm_set(MLX4_INTR_ALARM_TIMEOUT,
+ (void (*)(void *))
+ mlx4_link_status_alarm,
+ priv);
+ if (ret)
+ return ret;
+ priv->intr_alarm = 1;
+ }
+ rte_errno = EINPROGRESS;
+ return -rte_errno;
+ }
+ return 0;
+}
+
+/**
+ * Handle interrupts from the NIC.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+static void
+mlx4_interrupt_handler(struct priv *priv)
+{
+ enum { LSC, RMV, };
+ static const enum rte_eth_event_type type[] = {
+ [LSC] = RTE_ETH_EVENT_INTR_LSC,
+ [RMV] = RTE_ETH_EVENT_INTR_RMV,
+ };
+ uint32_t caught[RTE_DIM(type)] = { 0 };
+ struct ibv_async_event event;
+ const struct rte_intr_conf *const intr_conf =
+ &priv->dev->data->dev_conf.intr_conf;
+ unsigned int i;
+
+ /* Read all message and acknowledge them. */
+ while (!ibv_get_async_event(priv->ctx, &event)) {
+ switch (event.event_type) {
+ case IBV_EVENT_PORT_ACTIVE:
+ case IBV_EVENT_PORT_ERR:
+ if (intr_conf->lsc && !mlx4_link_status_check(priv))
+ ++caught[LSC];
+ break;
+ case IBV_EVENT_DEVICE_FATAL:
+ if (intr_conf->rmv)
+ ++caught[RMV];
+ break;
+ default:
+ DEBUG("event type %d on physical port %d not handled",
+ event.event_type, event.element.port_num);
+ }
+ ibv_ack_async_event(&event);
+ }
+ for (i = 0; i != RTE_DIM(caught); ++i)
+ if (caught[i])
+ _rte_eth_dev_callback_process(priv->dev, type[i],
+ NULL, NULL);
+}
+
+/**
+ * MLX4 CQ notification .
+ *
+ * @param rxq
+ * Pointer to receive queue structure.
+ * @param solicited
+ * Is request solicited or not.
+ */
+static void
+mlx4_arm_cq(struct rxq *rxq, int solicited)
+{
+ struct mlx4_cq *cq = &rxq->mcq;
+ uint64_t doorbell;
+ uint32_t sn = cq->arm_sn & MLX4_CQ_DB_GEQ_N_MASK;
+ uint32_t ci = cq->cons_index & MLX4_CQ_DB_CI_MASK;
+ uint32_t cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT;
+
+ *cq->arm_db = rte_cpu_to_be_32(sn << 28 | cmd | ci);
+ /*
+ * Make sure that the doorbell record in host memory is
+ * written before ringing the doorbell via PCI MMIO.
+ */
+ rte_wmb();
+ doorbell = sn << 28 | cmd | cq->cqn;
+ doorbell <<= 32;
+ doorbell |= ci;
+ rte_write64(rte_cpu_to_be_64(doorbell), cq->cq_db_reg);
+}
+
+/**
+ * Uninstall interrupt handler.
+ *
+ * @param priv
+ * Pointer to private structure.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_intr_uninstall(struct priv *priv)
+{
+ int err = rte_errno; /* Make sure rte_errno remains unchanged. */
+
+ if (priv->intr_handle.fd != -1) {
+ rte_intr_callback_unregister(&priv->intr_handle,
+ (void (*)(void *))
+ mlx4_interrupt_handler,
+ priv);
+ priv->intr_handle.fd = -1;
+ }
+ rte_eal_alarm_cancel((void (*)(void *))mlx4_link_status_alarm, priv);
+ priv->intr_alarm = 0;
+ mlx4_rx_intr_vec_disable(priv);
+ rte_errno = err;
+ return 0;
+}
+
+/**
+ * Install interrupt handler.
+ *
+ * @param priv
+ * Pointer to private structure.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_intr_install(struct priv *priv)
+{
+ const struct rte_intr_conf *const intr_conf =
+ &priv->dev->data->dev_conf.intr_conf;
+ int rc;
+
+ mlx4_intr_uninstall(priv);
+ if (intr_conf->rxq && mlx4_rx_intr_vec_enable(priv) < 0)
+ goto error;
+ if (intr_conf->lsc | intr_conf->rmv) {
+ priv->intr_handle.fd = priv->ctx->async_fd;
+ rc = rte_intr_callback_register(&priv->intr_handle,
+ (void (*)(void *))
+ mlx4_interrupt_handler,
+ priv);
+ if (rc < 0) {
+ rte_errno = -rc;
+ goto error;
+ }
+ }
+ return 0;
+error:
+ mlx4_intr_uninstall(priv);
+ return -rte_errno;
+}
+
+/**
+ * DPDK callback for Rx queue interrupt disable.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * Rx queue index.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct rxq *rxq = dev->data->rx_queues[idx];
+ struct ibv_cq *ev_cq;
+ void *ev_ctx;
+ int ret;
+
+ if (!rxq || !rxq->channel) {
+ ret = EINVAL;
+ } else {
+ ret = ibv_get_cq_event(rxq->cq->channel, &ev_cq, &ev_ctx);
+ if (ret || ev_cq != rxq->cq)
+ ret = EINVAL;
+ }
+ if (ret) {
+ rte_errno = ret;
+ WARN("unable to disable interrupt on rx queue %d",
+ idx);
+ } else {
+ rxq->mcq.arm_sn++;
+ ibv_ack_cq_events(rxq->cq, 1);
+ }
+ return -ret;
+}
+
+/**
+ * DPDK callback for Rx queue interrupt enable.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * Rx queue index.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct rxq *rxq = dev->data->rx_queues[idx];
+ int ret = 0;
+
+ if (!rxq || !rxq->channel) {
+ ret = EINVAL;
+ rte_errno = ret;
+ WARN("unable to arm interrupt on rx queue %d", idx);
+ } else {
+ mlx4_arm_cq(rxq, 0);
+ }
+ return -ret;
+}
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
new file mode 100644
index 00000000..2a3e2695
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -0,0 +1,293 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Memory management functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#include <rte_mempool.h>
+#include <rte_spinlock.h>
+
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+struct mlx4_check_mempool_data {
+ int ret;
+ char *start;
+ char *end;
+};
+
+/**
+ * Called by mlx4_check_mempool() when iterating the memory chunks.
+ *
+ * @param[in] mp
+ * Pointer to memory pool (unused).
+ * @param[in, out] data
+ * Pointer to shared buffer with mlx4_check_mempool().
+ * @param[in] memhdr
+ * Pointer to mempool chunk header.
+ * @param mem_idx
+ * Mempool element index (unused).
+ */
+static void
+mlx4_check_mempool_cb(struct rte_mempool *mp, void *opaque,
+ struct rte_mempool_memhdr *memhdr,
+ unsigned int mem_idx)
+{
+ struct mlx4_check_mempool_data *data = opaque;
+
+ (void)mp;
+ (void)mem_idx;
+ /* It already failed, skip the next chunks. */
+ if (data->ret != 0)
+ return;
+ /* It is the first chunk. */
+ if (data->start == NULL && data->end == NULL) {
+ data->start = memhdr->addr;
+ data->end = data->start + memhdr->len;
+ return;
+ }
+ if (data->end == memhdr->addr) {
+ data->end += memhdr->len;
+ return;
+ }
+ if (data->start == (char *)memhdr->addr + memhdr->len) {
+ data->start -= memhdr->len;
+ return;
+ }
+ /* Error, mempool is not virtually contiguous. */
+ data->ret = -1;
+}
+
+/**
+ * Check if a mempool can be used: it must be virtually contiguous.
+ *
+ * @param[in] mp
+ * Pointer to memory pool.
+ * @param[out] start
+ * Pointer to the start address of the mempool virtual memory area.
+ * @param[out] end
+ * Pointer to the end address of the mempool virtual memory area.
+ *
+ * @return
+ * 0 on success (mempool is virtually contiguous), -1 on error.
+ */
+static int
+mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, uintptr_t *end)
+{
+ struct mlx4_check_mempool_data data;
+
+ memset(&data, 0, sizeof(data));
+ rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &data);
+ *start = (uintptr_t)data.start;
+ *end = (uintptr_t)data.end;
+ return data.ret;
+}
+
+/**
+ * Obtain a memory region from a memory pool.
+ *
+ * If a matching memory region already exists, it is returned with its
+ * reference count incremented, otherwise a new one is registered.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param mp
+ * Pointer to memory pool.
+ *
+ * @return
+ * Memory region pointer, NULL in case of error and rte_errno is set.
+ */
+struct mlx4_mr *
+mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
+{
+ const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ uintptr_t start;
+ uintptr_t end;
+ unsigned int i;
+ struct mlx4_mr *mr;
+
+ if (mlx4_check_mempool(mp, &start, &end) != 0) {
+ rte_errno = EINVAL;
+ ERROR("mempool %p: not virtually contiguous",
+ (void *)mp);
+ return NULL;
+ }
+ DEBUG("mempool %p area start=%p end=%p size=%zu",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ /* Round start and end to page boundary if found in memory segments. */
+ for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+ uintptr_t addr = (uintptr_t)ms[i].addr;
+ size_t len = ms[i].len;
+ unsigned int align = ms[i].hugepage_sz;
+
+ if ((start > addr) && (start < addr + len))
+ start = RTE_ALIGN_FLOOR(start, align);
+ if ((end > addr) && (end < addr + len))
+ end = RTE_ALIGN_CEIL(end, align);
+ }
+ DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ rte_spinlock_lock(&priv->mr_lock);
+ LIST_FOREACH(mr, &priv->mr, next)
+ if (mp == mr->mp && start >= mr->start && end <= mr->end)
+ break;
+ if (mr) {
+ ++mr->refcnt;
+ goto release;
+ }
+ mr = rte_malloc(__func__, sizeof(*mr), 0);
+ if (!mr) {
+ rte_errno = ENOMEM;
+ goto release;
+ }
+ *mr = (struct mlx4_mr){
+ .start = start,
+ .end = end,
+ .refcnt = 1,
+ .priv = priv,
+ .mr = ibv_reg_mr(priv->pd, (void *)start, end - start,
+ IBV_ACCESS_LOCAL_WRITE),
+ .mp = mp,
+ };
+ if (mr->mr) {
+ mr->lkey = mr->mr->lkey;
+ LIST_INSERT_HEAD(&priv->mr, mr, next);
+ } else {
+ rte_free(mr);
+ mr = NULL;
+ rte_errno = errno ? errno : EINVAL;
+ }
+release:
+ rte_spinlock_unlock(&priv->mr_lock);
+ return mr;
+}
+
+/**
+ * Release a memory region.
+ *
+ * This function decrements its reference count and destroys it after
+ * reaching 0.
+ *
+ * Note to avoid race conditions given this function may be used from the
+ * data plane, it's extremely important that each user holds its own
+ * reference.
+ *
+ * @param mr
+ * Memory region to release.
+ */
+void
+mlx4_mr_put(struct mlx4_mr *mr)
+{
+ struct priv *priv = mr->priv;
+
+ rte_spinlock_lock(&priv->mr_lock);
+ assert(mr->refcnt);
+ if (--mr->refcnt)
+ goto release;
+ LIST_REMOVE(mr, next);
+ claim_zero(ibv_dereg_mr(mr->mr));
+ rte_free(mr);
+release:
+ rte_spinlock_unlock(&priv->mr_lock);
+}
+
+/**
+ * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
+ * If mp2mr[] is full, remove an entry first.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param[in] mp
+ * Memory pool for which a memory region lkey must be added.
+ * @param[in] i
+ * Index in memory pool (MP) where to add memory region (MR).
+ *
+ * @return
+ * Added mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
+{
+ struct mlx4_mr *mr;
+
+ /* Add a new entry, register MR first. */
+ DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+ (void *)txq, mp->name, (void *)mp);
+ mr = mlx4_mr_get(txq->priv, mp);
+ if (unlikely(mr == NULL)) {
+ DEBUG("%p: unable to configure MR, mlx4_mr_get() failed",
+ (void *)txq);
+ return (uint32_t)-1;
+ }
+ if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+ /* Table is full, remove oldest entry. */
+ DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+ (void *)txq);
+ --i;
+ mlx4_mr_put(txq->mp2mr[0].mr);
+ memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+ (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ }
+ /* Store the new entry. */
+ txq->mp2mr[i].mp = mp;
+ txq->mp2mr[i].mr = mr;
+ txq->mp2mr[i].lkey = mr->lkey;
+ DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+ (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+ return txq->mp2mr[i].lkey;
+}
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
new file mode 100644
index 00000000..fcc7c129
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -0,0 +1,177 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MLX4_PRM_H_
+#define MLX4_PRM_H_
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+/* ConnectX-3 Tx queue basic block. */
+#define MLX4_TXBB_SHIFT 6
+#define MLX4_TXBB_SIZE (1 << MLX4_TXBB_SHIFT)
+
+/* Typical TSO descriptor with 16 gather entries is 352 bytes. */
+#define MLX4_MAX_WQE_SIZE 512
+#define MLX4_MAX_WQE_TXBBS (MLX4_MAX_WQE_SIZE / MLX4_TXBB_SIZE)
+
+/* Send queue stamping/invalidating information. */
+#define MLX4_SQ_STAMP_STRIDE 64
+#define MLX4_SQ_STAMP_DWORDS (MLX4_SQ_STAMP_STRIDE / 4)
+#define MLX4_SQ_STAMP_SHIFT 31
+#define MLX4_SQ_STAMP_VAL 0x7fffffff
+
+/* Work queue element (WQE) flags. */
+#define MLX4_BIT_WQE_OWN 0x80000000
+#define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
+#define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+
+#define MLX4_SIZE_TO_TXBBS(size) \
+ (RTE_ALIGN((size), (MLX4_TXBB_SIZE)) >> (MLX4_TXBB_SHIFT))
+
+/* CQE checksum flags. */
+enum {
+ MLX4_CQE_L2_TUNNEL_IPV4 = (int)(1u << 25),
+ MLX4_CQE_L2_TUNNEL_L4_CSUM = (int)(1u << 26),
+ MLX4_CQE_L2_TUNNEL = (int)(1u << 27),
+ MLX4_CQE_L2_VLAN_MASK = (int)(3u << 29),
+ MLX4_CQE_L2_TUNNEL_IPOK = (int)(1u << 31),
+};
+
+/* CQE status flags. */
+#define MLX4_CQE_STATUS_IPV4 (1 << 22)
+#define MLX4_CQE_STATUS_IPV4F (1 << 23)
+#define MLX4_CQE_STATUS_IPV6 (1 << 24)
+#define MLX4_CQE_STATUS_IPV4OPT (1 << 25)
+#define MLX4_CQE_STATUS_TCP (1 << 26)
+#define MLX4_CQE_STATUS_UDP (1 << 27)
+#define MLX4_CQE_STATUS_PTYPE_MASK \
+ (MLX4_CQE_STATUS_IPV4 | \
+ MLX4_CQE_STATUS_IPV4F | \
+ MLX4_CQE_STATUS_IPV6 | \
+ MLX4_CQE_STATUS_IPV4OPT | \
+ MLX4_CQE_STATUS_TCP | \
+ MLX4_CQE_STATUS_UDP)
+
+/* Send queue information. */
+struct mlx4_sq {
+ volatile uint8_t *buf; /**< SQ buffer. */
+ volatile uint8_t *eob; /**< End of SQ buffer */
+ uint32_t head; /**< SQ head counter in units of TXBBS. */
+ uint32_t tail; /**< SQ tail counter in units of TXBBS. */
+ uint32_t txbb_cnt; /**< Num of WQEBB in the Q (should be ^2). */
+ uint32_t txbb_cnt_mask; /**< txbbs_cnt mask (txbb_cnt is ^2). */
+ uint32_t headroom_txbbs; /**< Num of txbbs that should be kept free. */
+ volatile uint32_t *db; /**< Pointer to the doorbell. */
+ uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */
+};
+
+#define mlx4_get_send_wqe(sq, n) ((sq)->buf + ((n) * (MLX4_TXBB_SIZE)))
+
+/* Completion queue events, numbers and masks. */
+#define MLX4_CQ_DB_GEQ_N_MASK 0x3
+#define MLX4_CQ_DOORBELL 0x20
+#define MLX4_CQ_DB_CI_MASK 0xffffff
+
+/* Completion queue information. */
+struct mlx4_cq {
+ volatile void *cq_uar; /**< CQ user access region. */
+ volatile void *cq_db_reg; /**< CQ doorbell register. */
+ volatile uint32_t *set_ci_db; /**< Pointer to the CQ doorbell. */
+ volatile uint32_t *arm_db; /**< Arming Rx events doorbell. */
+ volatile uint8_t *buf; /**< Pointer to the completion queue buffer. */
+ uint32_t cqe_cnt; /**< Number of entries in the queue. */
+ uint32_t cqe_64:1; /**< CQ entry size is 64 bytes. */
+ uint32_t cons_index; /**< Last queue entry that was handled. */
+ uint32_t cqn; /**< CQ number. */
+ int arm_sn; /**< Rx event counter. */
+};
+
+/**
+ * Retrieve a CQE entry from a CQ.
+ *
+ * cqe = cq->buf + cons_index * cqe_size + cqe_offset
+ *
+ * Where cqe_size is 32 or 64 bytes and cqe_offset is 0 or 32 (depending on
+ * cqe_size).
+ *
+ * @param cq
+ * CQ to retrieve entry from.
+ * @param index
+ * Entry index.
+ *
+ * @return
+ * Pointer to CQE entry.
+ */
+static inline volatile struct mlx4_cqe *
+mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index)
+{
+ return (volatile struct mlx4_cqe *)(cq->buf +
+ ((index & (cq->cqe_cnt - 1)) <<
+ (5 + cq->cqe_64)) +
+ (cq->cqe_64 << 5));
+}
+
+/**
+ * Transpose a flag in a value.
+ *
+ * @param val
+ * Input value.
+ * @param from
+ * Flag to retrieve from input value.
+ * @param to
+ * Flag to set in output value.
+ *
+ * @return
+ * Output value with transposed flag enabled if present on input.
+ */
+static inline uint64_t
+mlx4_transpose(uint64_t val, uint64_t from, uint64_t to)
+{
+ return (from >= to ?
+ (val & from) / (from / to) :
+ (val & from) * (to / from));
+}
+
+#endif /* MLX4_PRM_H_ */
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
new file mode 100644
index 00000000..8b97a894
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -0,0 +1,873 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Rx queues configuration for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_flow.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_flow.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Historical RSS hash key.
+ *
+ * This used to be the default for mlx4 in Linux before v3.19 switched to
+ * generating random hash keys through netdev_rss_key_fill().
+ *
+ * It is used in this PMD for consistency with past DPDK releases but can
+ * now be overridden through user configuration.
+ *
+ * Note: this is not const to work around API quirks.
+ */
+uint8_t
+mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
+ 0x2c, 0xc6, 0x81, 0xd1,
+ 0x5b, 0xdb, 0xf4, 0xf7,
+ 0xfc, 0xa2, 0x83, 0x19,
+ 0xdb, 0x1a, 0x3e, 0x94,
+ 0x6b, 0x9e, 0x38, 0xd9,
+ 0x2c, 0x9c, 0x03, 0xd1,
+ 0xad, 0x99, 0x44, 0xa7,
+ 0xd9, 0x56, 0x3d, 0x59,
+ 0x06, 0x3c, 0x25, 0xf3,
+ 0xfc, 0x1f, 0xdc, 0x2a,
+};
+
+/**
+ * Obtain a RSS context with specified properties.
+ *
+ * Used when creating a flow rule targeting one or several Rx queues.
+ *
+ * If a matching RSS context already exists, it is returned with its
+ * reference count incremented.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param fields
+ * Fields for RSS processing (Verbs format).
+ * @param[in] key
+ * Hash key to use (whose size is exactly MLX4_RSS_HASH_KEY_SIZE).
+ * @param queues
+ * Number of target queues.
+ * @param[in] queue_id
+ * Target queues.
+ *
+ * @return
+ * Pointer to RSS context on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx4_rss *
+mlx4_rss_get(struct priv *priv, uint64_t fields,
+ uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+ uint16_t queues, const uint16_t queue_id[])
+{
+ struct mlx4_rss *rss;
+ size_t queue_id_size = sizeof(queue_id[0]) * queues;
+
+ LIST_FOREACH(rss, &priv->rss, next)
+ if (fields == rss->fields &&
+ queues == rss->queues &&
+ !memcmp(key, rss->key, MLX4_RSS_HASH_KEY_SIZE) &&
+ !memcmp(queue_id, rss->queue_id, queue_id_size)) {
+ ++rss->refcnt;
+ return rss;
+ }
+ rss = rte_malloc(__func__, offsetof(struct mlx4_rss, queue_id) +
+ queue_id_size, 0);
+ if (!rss)
+ goto error;
+ *rss = (struct mlx4_rss){
+ .priv = priv,
+ .refcnt = 1,
+ .usecnt = 0,
+ .qp = NULL,
+ .ind = NULL,
+ .fields = fields,
+ .queues = queues,
+ };
+ memcpy(rss->key, key, MLX4_RSS_HASH_KEY_SIZE);
+ memcpy(rss->queue_id, queue_id, queue_id_size);
+ LIST_INSERT_HEAD(&priv->rss, rss, next);
+ return rss;
+error:
+ rte_errno = ENOMEM;
+ return NULL;
+}
+
+/**
+ * Release a RSS context instance.
+ *
+ * Used when destroying a flow rule targeting one or several Rx queues.
+ *
+ * This function decrements the reference count of the context and destroys
+ * it after reaching 0. The context must have no users at this point; all
+ * prior calls to mlx4_rss_attach() must have been followed by matching
+ * calls to mlx4_rss_detach().
+ *
+ * @param rss
+ * RSS context to release.
+ */
+void
+mlx4_rss_put(struct mlx4_rss *rss)
+{
+ assert(rss->refcnt);
+ if (--rss->refcnt)
+ return;
+ assert(!rss->usecnt);
+ assert(!rss->qp);
+ assert(!rss->ind);
+ LIST_REMOVE(rss, next);
+ rte_free(rss);
+}
+
+/**
+ * Attach a user to a RSS context instance.
+ *
+ * Used when the RSS QP and indirection table objects must be instantiated,
+ * that is, when a flow rule must be enabled.
+ *
+ * This function increments the usage count of the context.
+ *
+ * @param rss
+ * RSS context to attach to.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rss_attach(struct mlx4_rss *rss)
+{
+ assert(rss->refcnt);
+ if (rss->usecnt++) {
+ assert(rss->qp);
+ assert(rss->ind);
+ return 0;
+ }
+
+ struct ibv_wq *ind_tbl[rss->queues];
+ struct priv *priv = rss->priv;
+ const char *msg;
+ unsigned int i = 0;
+ int ret;
+
+ if (!rte_is_power_of_2(RTE_DIM(ind_tbl))) {
+ ret = EINVAL;
+ msg = "number of RSS queues must be a power of two";
+ goto error;
+ }
+ for (i = 0; i != RTE_DIM(ind_tbl); ++i) {
+ uint16_t id = rss->queue_id[i];
+ struct rxq *rxq = NULL;
+
+ if (id < priv->dev->data->nb_rx_queues)
+ rxq = priv->dev->data->rx_queues[id];
+ if (!rxq) {
+ ret = EINVAL;
+ msg = "RSS target queue is not configured";
+ goto error;
+ }
+ ret = mlx4_rxq_attach(rxq);
+ if (ret) {
+ ret = -ret;
+ msg = "unable to attach RSS target queue";
+ goto error;
+ }
+ ind_tbl[i] = rxq->wq;
+ }
+ rss->ind = ibv_create_rwq_ind_table
+ (priv->ctx,
+ &(struct ibv_rwq_ind_table_init_attr){
+ .log_ind_tbl_size = rte_log2_u32(RTE_DIM(ind_tbl)),
+ .ind_tbl = ind_tbl,
+ .comp_mask = 0,
+ });
+ if (!rss->ind) {
+ ret = errno ? errno : EINVAL;
+ msg = "RSS indirection table creation failure";
+ goto error;
+ }
+ rss->qp = ibv_create_qp_ex
+ (priv->ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .comp_mask = (IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_RX_HASH |
+ IBV_QP_INIT_ATTR_IND_TABLE),
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .pd = priv->pd,
+ .rwq_ind_tbl = rss->ind,
+ .rx_hash_conf = {
+ .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE,
+ .rx_hash_key = rss->key,
+ .rx_hash_fields_mask = rss->fields,
+ },
+ });
+ if (!rss->qp) {
+ ret = errno ? errno : EINVAL;
+ msg = "RSS hash QP creation failure";
+ goto error;
+ }
+ ret = ibv_modify_qp
+ (rss->qp,
+ &(struct ibv_qp_attr){
+ .qp_state = IBV_QPS_INIT,
+ .port_num = priv->port,
+ },
+ IBV_QP_STATE | IBV_QP_PORT);
+ if (ret) {
+ msg = "failed to switch RSS hash QP to INIT state";
+ goto error;
+ }
+ ret = ibv_modify_qp
+ (rss->qp,
+ &(struct ibv_qp_attr){
+ .qp_state = IBV_QPS_RTR,
+ },
+ IBV_QP_STATE);
+ if (ret) {
+ msg = "failed to switch RSS hash QP to RTR state";
+ goto error;
+ }
+ return 0;
+error:
+ if (rss->qp) {
+ claim_zero(ibv_destroy_qp(rss->qp));
+ rss->qp = NULL;
+ }
+ if (rss->ind) {
+ claim_zero(ibv_destroy_rwq_ind_table(rss->ind));
+ rss->ind = NULL;
+ }
+ while (i--)
+ mlx4_rxq_detach(priv->dev->data->rx_queues[rss->queue_id[i]]);
+ ERROR("mlx4: %s", msg);
+ --rss->usecnt;
+ rte_errno = ret;
+ return -ret;
+}
+
+/**
+ * Detach a user from a RSS context instance.
+ *
+ * Used when disabling (not destroying) a flow rule.
+ *
+ * This function decrements the usage count of the context and destroys
+ * usage resources after reaching 0.
+ *
+ * @param rss
+ * RSS context to detach from.
+ */
+void
+mlx4_rss_detach(struct mlx4_rss *rss)
+{
+ struct priv *priv = rss->priv;
+ unsigned int i;
+
+ assert(rss->refcnt);
+ assert(rss->qp);
+ assert(rss->ind);
+ if (--rss->usecnt)
+ return;
+ claim_zero(ibv_destroy_qp(rss->qp));
+ rss->qp = NULL;
+ claim_zero(ibv_destroy_rwq_ind_table(rss->ind));
+ rss->ind = NULL;
+ for (i = 0; i != rss->queues; ++i)
+ mlx4_rxq_detach(priv->dev->data->rx_queues[rss->queue_id[i]]);
+}
+
+/**
+ * Initialize common RSS context resources.
+ *
+ * Because ConnectX-3 hardware limitations require a fixed order in the
+ * indirection table, WQs must be allocated sequentially to be part of a
+ * common RSS context.
+ *
+ * Since a newly created WQ cannot be moved to a different context, this
+ * function allocates them all at once, one for each configured Rx queue,
+ * as well as all related resources (CQs and mbufs).
+ *
+ * This must therefore be done before creating any Rx flow rules relying on
+ * indirection tables.
+ *
+ * @param priv
+ * Pointer to private structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rss_init(struct priv *priv)
+{
+ struct rte_eth_dev *dev = priv->dev;
+ uint8_t log2_range = rte_log2_u32(dev->data->nb_rx_queues);
+ uint32_t wq_num_prev = 0;
+ const char *msg;
+ unsigned int i;
+ int ret;
+
+ /* Prepare range for RSS contexts before creating the first WQ. */
+ ret = mlx4dv_set_context_attr(priv->ctx,
+ MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ,
+ &log2_range);
+ if (ret) {
+ ERROR("cannot set up range size for RSS context to %u"
+ " (for %u Rx queues), error: %s",
+ 1 << log2_range, dev->data->nb_rx_queues, strerror(ret));
+ rte_errno = ret;
+ return -ret;
+ }
+ for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) {
+ struct rxq *rxq = priv->dev->data->rx_queues[i];
+ struct ibv_cq *cq;
+ struct ibv_wq *wq;
+ uint32_t wq_num;
+
+ /* Attach the configured Rx queues. */
+ if (rxq) {
+ assert(!rxq->usecnt);
+ ret = mlx4_rxq_attach(rxq);
+ if (!ret) {
+ wq_num = rxq->wq->wq_num;
+ goto wq_num_check;
+ }
+ ret = -ret;
+ msg = "unable to create Rx queue resources";
+ goto error;
+ }
+ /*
+ * WQs are temporarily allocated for unconfigured Rx queues
+ * to maintain proper index alignment in indirection table
+ * by skipping unused WQ numbers.
+ *
+ * The reason this works at all even though these WQs are
+ * immediately destroyed is that WQNs are allocated
+ * sequentially and are guaranteed to never be reused in the
+ * same context by the underlying implementation.
+ */
+ cq = ibv_create_cq(priv->ctx, 1, NULL, NULL, 0);
+ if (!cq) {
+ ret = ENOMEM;
+ msg = "placeholder CQ creation failure";
+ goto error;
+ }
+ wq = ibv_create_wq
+ (priv->ctx,
+ &(struct ibv_wq_init_attr){
+ .wq_type = IBV_WQT_RQ,
+ .max_wr = 1,
+ .max_sge = 1,
+ .pd = priv->pd,
+ .cq = cq,
+ });
+ if (wq) {
+ wq_num = wq->wq_num;
+ claim_zero(ibv_destroy_wq(wq));
+ } else {
+ wq_num = 0; /* Shut up GCC 4.8 warnings. */
+ }
+ claim_zero(ibv_destroy_cq(cq));
+ if (!wq) {
+ ret = ENOMEM;
+ msg = "placeholder WQ creation failure";
+ goto error;
+ }
+wq_num_check:
+ /*
+ * While guaranteed by the implementation, make sure WQ
+ * numbers are really sequential (as the saying goes,
+ * trust, but verify).
+ */
+ if (i && wq_num - wq_num_prev != 1) {
+ if (rxq)
+ mlx4_rxq_detach(rxq);
+ ret = ERANGE;
+ msg = "WQ numbers are not sequential";
+ goto error;
+ }
+ wq_num_prev = wq_num;
+ }
+ return 0;
+error:
+ ERROR("cannot initialize common RSS resources (queue %u): %s: %s",
+ i, msg, strerror(ret));
+ while (i--) {
+ struct rxq *rxq = priv->dev->data->rx_queues[i];
+
+ if (rxq)
+ mlx4_rxq_detach(rxq);
+ }
+ rte_errno = ret;
+ return -ret;
+}
+
+/**
+ * Release common RSS context resources.
+ *
+ * As the reverse of mlx4_rss_init(), this must be done after removing all
+ * flow rules relying on indirection tables.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+void
+mlx4_rss_deinit(struct priv *priv)
+{
+ unsigned int i;
+
+ for (i = 0; i != priv->dev->data->nb_rx_queues; ++i) {
+ struct rxq *rxq = priv->dev->data->rx_queues[i];
+
+ if (rxq) {
+ assert(rxq->usecnt == 1);
+ mlx4_rxq_detach(rxq);
+ }
+ }
+}
+
+/**
+ * Attach a user to a Rx queue.
+ *
+ * Used when the resources of an Rx queue must be instantiated for it to
+ * become in a usable state.
+ *
+ * This function increments the usage count of the Rx queue.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rxq_attach(struct rxq *rxq)
+{
+ if (rxq->usecnt++) {
+ assert(rxq->cq);
+ assert(rxq->wq);
+ assert(rxq->wqes);
+ assert(rxq->rq_db);
+ return 0;
+ }
+
+ struct priv *priv = rxq->priv;
+ const uint32_t elts_n = 1 << rxq->elts_n;
+ const uint32_t sges_n = 1 << rxq->sges_n;
+ struct rte_mbuf *(*elts)[elts_n] = rxq->elts;
+ struct mlx4dv_obj mlxdv;
+ struct mlx4dv_rwq dv_rwq;
+ struct mlx4dv_cq dv_cq = { .comp_mask = MLX4DV_CQ_MASK_UAR, };
+ const char *msg;
+ struct ibv_cq *cq = NULL;
+ struct ibv_wq *wq = NULL;
+ volatile struct mlx4_wqe_data_seg (*wqes)[];
+ unsigned int i;
+ int ret;
+
+ assert(rte_is_power_of_2(elts_n));
+ cq = ibv_create_cq(priv->ctx, elts_n / sges_n, NULL, rxq->channel, 0);
+ if (!cq) {
+ ret = ENOMEM;
+ msg = "CQ creation failure";
+ goto error;
+ }
+ wq = ibv_create_wq
+ (priv->ctx,
+ &(struct ibv_wq_init_attr){
+ .wq_type = IBV_WQT_RQ,
+ .max_wr = elts_n / sges_n,
+ .max_sge = sges_n,
+ .pd = priv->pd,
+ .cq = cq,
+ });
+ if (!wq) {
+ ret = errno ? errno : EINVAL;
+ msg = "WQ creation failure";
+ goto error;
+ }
+ ret = ibv_modify_wq
+ (wq,
+ &(struct ibv_wq_attr){
+ .attr_mask = IBV_WQ_ATTR_STATE,
+ .wq_state = IBV_WQS_RDY,
+ });
+ if (ret) {
+ msg = "WQ state change to IBV_WQS_RDY failed";
+ goto error;
+ }
+ /* Retrieve device queue information. */
+ mlxdv.cq.in = cq;
+ mlxdv.cq.out = &dv_cq;
+ mlxdv.rwq.in = wq;
+ mlxdv.rwq.out = &dv_rwq;
+ ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_RWQ | MLX4DV_OBJ_CQ);
+ if (ret) {
+ msg = "failed to obtain device information from WQ/CQ objects";
+ goto error;
+ }
+ wqes = (volatile struct mlx4_wqe_data_seg (*)[])
+ ((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset);
+ for (i = 0; i != RTE_DIM(*elts); ++i) {
+ volatile struct mlx4_wqe_data_seg *scat = &(*wqes)[i];
+ struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
+
+ if (buf == NULL) {
+ while (i--) {
+ rte_pktmbuf_free_seg((*elts)[i]);
+ (*elts)[i] = NULL;
+ }
+ ret = ENOMEM;
+ msg = "cannot allocate mbuf";
+ goto error;
+ }
+ /* Headroom is reserved by rte_pktmbuf_alloc(). */
+ assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
+ /* Buffer is supposed to be empty. */
+ assert(rte_pktmbuf_data_len(buf) == 0);
+ assert(rte_pktmbuf_pkt_len(buf) == 0);
+ /* Only the first segment keeps headroom. */
+ if (i % sges_n)
+ buf->data_off = 0;
+ buf->port = rxq->port_id;
+ buf->data_len = rte_pktmbuf_tailroom(buf);
+ buf->pkt_len = rte_pktmbuf_tailroom(buf);
+ buf->nb_segs = 1;
+ *scat = (struct mlx4_wqe_data_seg){
+ .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
+ uintptr_t)),
+ .byte_count = rte_cpu_to_be_32(buf->data_len),
+ .lkey = rte_cpu_to_be_32(rxq->mr->lkey),
+ };
+ (*elts)[i] = buf;
+ }
+ DEBUG("%p: allocated and configured %u segments (max %u packets)",
+ (void *)rxq, elts_n, elts_n / sges_n);
+ rxq->cq = cq;
+ rxq->wq = wq;
+ rxq->wqes = wqes;
+ rxq->rq_db = dv_rwq.rdb;
+ rxq->mcq.buf = dv_cq.buf.buf;
+ rxq->mcq.cqe_cnt = dv_cq.cqe_cnt;
+ rxq->mcq.set_ci_db = dv_cq.set_ci_db;
+ rxq->mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0;
+ rxq->mcq.arm_db = dv_cq.arm_db;
+ rxq->mcq.arm_sn = dv_cq.arm_sn;
+ rxq->mcq.cqn = dv_cq.cqn;
+ rxq->mcq.cq_uar = dv_cq.cq_uar;
+ rxq->mcq.cq_db_reg = (uint8_t *)dv_cq.cq_uar + MLX4_CQ_DOORBELL;
+ /* Update doorbell counter. */
+ rxq->rq_ci = elts_n / sges_n;
+ rte_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+ return 0;
+error:
+ if (wq)
+ claim_zero(ibv_destroy_wq(wq));
+ if (cq)
+ claim_zero(ibv_destroy_cq(cq));
+ rte_errno = ret;
+ ERROR("error while attaching Rx queue %p: %s: %s",
+ (void *)rxq, msg, strerror(ret));
+ return -ret;
+}
+
+/**
+ * Detach a user from a Rx queue.
+ *
+ * This function decrements the usage count of the Rx queue and destroys
+ * usage resources after reaching 0.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ */
+void
+mlx4_rxq_detach(struct rxq *rxq)
+{
+ unsigned int i;
+ struct rte_mbuf *(*elts)[1 << rxq->elts_n] = rxq->elts;
+
+ if (--rxq->usecnt)
+ return;
+ rxq->rq_ci = 0;
+ memset(&rxq->mcq, 0, sizeof(rxq->mcq));
+ rxq->rq_db = NULL;
+ rxq->wqes = NULL;
+ claim_zero(ibv_destroy_wq(rxq->wq));
+ rxq->wq = NULL;
+ claim_zero(ibv_destroy_cq(rxq->cq));
+ rxq->cq = NULL;
+ DEBUG("%p: freeing Rx queue elements", (void *)rxq);
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ if (!(*elts)[i])
+ continue;
+ rte_pktmbuf_free_seg((*elts)[i]);
+ (*elts)[i] = NULL;
+ }
+}
+
+/**
+ * DPDK callback to configure a Rx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * Rx queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ * Thresholds parameters.
+ * @param mp
+ * Memory pool for buffer allocations.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ unsigned int socket, const struct rte_eth_rxconf *conf,
+ struct rte_mempool *mp)
+{
+ struct priv *priv = dev->data->dev_private;
+ uint32_t mb_len = rte_pktmbuf_data_room_size(mp);
+ struct rte_mbuf *(*elts)[rte_align32pow2(desc)];
+ struct rxq *rxq;
+ struct mlx4_malloc_vec vec[] = {
+ {
+ .align = RTE_CACHE_LINE_SIZE,
+ .size = sizeof(*rxq),
+ .addr = (void **)&rxq,
+ },
+ {
+ .align = RTE_CACHE_LINE_SIZE,
+ .size = sizeof(*elts),
+ .addr = (void **)&elts,
+ },
+ };
+ int ret;
+
+ (void)conf; /* Thresholds configuration (ignored). */
+ DEBUG("%p: configuring queue %u for %u descriptors",
+ (void *)dev, idx, desc);
+ if (idx >= dev->data->nb_rx_queues) {
+ rte_errno = EOVERFLOW;
+ ERROR("%p: queue index out of range (%u >= %u)",
+ (void *)dev, idx, dev->data->nb_rx_queues);
+ return -rte_errno;
+ }
+ rxq = dev->data->rx_queues[idx];
+ if (rxq) {
+ rte_errno = EEXIST;
+ ERROR("%p: Rx queue %u already configured, release it first",
+ (void *)dev, idx);
+ return -rte_errno;
+ }
+ if (!desc) {
+ rte_errno = EINVAL;
+ ERROR("%p: invalid number of Rx descriptors", (void *)dev);
+ return -rte_errno;
+ }
+ if (desc != RTE_DIM(*elts)) {
+ desc = RTE_DIM(*elts);
+ WARN("%p: increased number of descriptors in Rx queue %u"
+ " to the next power of two (%u)",
+ (void *)dev, idx, desc);
+ }
+ /* Allocate and initialize Rx queue. */
+ mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket);
+ if (!rxq) {
+ ERROR("%p: unable to allocate queue index %u",
+ (void *)dev, idx);
+ return -rte_errno;
+ }
+ *rxq = (struct rxq){
+ .priv = priv,
+ .mp = mp,
+ .port_id = dev->data->port_id,
+ .sges_n = 0,
+ .elts_n = rte_log2_u32(desc),
+ .elts = elts,
+ /* Toggle Rx checksum offload if hardware supports it. */
+ .csum = (priv->hw_csum &&
+ dev->data->dev_conf.rxmode.hw_ip_checksum),
+ .csum_l2tun = (priv->hw_csum_l2tun &&
+ dev->data->dev_conf.rxmode.hw_ip_checksum),
+ .stats = {
+ .idx = idx,
+ },
+ .socket = socket,
+ };
+ /* Enable scattered packets support for this queue if necessary. */
+ assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
+ (mb_len - RTE_PKTMBUF_HEADROOM)) {
+ ;
+ } else if (dev->data->dev_conf.rxmode.enable_scatter) {
+ uint32_t size =
+ RTE_PKTMBUF_HEADROOM +
+ dev->data->dev_conf.rxmode.max_rx_pkt_len;
+ uint32_t sges_n;
+
+ /*
+ * Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two.
+ */
+ sges_n = rte_log2_u32((size / mb_len) + !!(size % mb_len));
+ rxq->sges_n = sges_n;
+ /* Make sure sges_n did not overflow. */
+ size = mb_len * (1 << rxq->sges_n);
+ size -= RTE_PKTMBUF_HEADROOM;
+ if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+ rte_errno = EOVERFLOW;
+ ERROR("%p: too many SGEs (%u) needed to handle"
+ " requested maximum packet size %u",
+ (void *)dev,
+ 1 << sges_n,
+ dev->data->dev_conf.rxmode.max_rx_pkt_len);
+ goto error;
+ }
+ } else {
+ WARN("%p: the requested maximum Rx packet size (%u) is"
+ " larger than a single mbuf (%u) and scattered"
+ " mode has not been requested",
+ (void *)dev,
+ dev->data->dev_conf.rxmode.max_rx_pkt_len,
+ mb_len - RTE_PKTMBUF_HEADROOM);
+ }
+ DEBUG("%p: maximum number of segments per packet: %u",
+ (void *)dev, 1 << rxq->sges_n);
+ if (desc % (1 << rxq->sges_n)) {
+ rte_errno = EINVAL;
+ ERROR("%p: number of Rx queue descriptors (%u) is not a"
+ " multiple of maximum segments per packet (%u)",
+ (void *)dev,
+ desc,
+ 1 << rxq->sges_n);
+ goto error;
+ }
+ /* Use the entire Rx mempool as the memory region. */
+ rxq->mr = mlx4_mr_get(priv, mp);
+ if (!rxq->mr) {
+ ERROR("%p: MR creation failure: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
+ if (dev->data->dev_conf.intr_conf.rxq) {
+ rxq->channel = ibv_create_comp_channel(priv->ctx);
+ if (rxq->channel == NULL) {
+ rte_errno = ENOMEM;
+ ERROR("%p: Rx interrupt completion channel creation"
+ " failure: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
+ if (mlx4_fd_set_non_blocking(rxq->channel->fd) < 0) {
+ ERROR("%p: unable to make Rx interrupt completion"
+ " channel non-blocking: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
+ }
+ DEBUG("%p: adding Rx queue %p to list", (void *)dev, (void *)rxq);
+ dev->data->rx_queues[idx] = rxq;
+ return 0;
+error:
+ dev->data->rx_queues[idx] = NULL;
+ ret = rte_errno;
+ mlx4_rx_queue_release(rxq);
+ rte_errno = ret;
+ assert(rte_errno > 0);
+ return -rte_errno;
+}
+
+/**
+ * DPDK callback to release a Rx queue.
+ *
+ * @param dpdk_rxq
+ * Generic Rx queue pointer.
+ */
+void
+mlx4_rx_queue_release(void *dpdk_rxq)
+{
+ struct rxq *rxq = (struct rxq *)dpdk_rxq;
+ struct priv *priv;
+ unsigned int i;
+
+ if (rxq == NULL)
+ return;
+ priv = rxq->priv;
+ for (i = 0; i != priv->dev->data->nb_rx_queues; ++i)
+ if (priv->dev->data->rx_queues[i] == rxq) {
+ DEBUG("%p: removing Rx queue %p from list",
+ (void *)priv->dev, (void *)rxq);
+ priv->dev->data->rx_queues[i] = NULL;
+ break;
+ }
+ assert(!rxq->cq);
+ assert(!rxq->wq);
+ assert(!rxq->wqes);
+ assert(!rxq->rq_db);
+ if (rxq->channel)
+ claim_zero(ibv_destroy_comp_channel(rxq->channel));
+ if (rxq->mr)
+ mlx4_mr_put(rxq->mr);
+ rte_free(rxq);
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
new file mode 100644
index 00000000..3985e06d
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -0,0 +1,1071 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Data plane functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_io.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx4.h"
+#include "mlx4_prm.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+#define WQE_ONE_DATA_SEG_SIZE \
+ (sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
+
+/**
+ * Pointer-value pair structure used in tx_post_send for saving the first
+ * DWORD (32 byte) of a TXBB.
+ */
+struct pv {
+ volatile struct mlx4_wqe_data_seg *dseg;
+ uint32_t val;
+};
+
+/** A table to translate Rx completion flags to packet type. */
+uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
+ /*
+ * The index to the array should have:
+ * bit[7] - MLX4_CQE_L2_TUNNEL
+ * bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
+ * bit[5] - MLX4_CQE_STATUS_UDP
+ * bit[4] - MLX4_CQE_STATUS_TCP
+ * bit[3] - MLX4_CQE_STATUS_IPV4OPT
+ * bit[2] - MLX4_CQE_STATUS_IPV6
+ * bit[1] - MLX4_CQE_STATUS_IPV4F
+ * bit[0] - MLX4_CQE_STATUS_IPV4
+ * giving a total of up to 256 entries.
+ */
+ [0x00] = RTE_PTYPE_L2_ETHER,
+ [0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+ [0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG,
+ [0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_FRAG,
+ [0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+ [0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT,
+ [0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_FRAG,
+ [0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP,
+ [0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP,
+ [0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_TCP,
+ [0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_TCP,
+ [0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_TCP,
+ [0x1a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_TCP,
+ [0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_UDP,
+ [0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_L4_UDP,
+ [0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_L4_UDP,
+ [0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_UDP,
+ [0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_UDP,
+ [0x2a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+ RTE_PTYPE_L4_UDP,
+ /* Tunneled - L3 IPV6 */
+ [0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+ [0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+ [0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+ [0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT,
+ [0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT,
+ [0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG,
+ /* Tunneled - L3 IPV6, TCP */
+ [0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x93] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0x9a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_TCP,
+ /* Tunneled - L3 IPV6, UDP */
+ [0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xa3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xaa] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_UDP,
+ /* Tunneled - L3 IPV4 */
+ [0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+ [0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+ [0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG,
+ [0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+ [0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT,
+ [0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT,
+ [0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_FRAG,
+ /* Tunneled - L3 IPV4, TCP */
+ [0xd0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0xd3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT |
+ RTE_PTYPE_INNER_L4_TCP,
+ [0xda] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_TCP,
+ /* Tunneled - L3 IPV4, UDP */
+ [0xe0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xe3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L4_UDP,
+ [0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP,
+ [0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP,
+ [0xea] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+ RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+ RTE_PTYPE_INNER_L4_UDP,
+};
+
+/**
+ * Stamp a WQE so it won't be reused by the HW.
+ *
+ * Routine is used when freeing WQE used by the chip or when failing
+ * building an WQ entry has failed leaving partial information on the queue.
+ *
+ * @param sq
+ * Pointer to the SQ structure.
+ * @param index
+ * Index of the freed WQE.
+ * @param num_txbbs
+ * Number of blocks to stamp.
+ * If < 0 the routine will use the size written in the WQ entry.
+ * @param owner
+ * The value of the WQE owner bit to use in the stamp.
+ *
+ * @return
+ * The number of Tx basic blocs (TXBB) the WQE contained.
+ */
+static int
+mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
+{
+ uint32_t stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
+ (!!owner << MLX4_SQ_STAMP_SHIFT));
+ volatile uint8_t *wqe = mlx4_get_send_wqe(sq,
+ (index & sq->txbb_cnt_mask));
+ volatile uint32_t *ptr = (volatile uint32_t *)wqe;
+ int i;
+ int txbbs_size;
+ int num_txbbs;
+
+ /* Extract the size from the control segment of the WQE. */
+ num_txbbs = MLX4_SIZE_TO_TXBBS((((volatile struct mlx4_wqe_ctrl_seg *)
+ wqe)->fence_size & 0x3f) << 4);
+ txbbs_size = num_txbbs * MLX4_TXBB_SIZE;
+ /* Optimize the common case when there is no wrap-around. */
+ if (wqe + txbbs_size <= sq->eob) {
+ /* Stamp the freed descriptor. */
+ for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {
+ *ptr = stamp;
+ ptr += MLX4_SQ_STAMP_DWORDS;
+ }
+ } else {
+ /* Stamp the freed descriptor. */
+ for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {
+ *ptr = stamp;
+ ptr += MLX4_SQ_STAMP_DWORDS;
+ if ((volatile uint8_t *)ptr >= sq->eob) {
+ ptr = (volatile uint32_t *)sq->buf;
+ stamp ^= RTE_BE32(0x80000000);
+ }
+ }
+ }
+ return num_txbbs;
+}
+
+/**
+ * Manage Tx completions.
+ *
+ * When sending a burst, mlx4_tx_burst() posts several WRs.
+ * To improve performance, a completion event is only required once every
+ * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
+ * for other WRs, but this information would not be used anyway.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ *
+ * @return
+ * 0 on success, -1 on failure.
+ */
+static int
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
+ struct mlx4_sq *sq)
+{
+ unsigned int elts_comp = txq->elts_comp;
+ unsigned int elts_tail = txq->elts_tail;
+ struct mlx4_cq *cq = &txq->mcq;
+ volatile struct mlx4_cqe *cqe;
+ uint32_t cons_index = cq->cons_index;
+ uint16_t new_index;
+ uint16_t nr_txbbs = 0;
+ int pkts = 0;
+
+ /*
+ * Traverse over all CQ entries reported and handle each WQ entry
+ * reported by them.
+ */
+ do {
+ cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
+ if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(cons_index & cq->cqe_cnt)))
+ break;
+ /*
+ * Make sure we read the CQE after we read the ownership bit.
+ */
+ rte_io_rmb();
+#ifndef NDEBUG
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR)) {
+ volatile struct mlx4_err_cqe *cqe_err =
+ (volatile struct mlx4_err_cqe *)cqe;
+ ERROR("%p CQE error - vendor syndrome: 0x%x"
+ " syndrome: 0x%x\n",
+ (void *)txq, cqe_err->vendor_err,
+ cqe_err->syndrome);
+ }
+#endif /* NDEBUG */
+ /* Get WQE index reported in the CQE. */
+ new_index =
+ rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;
+ do {
+ /* Free next descriptor. */
+ nr_txbbs +=
+ mlx4_txq_stamp_freed_wqe(sq,
+ (sq->tail + nr_txbbs) & sq->txbb_cnt_mask,
+ !!((sq->tail + nr_txbbs) & sq->txbb_cnt));
+ pkts++;
+ } while (((sq->tail + nr_txbbs) & sq->txbb_cnt_mask) !=
+ new_index);
+ cons_index++;
+ } while (1);
+ if (unlikely(pkts == 0))
+ return 0;
+ /* Update CQ. */
+ cq->cons_index = cons_index;
+ *cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
+ sq->tail = sq->tail + nr_txbbs;
+ /* Update the list of packets posted for transmission. */
+ elts_comp -= pkts;
+ assert(elts_comp <= txq->elts_comp);
+ /*
+ * Assume completion status is successful as nothing can be done about
+ * it anyway.
+ */
+ elts_tail += pkts;
+ if (elts_tail >= elts_n)
+ elts_tail -= elts_n;
+ txq->elts_tail = elts_tail;
+ txq->elts_comp = elts_comp;
+ return 0;
+}
+
+/**
+ * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
+ * the cloned mbuf is allocated is returned instead.
+ *
+ * @param buf
+ * Pointer to mbuf.
+ *
+ * @return
+ * Memory pool where data is located for given mbuf.
+ */
+static struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+ if (unlikely(RTE_MBUF_INDIRECT(buf)))
+ return rte_mbuf_from_indirect(buf)->pool;
+ return buf->pool;
+}
+
+static int
+mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
+ volatile struct mlx4_wqe_ctrl_seg **pctrl)
+{
+ int wqe_real_size;
+ int nr_txbbs;
+ struct pv *pv = (struct pv *)txq->bounce_buf;
+ struct mlx4_sq *sq = &txq->msq;
+ uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+ volatile struct mlx4_wqe_ctrl_seg *ctrl;
+ volatile struct mlx4_wqe_data_seg *dseg;
+ struct rte_mbuf *sbuf;
+ uint32_t lkey;
+ uintptr_t addr;
+ uint32_t byte_count;
+ int pv_counter = 0;
+
+ /* Calculate the needed work queue entry size for this packet. */
+ wqe_real_size = sizeof(volatile struct mlx4_wqe_ctrl_seg) +
+ buf->nb_segs * sizeof(volatile struct mlx4_wqe_data_seg);
+ nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
+ /*
+ * Check that there is room for this WQE in the send queue and that
+ * the WQE size is legal.
+ */
+ if (((sq->head - sq->tail) + nr_txbbs +
+ sq->headroom_txbbs) >= sq->txbb_cnt ||
+ nr_txbbs > MLX4_MAX_WQE_TXBBS) {
+ return -1;
+ }
+ /* Get the control and data entries of the WQE. */
+ ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
+ mlx4_get_send_wqe(sq, head_idx);
+ dseg = (volatile struct mlx4_wqe_data_seg *)
+ ((uintptr_t)ctrl + sizeof(struct mlx4_wqe_ctrl_seg));
+ *pctrl = ctrl;
+ /* Fill the data segments with buffer information. */
+ for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+ addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
+ rte_prefetch0((volatile void *)addr);
+ /* Handle WQE wraparound. */
+ if (dseg >= (volatile struct mlx4_wqe_data_seg *)sq->eob)
+ dseg = (volatile struct mlx4_wqe_data_seg *)sq->buf;
+ dseg->addr = rte_cpu_to_be_64(addr);
+ /* Memory region key (big endian) for this memory pool. */
+ lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+ dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+ /* Calculate the needed work queue entry size for this packet */
+ if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR association",
+ (void *)txq);
+ /*
+ * Restamp entry in case of failure.
+ * Make sure that size is written correctly
+ * Note that we give ownership to the SW, not the HW.
+ */
+ wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+ buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+ ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+ mlx4_txq_stamp_freed_wqe(sq, head_idx,
+ (sq->head & sq->txbb_cnt) ? 0 : 1);
+ return -1;
+ }
+#endif /* NDEBUG */
+ if (likely(sbuf->data_len)) {
+ byte_count = rte_cpu_to_be_32(sbuf->data_len);
+ } else {
+ /*
+ * Zero length segment is treated as inline segment
+ * with zero data.
+ */
+ byte_count = RTE_BE32(0x80000000);
+ }
+ /*
+ * If the data segment is not at the beginning of a
+ * Tx basic block (TXBB) then write the byte count,
+ * else postpone the writing to just before updating the
+ * control segment.
+ */
+ if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+#if RTE_CACHE_LINE_SIZE < 64
+ /*
+ * Need a barrier here before writing the byte_count
+ * fields to make sure that all the data is visible
+ * before the byte_count field is set.
+ * Otherwise, if the segment begins a new cacheline,
+ * the HCA prefetcher could grab the 64-byte chunk and
+ * get a valid (!= 0xffffffff) byte count but stale
+ * data, and end up sending the wrong data.
+ */
+ rte_io_wmb();
+#endif /* RTE_CACHE_LINE_SIZE */
+ dseg->byte_count = byte_count;
+ } else {
+ /*
+ * This data segment starts at the beginning of a new
+ * TXBB, so we need to postpone its byte_count writing
+ * for later.
+ */
+ pv[pv_counter].dseg = dseg;
+ pv[pv_counter++].val = byte_count;
+ }
+ }
+ /* Write the first DWORD of each TXBB save earlier. */
+ if (pv_counter) {
+ /* Need a barrier here before writing the byte_count. */
+ rte_io_wmb();
+ for (--pv_counter; pv_counter >= 0; pv_counter--)
+ pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
+ }
+ /* Fill the control parameters for this packet. */
+ ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
+ return nr_txbbs;
+}
+
+/**
+ * DPDK callback for Tx.
+ *
+ * @param dpdk_txq
+ * Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ unsigned int elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int bytes_sent = 0;
+ unsigned int i;
+ unsigned int max;
+ struct mlx4_sq *sq = &txq->msq;
+ int nr_txbbs;
+
+ assert(txq->elts_comp_cd != 0);
+ if (likely(txq->elts_comp != 0))
+ mlx4_txq_complete(txq, elts_n, sq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ unsigned int elts_head_next =
+ (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
+ struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
+ struct txq_elt *elt = &(*txq->elts)[elts_head];
+ uint32_t owner_opcode = MLX4_OPCODE_SEND;
+ volatile struct mlx4_wqe_ctrl_seg *ctrl;
+ volatile struct mlx4_wqe_data_seg *dseg;
+ union {
+ uint32_t flags;
+ uint16_t flags16[2];
+ } srcrb;
+ uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+ uint32_t lkey;
+ uintptr_t addr;
+
+ /* Clean up old buffer. */
+ if (likely(elt->buf != NULL)) {
+ struct rte_mbuf *tmp = elt->buf;
+
+#ifndef NDEBUG
+ /* Poisoning. */
+ memset(elt, 0x66, sizeof(*elt));
+#endif
+ /* Faster than rte_pktmbuf_free(). */
+ do {
+ struct rte_mbuf *next = tmp->next;
+
+ rte_pktmbuf_free_seg(tmp);
+ tmp = next;
+ } while (tmp != NULL);
+ }
+ RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+ if (buf->nb_segs == 1) {
+ /*
+ * Check that there is room for this WQE in the send
+ * queue and that the WQE size is legal
+ */
+ if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs) >=
+ sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) {
+ elt->buf = NULL;
+ break;
+ }
+ /* Get the control and data entries of the WQE. */
+ ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
+ mlx4_get_send_wqe(sq, head_idx);
+ dseg = (volatile struct mlx4_wqe_data_seg *)
+ ((uintptr_t)ctrl +
+ sizeof(struct mlx4_wqe_ctrl_seg));
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ rte_prefetch0((volatile void *)addr);
+ /* Handle WQE wraparound. */
+ if (dseg >=
+ (volatile struct mlx4_wqe_data_seg *)sq->eob)
+ dseg = (volatile struct mlx4_wqe_data_seg *)
+ sq->buf;
+ dseg->addr = rte_cpu_to_be_64(addr);
+ /* Memory region key (big endian). */
+ lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+ dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+ if (unlikely(dseg->lkey ==
+ rte_cpu_to_be_32((uint32_t)-1))) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR association",
+ (void *)txq);
+ /*
+ * Restamp entry in case of failure.
+ * Make sure that size is written correctly
+ * Note that we give ownership to the SW,
+ * not the HW.
+ */
+ ctrl->fence_size =
+ (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+ mlx4_txq_stamp_freed_wqe(sq, head_idx,
+ (sq->head & sq->txbb_cnt) ? 0 : 1);
+ elt->buf = NULL;
+ break;
+ }
+#endif /* NDEBUG */
+ /* Never be TXBB aligned, no need compiler barrier. */
+ dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+ /* Fill the control parameters for this packet. */
+ ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+ nr_txbbs = 1;
+ } else {
+ nr_txbbs = mlx4_tx_burst_segs(buf, txq, &ctrl);
+ if (nr_txbbs < 0) {
+ elt->buf = NULL;
+ break;
+ }
+ }
+ /*
+ * For raw Ethernet, the SOLICIT flag is used to indicate
+ * that no ICRC should be calculated.
+ */
+ txq->elts_comp_cd -= nr_txbbs;
+ if (unlikely(txq->elts_comp_cd <= 0)) {
+ txq->elts_comp_cd = txq->elts_comp_cd_init;
+ srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+ MLX4_WQE_CTRL_CQ_UPDATE);
+ } else {
+ srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
+ }
+ /* Enable HW checksum offload if requested */
+ if (txq->csum &&
+ (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+ const uint64_t is_tunneled = (buf->ol_flags &
+ (PKT_TX_TUNNEL_GRE |
+ PKT_TX_TUNNEL_VXLAN));
+
+ if (is_tunneled && txq->csum_l2tun) {
+ owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+ MLX4_WQE_CTRL_IL4_HDR_CSUM;
+ if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+ srcrb.flags |=
+ RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+ } else {
+ srcrb.flags |=
+ RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ }
+ }
+ if (txq->lb) {
+ /*
+ * Copy destination MAC address to the WQE, this allows
+ * loopback in eSwitch, so that VFs and PF can
+ * communicate with each other.
+ */
+ srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+ ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+ sizeof(uint16_t)));
+ } else {
+ ctrl->imm = 0;
+ }
+ ctrl->srcrb_flags = srcrb.flags;
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ rte_io_wmb();
+ ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+ ((sq->head & sq->txbb_cnt) ?
+ MLX4_BIT_WQE_OWN : 0));
+ sq->head += nr_txbbs;
+ elt->buf = buf;
+ bytes_sent += buf->pkt_len;
+ elts_head = elts_head_next;
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Increment send statistics counters. */
+ txq->stats.opackets += i;
+ txq->stats.obytes += bytes_sent;
+ /* Make sure that descriptors are written before doorbell record. */
+ rte_wmb();
+ /* Ring QP doorbell. */
+ rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
+ txq->elts_head = elts_head;
+ txq->elts_comp += i;
+ return i;
+}
+
+/**
+ * Translate Rx completion flags to packet type.
+ *
+ * @param[in] cqe
+ * Pointer to CQE.
+ *
+ * @return
+ * Packet type for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe)
+{
+ uint8_t idx = 0;
+ uint32_t pinfo = rte_be_to_cpu_32(cqe->vlan_my_qpn);
+ uint32_t status = rte_be_to_cpu_32(cqe->status);
+
+ /*
+ * The index to the array should have:
+ * bit[7] - MLX4_CQE_L2_TUNNEL
+ * bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
+ */
+ if (!(pinfo & MLX4_CQE_L2_VLAN_MASK) && (pinfo & MLX4_CQE_L2_TUNNEL))
+ idx |= ((pinfo & MLX4_CQE_L2_TUNNEL) >> 20) |
+ ((pinfo & MLX4_CQE_L2_TUNNEL_IPV4) >> 19);
+ /*
+ * The index to the array should have:
+ * bit[5] - MLX4_CQE_STATUS_UDP
+ * bit[4] - MLX4_CQE_STATUS_TCP
+ * bit[3] - MLX4_CQE_STATUS_IPV4OPT
+ * bit[2] - MLX4_CQE_STATUS_IPV6
+ * bit[1] - MLX4_CQE_STATUS_IPV4F
+ * bit[0] - MLX4_CQE_STATUS_IPV4
+ * giving a total of up to 256 entries.
+ */
+ idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22);
+ return mlx4_ptype_table[idx];
+}
+
+/**
+ * Translate Rx completion flags to offload flags.
+ *
+ * @param flags
+ * Rx completion flags returned by mlx4_cqe_flags().
+ * @param csum
+ * Whether Rx checksums are enabled.
+ * @param csum_l2tun
+ * Whether Rx L2 tunnel checksums are enabled.
+ *
+ * @return
+ * Offload flags (ol_flags) in mbuf format.
+ */
+static inline uint32_t
+rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun)
+{
+ uint32_t ol_flags = 0;
+
+ if (csum)
+ ol_flags |=
+ mlx4_transpose(flags,
+ MLX4_CQE_STATUS_IP_HDR_CSUM_OK,
+ PKT_RX_IP_CKSUM_GOOD) |
+ mlx4_transpose(flags,
+ MLX4_CQE_STATUS_TCP_UDP_CSUM_OK,
+ PKT_RX_L4_CKSUM_GOOD);
+ if ((flags & MLX4_CQE_L2_TUNNEL) && csum_l2tun)
+ ol_flags |=
+ mlx4_transpose(flags,
+ MLX4_CQE_L2_TUNNEL_IPOK,
+ PKT_RX_IP_CKSUM_GOOD) |
+ mlx4_transpose(flags,
+ MLX4_CQE_L2_TUNNEL_L4_CSUM,
+ PKT_RX_L4_CKSUM_GOOD);
+ return ol_flags;
+}
+
+/**
+ * Extract checksum information from CQE flags.
+ *
+ * @param cqe
+ * Pointer to CQE structure.
+ * @param csum
+ * Whether Rx checksums are enabled.
+ * @param csum_l2tun
+ * Whether Rx L2 tunnel checksums are enabled.
+ *
+ * @return
+ * CQE checksum information.
+ */
+static inline uint32_t
+mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
+{
+ uint32_t flags = 0;
+
+ /*
+ * The relevant bits are in different locations on their
+ * CQE fields therefore we can join them in one 32bit
+ * variable.
+ */
+ if (csum)
+ flags = (rte_be_to_cpu_32(cqe->status) &
+ MLX4_CQE_STATUS_IPV4_CSUM_OK);
+ if (csum_l2tun)
+ flags |= (rte_be_to_cpu_32(cqe->vlan_my_qpn) &
+ (MLX4_CQE_L2_TUNNEL |
+ MLX4_CQE_L2_TUNNEL_IPOK |
+ MLX4_CQE_L2_TUNNEL_L4_CSUM |
+ MLX4_CQE_L2_TUNNEL_IPV4));
+ return flags;
+}
+
+/**
+ * Poll one CQE from CQ.
+ *
+ * @param rxq
+ * Pointer to the receive queue structure.
+ * @param[out] out
+ * Just polled CQE.
+ *
+ * @return
+ * Number of bytes of the CQE, 0 in case there is no completion.
+ */
+static unsigned int
+mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
+{
+ int ret = 0;
+ volatile struct mlx4_cqe *cqe = NULL;
+ struct mlx4_cq *cq = &rxq->mcq;
+
+ cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+ if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(cq->cons_index & cq->cqe_cnt))
+ goto out;
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rte_rmb();
+ assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+ assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+ MLX4_CQE_OPCODE_ERROR);
+ ret = rte_be_to_cpu_32(cqe->byte_cnt);
+ ++cq->cons_index;
+out:
+ *out = cqe;
+ return ret;
+}
+
+/**
+ * DPDK callback for Rx with scattered packets support.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct rxq *rxq = dpdk_rxq;
+ const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
+ const uint16_t sges_n = rxq->sges_n;
+ struct rte_mbuf *pkt = NULL;
+ struct rte_mbuf *seg = NULL;
+ unsigned int i = 0;
+ uint32_t rq_ci = rxq->rq_ci << sges_n;
+ int len = 0;
+
+ while (pkts_n) {
+ volatile struct mlx4_cqe *cqe;
+ uint32_t idx = rq_ci & wr_cnt;
+ struct rte_mbuf *rep = (*rxq->elts)[idx];
+ volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
+
+ /* Update the 'next' pointer of the previous segment. */
+ if (pkt)
+ seg->next = rep;
+ seg = rep;
+ rte_prefetch0(seg);
+ rte_prefetch0(scat);
+ rep = rte_mbuf_raw_alloc(rxq->mp);
+ if (unlikely(rep == NULL)) {
+ ++rxq->stats.rx_nombuf;
+ if (!pkt) {
+ /*
+ * No buffers before we even started,
+ * bail out silently.
+ */
+ break;
+ }
+ while (pkt != seg) {
+ assert(pkt != (*rxq->elts)[idx]);
+ rep = pkt->next;
+ pkt->next = NULL;
+ pkt->nb_segs = 1;
+ rte_mbuf_raw_free(pkt);
+ pkt = rep;
+ }
+ break;
+ }
+ if (!pkt) {
+ /* Looking for the new packet. */
+ len = mlx4_cq_poll_one(rxq, &cqe);
+ if (!len) {
+ rte_mbuf_raw_free(rep);
+ break;
+ }
+ if (unlikely(len < 0)) {
+ /* Rx error, packet is likely too large. */
+ rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ goto skip;
+ }
+ pkt = seg;
+ /* Update packet information. */
+ pkt->packet_type = rxq_cq_to_pkt_type(cqe);
+ pkt->ol_flags = 0;
+ pkt->pkt_len = len;
+ if (rxq->csum | rxq->csum_l2tun) {
+ uint32_t flags =
+ mlx4_cqe_flags(cqe,
+ rxq->csum,
+ rxq->csum_l2tun);
+
+ pkt->ol_flags =
+ rxq_cq_to_ol_flags(flags,
+ rxq->csum,
+ rxq->csum_l2tun);
+ }
+ }
+ rep->nb_segs = 1;
+ rep->port = rxq->port_id;
+ rep->data_len = seg->data_len;
+ rep->data_off = seg->data_off;
+ (*rxq->elts)[idx] = rep;
+ /*
+ * Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes.
+ */
+ scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+ if (len > seg->data_len) {
+ len -= seg->data_len;
+ ++pkt->nb_segs;
+ ++rq_ci;
+ continue;
+ }
+ /* The last segment. */
+ seg->data_len = len;
+ /* Increment bytes counter. */
+ rxq->stats.ibytes += pkt->pkt_len;
+ /* Return packet. */
+ *(pkts++) = pkt;
+ pkt = NULL;
+ --pkts_n;
+ ++i;
+skip:
+ /* Align consumer index to the next stride. */
+ rq_ci >>= sges_n;
+ ++rq_ci;
+ rq_ci <<= sges_n;
+ }
+ if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
+ return 0;
+ /* Update the consumer index. */
+ rxq->rq_ci = rq_ci >> sges_n;
+ rte_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+ *rxq->mcq.set_ci_db =
+ rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK);
+ /* Increment packets counter. */
+ rxq->stats.ipackets += i;
+ return i;
+}
+
+/**
+ * Dummy DPDK callback for Tx.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_txq
+ * Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ (void)dpdk_txq;
+ (void)pkts;
+ (void)pkts_n;
+ return 0;
+}
+
+/**
+ * Dummy DPDK callback for Rx.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ (void)dpdk_rxq;
+ (void)pkts;
+ (void)pkts_n;
+ return 0;
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
new file mode 100644
index 00000000..4acad801
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -0,0 +1,214 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MLX4_RXTX_H_
+#define MLX4_RXTX_H_
+
+#include <stdint.h>
+#include <sys/queue.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/mlx4dv.h>
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_prm.h"
+
+/** Rx queue counters. */
+struct mlx4_rxq_stats {
+ unsigned int idx; /**< Mapping index. */
+ uint64_t ipackets; /**< Total of successfully received packets. */
+ uint64_t ibytes; /**< Total of successfully received bytes. */
+ uint64_t idropped; /**< Total of packets dropped when Rx ring full. */
+ uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
+};
+
+/** Rx queue descriptor. */
+struct rxq {
+ struct priv *priv; /**< Back pointer to private data. */
+ struct rte_mempool *mp; /**< Memory pool for allocations. */
+ struct mlx4_mr *mr; /**< Memory region. */
+ struct ibv_cq *cq; /**< Completion queue. */
+ struct ibv_wq *wq; /**< Work queue. */
+ struct ibv_comp_channel *channel; /**< Rx completion channel. */
+ uint16_t rq_ci; /**< Saved RQ consumer index. */
+ uint16_t port_id; /**< Port ID for incoming packets. */
+ uint16_t sges_n; /**< Number of segments per packet (log2 value). */
+ uint16_t elts_n; /**< Mbuf queue size (log2 value). */
+ struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+ volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */
+ volatile uint32_t *rq_db; /**< RQ doorbell record. */
+ uint32_t csum:1; /**< Enable checksum offloading. */
+ uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+ struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
+ struct mlx4_rxq_stats stats; /**< Rx queue counters. */
+ unsigned int socket; /**< CPU socket ID for allocations. */
+ uint32_t usecnt; /**< Number of users relying on queue resources. */
+ uint8_t data[]; /**< Remaining queue resources. */
+};
+
+/** Shared flow target for Rx queues. */
+struct mlx4_rss {
+ LIST_ENTRY(mlx4_rss) next; /**< Next entry in list. */
+ struct priv *priv; /**< Back pointer to private data. */
+ uint32_t refcnt; /**< Reference count for this object. */
+ uint32_t usecnt; /**< Number of users relying on @p qp and @p ind. */
+ struct ibv_qp *qp; /**< Queue pair. */
+ struct ibv_rwq_ind_table *ind; /**< Indirection table. */
+ uint64_t fields; /**< Fields for RSS processing (Verbs format). */
+ uint8_t key[MLX4_RSS_HASH_KEY_SIZE]; /**< Hash key to use. */
+ uint16_t queues; /**< Number of target queues. */
+ uint16_t queue_id[]; /**< Target queues. */
+};
+
+/** Tx element. */
+struct txq_elt {
+ struct rte_mbuf *buf; /**< Buffer. */
+};
+
+/** Rx queue counters. */
+struct mlx4_txq_stats {
+ unsigned int idx; /**< Mapping index. */
+ uint64_t opackets; /**< Total of successfully sent packets. */
+ uint64_t obytes; /**< Total of successfully sent bytes. */
+ uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+};
+
+/** Tx queue descriptor. */
+struct txq {
+ struct mlx4_sq msq; /**< Info for directly manipulating the SQ. */
+ struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
+ unsigned int elts_head; /**< Current index in (*elts)[]. */
+ unsigned int elts_tail; /**< First element awaiting completion. */
+ unsigned int elts_comp; /**< Number of packets awaiting completion. */
+ int elts_comp_cd; /**< Countdown for next completion. */
+ unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
+ unsigned int elts_n; /**< (*elts)[] length. */
+ struct txq_elt (*elts)[]; /**< Tx elements. */
+ struct mlx4_txq_stats stats; /**< Tx queue counters. */
+ uint32_t max_inline; /**< Max inline send size. */
+ uint32_t csum:1; /**< Enable checksum offloading. */
+ uint32_t csum_l2tun:1; /**< Same for L2 tunnels. */
+ uint32_t lb:1; /**< Whether packets should be looped back by eSwitch. */
+ uint8_t *bounce_buf;
+ /**< Memory used for storing the first DWORD of data TXBBs. */
+ struct {
+ const struct rte_mempool *mp; /**< Cached memory pool. */
+ struct mlx4_mr *mr; /**< Memory region (for mp). */
+ uint32_t lkey; /**< mr->lkey copy. */
+ } mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
+ struct priv *priv; /**< Back pointer to private data. */
+ unsigned int socket; /**< CPU socket ID for allocations. */
+ struct ibv_cq *cq; /**< Completion queue. */
+ struct ibv_qp *qp; /**< Queue pair. */
+ uint8_t data[]; /**< Remaining queue resources. */
+};
+
+/* mlx4_rxq.c */
+
+uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
+int mlx4_rss_init(struct priv *priv);
+void mlx4_rss_deinit(struct priv *priv);
+struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
+ uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+ uint16_t queues, const uint16_t queue_id[]);
+void mlx4_rss_put(struct mlx4_rss *rss);
+int mlx4_rss_attach(struct mlx4_rss *rss);
+void mlx4_rss_detach(struct mlx4_rss *rss);
+int mlx4_rxq_attach(struct rxq *rxq);
+void mlx4_rxq_detach(struct rxq *rxq);
+int mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_rxconf *conf,
+ struct rte_mempool *mp);
+void mlx4_rx_queue_release(void *dpdk_rxq);
+
+/* mlx4_rxtx.c */
+
+uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+
+/* mlx4_txq.c */
+
+int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_txconf *conf);
+void mlx4_tx_queue_release(void *dpdk_txq);
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Call mlx4_txq_add_mr() if MP is not registered yet.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param[in] mp
+ * Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ * mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static inline uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+ unsigned int i;
+
+ for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+ if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ /* Unknown MP, add a new MR for it. */
+ break;
+ }
+ if (txq->mp2mr[i].mp == mp) {
+ /* MP found MP. */
+ return txq->mp2mr[i].lkey;
+ }
+ }
+ return mlx4_txq_add_mr(txq, mp, i);
+}
+
+#endif /* MLX4_RXTX_H_ */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
new file mode 100644
index 00000000..7882a4d0
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -0,0 +1,414 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Tx queues configuration for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#include "mlx4.h"
+#include "mlx4_autoconf.h"
+#include "mlx4_prm.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Free Tx queue elements.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ */
+static void
+mlx4_txq_free_elts(struct txq *txq)
+{
+ unsigned int elts_head = txq->elts_head;
+ unsigned int elts_tail = txq->elts_tail;
+ struct txq_elt (*elts)[txq->elts_n] = txq->elts;
+
+ DEBUG("%p: freeing WRs", (void *)txq);
+ while (elts_tail != elts_head) {
+ struct txq_elt *elt = &(*elts)[elts_tail];
+
+ assert(elt->buf != NULL);
+ rte_pktmbuf_free(elt->buf);
+ elt->buf = NULL;
+ if (++elts_tail == RTE_DIM(*elts))
+ elts_tail = 0;
+ }
+ txq->elts_tail = txq->elts_head;
+}
+
+struct txq_mp2mr_mbuf_check_data {
+ int ret;
+};
+
+/**
+ * Callback function for rte_mempool_obj_iter() to check whether a given
+ * mempool object looks like a mbuf.
+ *
+ * @param[in] mp
+ * The mempool pointer
+ * @param[in] arg
+ * Context data (struct mlx4_txq_mp2mr_mbuf_check_data). Contains the
+ * return value.
+ * @param[in] obj
+ * Object address.
+ * @param index
+ * Object index, unused.
+ */
+static void
+mlx4_txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
+ uint32_t index)
+{
+ struct txq_mp2mr_mbuf_check_data *data = arg;
+ struct rte_mbuf *buf = obj;
+
+ (void)index;
+ /*
+ * Check whether mbuf structure fits element size and whether mempool
+ * pointer is valid.
+ */
+ if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
+ data->ret = -1;
+}
+
+/**
+ * Iterator function for rte_mempool_walk() to register existing mempools and
+ * fill the MP to MR cache of a Tx queue.
+ *
+ * @param[in] mp
+ * Memory Pool to register.
+ * @param *arg
+ * Pointer to Tx queue structure.
+ */
+static void
+mlx4_txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
+{
+ struct txq *txq = arg;
+ struct txq_mp2mr_mbuf_check_data data = {
+ .ret = 0,
+ };
+
+ /* Register mempool only if the first element looks like a mbuf. */
+ if (rte_mempool_obj_iter(mp, mlx4_txq_mp2mr_mbuf_check, &data) == 0 ||
+ data.ret == -1)
+ return;
+ mlx4_txq_mp2mr(txq, mp);
+}
+
+/**
+ * Retrieves information needed in order to directly access the Tx queue.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param mlxdv
+ * Pointer to device information for this Tx queue.
+ */
+static void
+mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
+{
+ struct mlx4_sq *sq = &txq->msq;
+ struct mlx4_cq *cq = &txq->mcq;
+ struct mlx4dv_qp *dqp = mlxdv->qp.out;
+ struct mlx4dv_cq *dcq = mlxdv->cq.out;
+ uint32_t sq_size = (uint32_t)dqp->rq.offset - (uint32_t)dqp->sq.offset;
+
+ sq->buf = (uint8_t *)dqp->buf.buf + dqp->sq.offset;
+ /* Total length, including headroom and spare WQEs. */
+ sq->eob = sq->buf + sq_size;
+ sq->head = 0;
+ sq->tail = 0;
+ sq->txbb_cnt =
+ (dqp->sq.wqe_cnt << dqp->sq.wqe_shift) >> MLX4_TXBB_SHIFT;
+ sq->txbb_cnt_mask = sq->txbb_cnt - 1;
+ sq->db = dqp->sdb;
+ sq->doorbell_qpn = dqp->doorbell_qpn;
+ sq->headroom_txbbs =
+ (2048 + (1 << dqp->sq.wqe_shift)) >> MLX4_TXBB_SHIFT;
+ cq->buf = dcq->buf.buf;
+ cq->cqe_cnt = dcq->cqe_cnt;
+ cq->set_ci_db = dcq->set_ci_db;
+ cq->cqe_64 = (dcq->cqe_size & 64) ? 1 : 0;
+}
+
+/**
+ * DPDK callback to configure a Tx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param idx
+ * Tx queue index.
+ * @param desc
+ * Number of descriptors to configure in queue.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ * Thresholds parameters.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ unsigned int socket, const struct rte_eth_txconf *conf)
+{
+ struct priv *priv = dev->data->dev_private;
+ struct mlx4dv_obj mlxdv;
+ struct mlx4dv_qp dv_qp;
+ struct mlx4dv_cq dv_cq;
+ struct txq_elt (*elts)[desc];
+ struct ibv_qp_init_attr qp_init_attr;
+ struct txq *txq;
+ uint8_t *bounce_buf;
+ struct mlx4_malloc_vec vec[] = {
+ {
+ .align = RTE_CACHE_LINE_SIZE,
+ .size = sizeof(*txq),
+ .addr = (void **)&txq,
+ },
+ {
+ .align = RTE_CACHE_LINE_SIZE,
+ .size = sizeof(*elts),
+ .addr = (void **)&elts,
+ },
+ {
+ .align = RTE_CACHE_LINE_SIZE,
+ .size = MLX4_MAX_WQE_SIZE,
+ .addr = (void **)&bounce_buf,
+ },
+ };
+ int ret;
+
+ (void)conf; /* Thresholds configuration (ignored). */
+ DEBUG("%p: configuring queue %u for %u descriptors",
+ (void *)dev, idx, desc);
+ if (idx >= dev->data->nb_tx_queues) {
+ rte_errno = EOVERFLOW;
+ ERROR("%p: queue index out of range (%u >= %u)",
+ (void *)dev, idx, dev->data->nb_tx_queues);
+ return -rte_errno;
+ }
+ txq = dev->data->tx_queues[idx];
+ if (txq) {
+ rte_errno = EEXIST;
+ DEBUG("%p: Tx queue %u already configured, release it first",
+ (void *)dev, idx);
+ return -rte_errno;
+ }
+ if (!desc) {
+ rte_errno = EINVAL;
+ ERROR("%p: invalid number of Tx descriptors", (void *)dev);
+ return -rte_errno;
+ }
+ /* Allocate and initialize Tx queue. */
+ mlx4_zmallocv_socket("TXQ", vec, RTE_DIM(vec), socket);
+ if (!txq) {
+ ERROR("%p: unable to allocate queue index %u",
+ (void *)dev, idx);
+ return -rte_errno;
+ }
+ *txq = (struct txq){
+ .priv = priv,
+ .stats = {
+ .idx = idx,
+ },
+ .socket = socket,
+ .elts_n = desc,
+ .elts = elts,
+ .elts_head = 0,
+ .elts_tail = 0,
+ .elts_comp = 0,
+ /*
+ * Request send completion every MLX4_PMD_TX_PER_COMP_REQ
+ * packets or at least 4 times per ring.
+ */
+ .elts_comp_cd =
+ RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
+ .elts_comp_cd_init =
+ RTE_MIN(MLX4_PMD_TX_PER_COMP_REQ, desc / 4),
+ .csum = priv->hw_csum,
+ .csum_l2tun = priv->hw_csum_l2tun,
+ /* Enable Tx loopback for VF devices. */
+ .lb = !!priv->vf,
+ .bounce_buf = bounce_buf,
+ };
+ txq->cq = ibv_create_cq(priv->ctx, desc, NULL, NULL, 0);
+ if (!txq->cq) {
+ rte_errno = ENOMEM;
+ ERROR("%p: CQ creation failure: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
+ qp_init_attr = (struct ibv_qp_init_attr){
+ .send_cq = txq->cq,
+ .recv_cq = txq->cq,
+ .cap = {
+ .max_send_wr =
+ RTE_MIN(priv->device_attr.max_qp_wr, desc),
+ .max_send_sge = 1,
+ .max_inline_data = MLX4_PMD_MAX_INLINE,
+ },
+ .qp_type = IBV_QPT_RAW_PACKET,
+ /* No completion events must occur by default. */
+ .sq_sig_all = 0,
+ };
+ txq->qp = ibv_create_qp(priv->pd, &qp_init_attr);
+ if (!txq->qp) {
+ rte_errno = errno ? errno : EINVAL;
+ ERROR("%p: QP creation failure: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
+ txq->max_inline = qp_init_attr.cap.max_inline_data;
+ ret = ibv_modify_qp
+ (txq->qp,
+ &(struct ibv_qp_attr){
+ .qp_state = IBV_QPS_INIT,
+ .port_num = priv->port,
+ },
+ IBV_QP_STATE | IBV_QP_PORT);
+ if (ret) {
+ rte_errno = ret;
+ ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
+ ret = ibv_modify_qp
+ (txq->qp,
+ &(struct ibv_qp_attr){
+ .qp_state = IBV_QPS_RTR,
+ },
+ IBV_QP_STATE);
+ if (ret) {
+ rte_errno = ret;
+ ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
+ ret = ibv_modify_qp
+ (txq->qp,
+ &(struct ibv_qp_attr){
+ .qp_state = IBV_QPS_RTS,
+ },
+ IBV_QP_STATE);
+ if (ret) {
+ rte_errno = ret;
+ ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
+ /* Retrieve device queue information. */
+ mlxdv.cq.in = txq->cq;
+ mlxdv.cq.out = &dv_cq;
+ mlxdv.qp.in = txq->qp;
+ mlxdv.qp.out = &dv_qp;
+ ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ);
+ if (ret) {
+ rte_errno = EINVAL;
+ ERROR("%p: failed to obtain information needed for"
+ " accessing the device queues", (void *)dev);
+ goto error;
+ }
+ mlx4_txq_fill_dv_obj_info(txq, &mlxdv);
+ /* Pre-register known mempools. */
+ rte_mempool_walk(mlx4_txq_mp2mr_iter, txq);
+ DEBUG("%p: adding Tx queue %p to list", (void *)dev, (void *)txq);
+ dev->data->tx_queues[idx] = txq;
+ return 0;
+error:
+ dev->data->tx_queues[idx] = NULL;
+ ret = rte_errno;
+ mlx4_tx_queue_release(txq);
+ rte_errno = ret;
+ assert(rte_errno > 0);
+ return -rte_errno;
+}
+
+/**
+ * DPDK callback to release a Tx queue.
+ *
+ * @param dpdk_txq
+ * Generic Tx queue pointer.
+ */
+void
+mlx4_tx_queue_release(void *dpdk_txq)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ struct priv *priv;
+ unsigned int i;
+
+ if (txq == NULL)
+ return;
+ priv = txq->priv;
+ for (i = 0; i != priv->dev->data->nb_tx_queues; ++i)
+ if (priv->dev->data->tx_queues[i] == txq) {
+ DEBUG("%p: removing Tx queue %p from list",
+ (void *)priv->dev, (void *)txq);
+ priv->dev->data->tx_queues[i] = NULL;
+ break;
+ }
+ mlx4_txq_free_elts(txq);
+ if (txq->qp)
+ claim_zero(ibv_destroy_qp(txq->qp));
+ if (txq->cq)
+ claim_zero(ibv_destroy_cq(txq->cq));
+ for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) {
+ if (!txq->mp2mr[i].mp)
+ break;
+ assert(txq->mp2mr[i].mr);
+ mlx4_mr_put(txq->mp2mr[i].mr);
+ }
+ rte_free(txq);
+}
diff --git a/drivers/net/mlx4/mlx4_utils.c b/drivers/net/mlx4/mlx4_utils.c
new file mode 100644
index 00000000..f18c7145
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_utils.c
@@ -0,0 +1,217 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Utility functions used by the mlx4 driver.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_memory.h>
+
+#include "mlx4_utils.h"
+
+/**
+ * Make a file descriptor non-blocking.
+ *
+ * @param fd
+ * File descriptor to alter.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_fd_set_non_blocking(int fd)
+{
+ int ret = fcntl(fd, F_GETFL);
+
+ if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
+ return 0;
+ assert(errno);
+ rte_errno = errno;
+ return -rte_errno;
+}
+
+/**
+ * Internal helper to allocate memory once for several disparate objects.
+ *
+ * The most restrictive alignment constraint for standard objects is assumed
+ * to be sizeof(double) and is used as a default value.
+ *
+ * C11 code would include stdalign.h and use alignof(max_align_t) however
+ * we'll stick with C99 for the time being.
+ */
+static inline size_t
+mlx4_mallocv_inline(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt, int zero, int socket)
+{
+ unsigned int i;
+ size_t size;
+ size_t least;
+ uint8_t *data = NULL;
+ int fill = !vec[0].addr;
+
+fill:
+ size = 0;
+ least = 0;
+ for (i = 0; i < cnt; ++i) {
+ size_t align = (uintptr_t)vec[i].align;
+
+ if (!align) {
+ align = sizeof(double);
+ } else if (!rte_is_power_of_2(align)) {
+ rte_errno = EINVAL;
+ goto error;
+ }
+ if (least < align)
+ least = align;
+ align = RTE_ALIGN_CEIL(size, align);
+ size = align + vec[i].size;
+ if (fill && vec[i].addr)
+ *vec[i].addr = data + align;
+ }
+ if (fill)
+ return size;
+ if (!zero)
+ data = rte_malloc_socket(type, size, least, socket);
+ else
+ data = rte_zmalloc_socket(type, size, least, socket);
+ if (data) {
+ fill = 1;
+ goto fill;
+ }
+ rte_errno = ENOMEM;
+error:
+ for (i = 0; i != cnt; ++i)
+ if (vec[i].addr)
+ *vec[i].addr = NULL;
+ return 0;
+}
+
+/**
+ * Allocate memory once for several disparate objects.
+ *
+ * This function adds iovec-like semantics (e.g. readv()) to rte_malloc().
+ * Memory is allocated once for several contiguous objects of nonuniform
+ * sizes and alignment constraints.
+ *
+ * Each entry of @p vec describes the size, alignment constraint and
+ * provides a buffer address where the resulting object pointer must be
+ * stored.
+ *
+ * The buffer of the first entry is guaranteed to point to the beginning of
+ * the allocated region and is safe to use with rte_free().
+ *
+ * NULL buffers are silently ignored.
+ *
+ * Providing a NULL buffer in the first entry prevents this function from
+ * allocating any memory but has otherwise no effect on its behavior. In
+ * this case, the contents of remaining non-NULL buffers are updated with
+ * addresses relative to zero (i.e. offsets that would have been used during
+ * the allocation).
+ *
+ * @param[in] type
+ * A string identifying the type of allocated objects (useful for debug
+ * purposes, such as identifying the cause of a memory leak). Can be NULL.
+ * @param[in, out] vec
+ * Description of objects to allocate memory for.
+ * @param cnt
+ * Number of entries in @p vec.
+ *
+ * @return
+ * Size in bytes of the allocated region including any padding. In case of
+ * error, rte_errno is set, 0 is returned and NULL is stored in the
+ * non-NULL buffers pointed by @p vec.
+ *
+ * @see struct mlx4_malloc_vec
+ * @see rte_malloc()
+ */
+size_t
+mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt)
+{
+ return mlx4_mallocv_inline(type, vec, cnt, 0, SOCKET_ID_ANY);
+}
+
+/**
+ * Combines the semantics of mlx4_mallocv() with those of rte_zmalloc().
+ *
+ * @see mlx4_mallocv()
+ * @see rte_zmalloc()
+ */
+size_t
+mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt)
+{
+ return mlx4_mallocv_inline(type, vec, cnt, 1, SOCKET_ID_ANY);
+}
+
+/**
+ * Socket-aware version of mlx4_mallocv().
+ *
+ * This function takes one additional parameter.
+ *
+ * @param socket
+ * NUMA socket to allocate memory on. If SOCKET_ID_ANY is used, this
+ * function will behave the same as mlx4_mallocv().
+ *
+ * @see mlx4_mallocv()
+ * @see rte_malloc_socket()
+ */
+size_t
+mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt, int socket)
+{
+ return mlx4_mallocv_inline(type, vec, cnt, 0, socket);
+}
+
+/**
+ * Combines the semantics of mlx4_mallocv_socket() with those of
+ * mlx4_zmalloc_socket().
+ *
+ * @see mlx4_mallocv_socket()
+ * @see rte_zmalloc_socket()
+ */
+size_t
+mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt, int socket)
+{
+ return mlx4_mallocv_inline(type, vec, cnt, 1, socket);
+}
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
new file mode 100644
index 00000000..dc529c9c
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -0,0 +1,133 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MLX4_UTILS_H_
+#define MLX4_UTILS_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "mlx4.h"
+
+#ifndef NDEBUG
+
+/*
+ * When debugging is enabled (NDEBUG not defined), file, line and function
+ * information replace the driver name (MLX4_DRIVER_NAME) in log messages.
+ */
+
+/** Return the file name part of a path. */
+static inline const char *
+pmd_drv_log_basename(const char *s)
+{
+ const char *n = s;
+
+ while (*n)
+ if (*(n++) == '/')
+ s = n;
+ return s;
+}
+
+#define PMD_DRV_LOG(level, ...) \
+ RTE_LOG(level, PMD, \
+ RTE_FMT("%s:%u: %s(): " RTE_FMT_HEAD(__VA_ARGS__,) "\n", \
+ pmd_drv_log_basename(__FILE__), \
+ __LINE__, \
+ __func__, \
+ RTE_FMT_TAIL(__VA_ARGS__,)))
+#define DEBUG(...) PMD_DRV_LOG(DEBUG, __VA_ARGS__)
+#ifndef MLX4_PMD_DEBUG_BROKEN_VERBS
+#define claim_zero(...) assert((__VA_ARGS__) == 0)
+#else /* MLX4_PMD_DEBUG_BROKEN_VERBS */
+#define claim_zero(...) \
+ (void)(((__VA_ARGS__) == 0) || \
+ DEBUG("Assertion `(" # __VA_ARGS__ ") == 0' failed (IGNORED)."))
+#endif /* MLX4_PMD_DEBUG_BROKEN_VERBS */
+
+#else /* NDEBUG */
+
+/*
+ * Like assert(), DEBUG() becomes a no-op and claim_zero() does not perform
+ * any check when debugging is disabled.
+ */
+
+#define PMD_DRV_LOG(level, ...) \
+ RTE_LOG(level, PMD, \
+ RTE_FMT(MLX4_DRIVER_NAME ": " \
+ RTE_FMT_HEAD(__VA_ARGS__,) "\n", \
+ RTE_FMT_TAIL(__VA_ARGS__,)))
+#define DEBUG(...) (void)0
+#define claim_zero(...) (__VA_ARGS__)
+
+#endif /* NDEBUG */
+
+#define INFO(...) PMD_DRV_LOG(INFO, __VA_ARGS__)
+#define WARN(...) PMD_DRV_LOG(WARNING, __VA_ARGS__)
+#define ERROR(...) PMD_DRV_LOG(ERR, __VA_ARGS__)
+
+/** Allocate a buffer on the stack and fill it with a printf format string. */
+#define MKSTR(name, ...) \
+ char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \
+ \
+ snprintf(name, sizeof(name), __VA_ARGS__)
+
+/** Generate a string out of the provided arguments. */
+#define MLX4_STR(...) # __VA_ARGS__
+
+/** Similar to MLX4_STR() with enclosed macros expanded first. */
+#define MLX4_STR_EXPAND(...) MLX4_STR(__VA_ARGS__)
+
+/** Object description used with mlx4_mallocv() and similar functions. */
+struct mlx4_malloc_vec {
+ size_t align; /**< Alignment constraint (power of 2), 0 if unknown. */
+ size_t size; /**< Object size. */
+ void **addr; /**< Storage for allocation address. */
+};
+
+/* mlx4_utils.c */
+
+int mlx4_fd_set_non_blocking(int fd);
+size_t mlx4_mallocv(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt);
+size_t mlx4_zmallocv(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt);
+size_t mlx4_mallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt, int socket);
+size_t mlx4_zmallocv_socket(const char *type, const struct mlx4_malloc_vec *vec,
+ unsigned int cnt, int socket);
+
+#endif /* MLX4_UTILS_H_ */