diff options
Diffstat (limited to 'drivers/net/mlx4')
-rw-r--r-- | drivers/net/mlx4/Makefile | 11 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4.c | 772 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4.h | 198 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_flow.c | 1090 | ||||
-rw-r--r-- | drivers/net/mlx4/mlx4_flow.h | 102 |
5 files changed, 1854 insertions, 319 deletions
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile index efed953e..e873fb48 100644 --- a/drivers/net/mlx4/Makefile +++ b/drivers/net/mlx4/Makefile @@ -36,12 +36,7 @@ LIB = librte_pmd_mlx4.a # Sources. SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c - -# Dependencies. -DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += lib/librte_ether -DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += lib/librte_mbuf -DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += lib/librte_eal -DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += lib/librte_mempool +SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c # Basic CFLAGS. CFLAGS += -O3 @@ -102,7 +97,7 @@ endif mlx4_autoconf.h.new: FORCE -mlx4_autoconf.h.new: $(RTE_SDK)/scripts/auto-config-h.sh +mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh $Q $(RM) -f -- '$@' $Q sh -- '$<' '$@' \ RSS_SUPPORT \ @@ -129,7 +124,7 @@ mlx4_autoconf.h: mlx4_autoconf.h.new cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \ mv '$<' '$@' -mlx4.o: mlx4_autoconf.h +$(SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD):.c=.o): mlx4_autoconf.h clean_mlx4: FORCE $Q rm -f -- mlx4_autoconf.h mlx4_autoconf.h.new diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index 6d43a977..ec4419a8 100644 --- a/drivers/net/mlx4/mlx4.c +++ b/drivers/net/mlx4/mlx4.c @@ -1,8 +1,8 @@ /*- * BSD LICENSE * - * Copyright 2012-2015 6WIND S.A. - * Copyright 2012 Mellanox. + * Copyright 2012-2017 6WIND S.A. + * Copyright 2012-2017 Mellanox. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -58,22 +58,9 @@ #include <linux/sockios.h> #include <fcntl.h> -/* Verbs header. */ -/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ -#ifdef PEDANTIC -#pragma GCC diagnostic ignored "-Wpedantic" -#endif -#include <infiniband/verbs.h> -#ifdef PEDANTIC -#pragma GCC diagnostic error "-Wpedantic" -#endif - -/* DPDK headers don't like -pedantic. */ -#ifdef PEDANTIC -#pragma GCC diagnostic ignored "-Wpedantic" -#endif #include <rte_ether.h> #include <rte_ethdev.h> +#include <rte_ethdev_pci.h> #include <rte_dev.h> #include <rte_mbuf.h> #include <rte_errno.h> @@ -86,30 +73,15 @@ #include <rte_log.h> #include <rte_alarm.h> #include <rte_memory.h> -#ifdef PEDANTIC -#pragma GCC diagnostic error "-Wpedantic" -#endif +#include <rte_flow.h> +#include <rte_kvargs.h> /* Generated configuration header. */ #include "mlx4_autoconf.h" -/* PMD header. */ +/* PMD headers. */ #include "mlx4.h" - -/* Runtime logging through RTE_LOG() is enabled when not in debugging mode. - * Intermediate LOG_*() macros add the required end-of-line characters. */ -#ifndef NDEBUG -#define INFO(...) DEBUG(__VA_ARGS__) -#define WARN(...) DEBUG(__VA_ARGS__) -#define ERROR(...) DEBUG(__VA_ARGS__) -#else -#define LOG__(level, m, ...) \ - RTE_LOG(level, PMD, MLX4_DRIVER_NAME ": " m "%c", __VA_ARGS__) -#define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n') -#define INFO(...) LOG_(INFO, __VA_ARGS__) -#define WARN(...) LOG_(WARNING, __VA_ARGS__) -#define ERROR(...) LOG_(ERR, __VA_ARGS__) -#endif +#include "mlx4_flow.h" /* Convenience macros for accessing mbuf fields. */ #define NEXT(m) ((m)->next) @@ -137,157 +109,6 @@ typedef union { (((val) & (from)) / ((from) / (to))) : \ (((val) & (from)) * ((to) / (from)))) -struct mlx4_rxq_stats { - unsigned int idx; /**< Mapping index. */ -#ifdef MLX4_PMD_SOFT_COUNTERS - uint64_t ipackets; /**< Total of successfully received packets. */ - uint64_t ibytes; /**< Total of successfully received bytes. */ -#endif - uint64_t idropped; /**< Total of packets dropped when RX ring full. */ - uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */ -}; - -struct mlx4_txq_stats { - unsigned int idx; /**< Mapping index. */ -#ifdef MLX4_PMD_SOFT_COUNTERS - uint64_t opackets; /**< Total of successfully sent packets. */ - uint64_t obytes; /**< Total of successfully sent bytes. */ -#endif - uint64_t odropped; /**< Total of packets not sent when TX ring full. */ -}; - -/* RX element (scattered packets). */ -struct rxq_elt_sp { - struct ibv_recv_wr wr; /* Work Request. */ - struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */ - struct rte_mbuf *bufs[MLX4_PMD_SGE_WR_N]; /* SGEs buffers. */ -}; - -/* RX element. */ -struct rxq_elt { - struct ibv_recv_wr wr; /* Work Request. */ - struct ibv_sge sge; /* Scatter/Gather Element. */ - /* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */ -}; - -/* RX queue descriptor. */ -struct rxq { - struct priv *priv; /* Back pointer to private data. */ - struct rte_mempool *mp; /* Memory Pool for allocations. */ - struct ibv_mr *mr; /* Memory Region (for mp). */ - struct ibv_cq *cq; /* Completion Queue. */ - struct ibv_qp *qp; /* Queue Pair. */ - struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ - struct ibv_exp_cq_family *if_cq; /* CQ interface. */ - /* - * Each VLAN ID requires a separate flow steering rule. - */ - BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES); - struct ibv_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES][MLX4_MAX_VLAN_IDS]; - struct ibv_flow *promisc_flow; /* Promiscuous flow. */ - struct ibv_flow *allmulti_flow; /* Multicast flow. */ - unsigned int port_id; /* Port ID for incoming packets. */ - unsigned int elts_n; /* (*elts)[] length. */ - unsigned int elts_head; /* Current index in (*elts)[]. */ - union { - struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */ - struct rxq_elt (*no_sp)[]; /* RX elements. */ - } elts; - unsigned int sp:1; /* Use scattered RX elements. */ - unsigned int csum:1; /* Enable checksum offloading. */ - unsigned int csum_l2tun:1; /* Same for L2 tunnels. */ - struct mlx4_rxq_stats stats; /* RX queue counters. */ - unsigned int socket; /* CPU socket ID for allocations. */ - struct ibv_exp_res_domain *rd; /* Resource Domain. */ -}; - -/* TX element. */ -struct txq_elt { - struct rte_mbuf *buf; -}; - -/* Linear buffer type. It is used when transmitting buffers with too many - * segments that do not fit the hardware queue (see max_send_sge). - * Extra segments are copied (linearized) in such buffers, replacing the - * last SGE during TX. - * The size is arbitrary but large enough to hold a jumbo frame with - * 8 segments considering mbuf.buf_len is about 2048 bytes. */ -typedef uint8_t linear_t[16384]; - -/* TX queue descriptor. */ -struct txq { - struct priv *priv; /* Back pointer to private data. */ - struct { - const struct rte_mempool *mp; /* Cached Memory Pool. */ - struct ibv_mr *mr; /* Memory Region (for mp). */ - uint32_t lkey; /* mr->lkey */ - } mp2mr[MLX4_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ - struct ibv_cq *cq; /* Completion Queue. */ - struct ibv_qp *qp; /* Queue Pair. */ - struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ - struct ibv_exp_cq_family *if_cq; /* CQ interface. */ -#if MLX4_PMD_MAX_INLINE > 0 - uint32_t max_inline; /* Max inline send size <= MLX4_PMD_MAX_INLINE. */ -#endif - unsigned int elts_n; /* (*elts)[] length. */ - struct txq_elt (*elts)[]; /* TX elements. */ - unsigned int elts_head; /* Current index in (*elts)[]. */ - unsigned int elts_tail; /* First element awaiting completion. */ - unsigned int elts_comp; /* Number of completion requests. */ - unsigned int elts_comp_cd; /* Countdown for next completion request. */ - unsigned int elts_comp_cd_init; /* Initial value for countdown. */ - struct mlx4_txq_stats stats; /* TX queue counters. */ - linear_t (*elts_linear)[]; /* Linearized buffers. */ - struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */ - unsigned int socket; /* CPU socket ID for allocations. */ - struct ibv_exp_res_domain *rd; /* Resource Domain. */ -}; - -struct priv { - struct rte_eth_dev *dev; /* Ethernet device. */ - struct ibv_context *ctx; /* Verbs context. */ - struct ibv_device_attr device_attr; /* Device properties. */ - struct ibv_pd *pd; /* Protection Domain. */ - /* - * MAC addresses array and configuration bit-field. - * An extra entry that cannot be modified by the DPDK is reserved - * for broadcast frames (destination MAC address ff:ff:ff:ff:ff:ff). - */ - struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES]; - BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES); - /* VLAN filters. */ - struct { - unsigned int enabled:1; /* If enabled. */ - unsigned int id:12; /* VLAN ID (0-4095). */ - } vlan_filter[MLX4_MAX_VLAN_IDS]; /* VLAN filters table. */ - /* Device properties. */ - uint16_t mtu; /* Configured MTU. */ - uint8_t port; /* Physical port number. */ - unsigned int started:1; /* Device started, flows enabled. */ - unsigned int promisc:1; /* Device in promiscuous mode. */ - unsigned int allmulti:1; /* Device receives all multicast packets. */ - unsigned int hw_qpg:1; /* QP groups are supported. */ - unsigned int hw_tss:1; /* TSS is supported. */ - unsigned int hw_rss:1; /* RSS is supported. */ - unsigned int hw_csum:1; /* Checksum offload is supported. */ - unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */ - unsigned int rss:1; /* RSS is enabled. */ - unsigned int vf:1; /* This is a VF device. */ - unsigned int pending_alarm:1; /* An alarm is pending. */ -#ifdef INLINE_RECV - unsigned int inl_recv_size; /* Inline recv size */ -#endif - unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */ - /* RX/TX queues. */ - struct rxq rxq_parent; /* Parent queue when RSS is enabled. */ - unsigned int rxqs_n; /* RX queues array size. */ - unsigned int txqs_n; /* TX queues array size. */ - struct rxq *(*rxqs)[]; /* RX queues. */ - struct txq *(*txqs)[]; /* TX queues. */ - struct rte_intr_handle intr_handle; /* Interrupt handler. */ - rte_spinlock_t lock; /* Lock for control functions. */ -}; - /* Local storage for secondary process data. */ struct mlx4_secondary_data { struct rte_eth_dev_data data; /* Local device data. */ @@ -296,6 +117,16 @@ struct mlx4_secondary_data { rte_spinlock_t lock; /* Port configuration lock. */ } mlx4_secondary_data[RTE_MAX_ETHPORTS]; +struct mlx4_conf { + uint8_t active_ports; +}; + +/* Available parameters list. */ +const char *pmd_mlx4_init_params[] = { + MLX4_PMD_PORT_KVARG, + NULL, +}; + /** * Check if running as a secondary process. * @@ -335,8 +166,7 @@ mlx4_get_priv(struct rte_eth_dev *dev) * @param priv * Pointer to private structure. */ -static void -priv_lock(struct priv *priv) +void priv_lock(struct priv *priv) { rte_spinlock_lock(&priv->lock); } @@ -347,8 +177,7 @@ priv_lock(struct priv *priv) * @param priv * Pointer to private structure. */ -static void -priv_unlock(struct priv *priv) +void priv_unlock(struct priv *priv) { rte_spinlock_unlock(&priv->lock); } @@ -2526,6 +2355,7 @@ rxq_add_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index) assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec); *attr = (struct ibv_flow_attr){ .type = IBV_FLOW_ATTR_NORMAL, + .priority = 3, .num_of_specs = 1, .port = priv->port, .flags = 0 @@ -3340,6 +3170,8 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) /* Increase out of memory counters. */ ++rxq->stats.rx_nombuf; ++rxq->priv->dev->data->rx_mbuf_alloc_failed; + /* Add SGE to array for repost. */ + sges[i] = elt->sge; goto repost; } @@ -3604,7 +3436,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq) } /* Enable scattered packets support for this queue if necessary. */ assert(mb_len >= RTE_PKTMBUF_HEADROOM); - if ((dev->data->dev_conf.rxmode.jumbo_frame) && + if (dev->data->dev_conf.rxmode.enable_scatter && (dev->data->dev_conf.rxmode.max_rx_pkt_len > (mb_len - RTE_PKTMBUF_HEADROOM))) { tmpl.sp = 1; @@ -3826,11 +3658,19 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum; /* Enable scattered packets support for this queue if necessary. */ assert(mb_len >= RTE_PKTMBUF_HEADROOM); - if ((dev->data->dev_conf.rxmode.jumbo_frame) && - (dev->data->dev_conf.rxmode.max_rx_pkt_len > - (mb_len - RTE_PKTMBUF_HEADROOM))) { + if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= + (mb_len - RTE_PKTMBUF_HEADROOM)) { + tmpl.sp = 0; + } else if (dev->data->dev_conf.rxmode.enable_scatter) { tmpl.sp = 1; desc /= MLX4_PMD_SGE_WR_N; + } else { + WARN("%p: the requested maximum Rx packet size (%u) is" + " larger than a single mbuf (%u) and scattered" + " mode has not been requested", + (void *)dev, + dev->data->dev_conf.rxmode.max_rx_pkt_len, + mb_len - RTE_PKTMBUF_HEADROOM); } DEBUG("%p: %s scattered packets support (%u WRs)", (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc); @@ -4092,9 +3932,15 @@ mlx4_rx_queue_release(void *dpdk_rxq) priv_unlock(priv); } -static void +static int priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *); +static int +priv_dev_removal_interrupt_handler_install(struct priv *, struct rte_eth_dev *); + +static int +priv_dev_link_interrupt_handler_install(struct priv *, struct rte_eth_dev *); + /** * DPDK callback to start the device. * @@ -4113,6 +3959,7 @@ mlx4_dev_start(struct rte_eth_dev *dev) unsigned int i = 0; unsigned int r; struct rxq *rxq; + int ret; if (mlx4_is_secondary()) return -E_RTE_SECONDARY; @@ -4132,8 +3979,6 @@ mlx4_dev_start(struct rte_eth_dev *dev) } /* Iterate only once when RSS is enabled. */ do { - int ret; - /* Ignore nonexistent RX queues. */ if (rxq == NULL) continue; @@ -4146,22 +3991,41 @@ mlx4_dev_start(struct rte_eth_dev *dev) continue; WARN("%p: QP flow attachment failed: %s", (void *)dev, strerror(ret)); - /* Rollback. */ - while (i != 0) { - rxq = (*priv->rxqs)[--i]; - if (rxq != NULL) { - rxq_allmulticast_disable(rxq); - rxq_promiscuous_disable(rxq); - rxq_mac_addrs_del(rxq); - } - } - priv->started = 0; - priv_unlock(priv); - return -ret; + goto err; } while ((--r) && ((rxq = (*priv->rxqs)[++i]), i)); - priv_dev_interrupt_handler_install(priv, dev); + ret = priv_dev_link_interrupt_handler_install(priv, dev); + if (ret) { + ERROR("%p: LSC handler install failed", + (void *)dev); + goto err; + } + ret = priv_dev_removal_interrupt_handler_install(priv, dev); + if (ret) { + ERROR("%p: RMV handler install failed", + (void *)dev); + goto err; + } + ret = mlx4_priv_flow_start(priv); + if (ret) { + ERROR("%p: flow start failed: %s", + (void *)dev, strerror(ret)); + goto err; + } priv_unlock(priv); return 0; +err: + /* Rollback. */ + while (i != 0) { + rxq = (*priv->rxqs)[i--]; + if (rxq != NULL) { + rxq_allmulticast_disable(rxq); + rxq_promiscuous_disable(rxq); + rxq_mac_addrs_del(rxq); + } + } + priv->started = 0; + priv_unlock(priv); + return -ret; } /** @@ -4196,6 +4060,7 @@ mlx4_dev_stop(struct rte_eth_dev *dev) rxq = (*priv->rxqs)[0]; r = priv->rxqs_n; } + mlx4_priv_flow_stop(priv); /* Iterate only once when RSS is enabled. */ do { /* Ignore nonexistent RX queues. */ @@ -4258,9 +4123,16 @@ removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) return 0; } -static void +static int priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *); +static int +priv_dev_removal_interrupt_handler_uninstall(struct priv *, + struct rte_eth_dev *); + +static int +priv_dev_link_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *); + /** * DPDK callback to close the device. * @@ -4323,7 +4195,8 @@ mlx4_dev_close(struct rte_eth_dev *dev) claim_zero(ibv_close_device(priv->ctx)); } else assert(priv->ctx == NULL); - priv_dev_interrupt_handler_uninstall(priv, dev); + priv_dev_removal_interrupt_handler_uninstall(priv, dev); + priv_dev_link_interrupt_handler_uninstall(priv, dev); priv_unlock(priv); memset(priv, 0, sizeof(*priv)); } @@ -4427,6 +4300,8 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) unsigned int max; char ifname[IF_NAMESIZE]; + info->pci_dev = RTE_DEV_TO_PCI(dev->device); + if (priv == NULL) return; priv_lock(priv); @@ -4628,26 +4503,30 @@ end: * @param vmdq * VMDq pool index to associate address with (ignored). */ -static void +static int mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, uint32_t index, uint32_t vmdq) { struct priv *priv = dev->data->dev_private; + int re; if (mlx4_is_secondary()) - return; + return -ENOTSUP; (void)vmdq; priv_lock(priv); DEBUG("%p: adding MAC address at index %" PRIu32, (void *)dev, index); /* Last array entry is reserved for broadcast. */ - if (index >= (elemof(priv->mac) - 1)) + if (index >= (elemof(priv->mac) - 1)) { + re = EINVAL; goto end; - priv_mac_addr_add(priv, index, - (const uint8_t (*)[ETHER_ADDR_LEN]) - mac_addr->addr_bytes); + } + re = priv_mac_addr_add(priv, index, + (const uint8_t (*)[ETHER_ADDR_LEN]) + mac_addr->addr_bytes); end: priv_unlock(priv); + return -re; } /** @@ -4827,7 +4706,7 @@ end: } /** - * DPDK callback to retrieve physical link information (unlocked version). + * DPDK callback to retrieve physical link information. * * @param dev * Pointer to Ethernet device structure. @@ -4835,9 +4714,9 @@ end: * Wait for request completion (ignored). */ static int -mlx4_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) +mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete) { - struct priv *priv = mlx4_get_priv(dev); + const struct priv *priv = mlx4_get_priv(dev); struct ethtool_cmd edata = { .cmd = ETHTOOL_GSET }; @@ -4845,6 +4724,8 @@ mlx4_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) struct rte_eth_link dev_link; int link_speed = 0; + /* priv_lock() is not taken to allow concurrent calls. */ + if (priv == NULL) return -EINVAL; (void)wait_to_complete; @@ -4879,27 +4760,9 @@ mlx4_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete) return -1; } -/** - * DPDK callback to retrieve physical link information. - * - * @param dev - * Pointer to Ethernet device structure. - * @param wait_to_complete - * Wait for request completion (ignored). - */ static int -mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete) -{ - struct priv *priv = mlx4_get_priv(dev); - int ret; - - if (priv == NULL) - return -EINVAL; - priv_lock(priv); - ret = mlx4_link_update_unlocked(dev, wait_to_complete); - priv_unlock(priv); - return ret; -} +mlx4_ibv_device_to_pci_addr(const struct ibv_device *device, + struct rte_pci_addr *pci_addr); /** * DPDK callback to change the MTU. @@ -4949,21 +4812,16 @@ mlx4_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) /* Reconfigure each RX queue. */ for (i = 0; (i != priv->rxqs_n); ++i) { struct rxq *rxq = (*priv->rxqs)[i]; - unsigned int mb_len; unsigned int max_frame_len; - int sp; if (rxq == NULL) continue; - /* Calculate new maximum frame length according to MTU and - * toggle scattered support (sp) if necessary. */ + /* Calculate new maximum frame length according to MTU. */ max_frame_len = (priv->mtu + ETHER_HDR_LEN + (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); - mb_len = rte_pktmbuf_data_room_size(rxq->mp); - assert(mb_len >= RTE_PKTMBUF_HEADROOM); - sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM)); /* Provide new values to rxq_setup(). */ - dev->data->dev_conf.rxmode.jumbo_frame = sp; + dev->data->dev_conf.rxmode.jumbo_frame = + (max_frame_len > ETHER_MAX_LEN); dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; ret = rxq_rehash(dev, rxq); if (ret) { @@ -5215,6 +5073,55 @@ mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) return -ret; } +const struct rte_flow_ops mlx4_flow_ops = { + .validate = mlx4_flow_validate, + .create = mlx4_flow_create, + .destroy = mlx4_flow_destroy, + .flush = mlx4_flow_flush, + .query = NULL, +}; + +/** + * Manage filter operations. + * + * @param dev + * Pointer to Ethernet device structure. + * @param filter_type + * Filter type. + * @param filter_op + * Operation to perform. + * @param arg + * Pointer to operation-specific structure. + * + * @return + * 0 on success, negative errno value on failure. + */ +static int +mlx4_dev_filter_ctrl(struct rte_eth_dev *dev, + enum rte_filter_type filter_type, + enum rte_filter_op filter_op, + void *arg) +{ + int ret = EINVAL; + + switch (filter_type) { + case RTE_ETH_FILTER_GENERIC: + if (filter_op != RTE_ETH_FILTER_GET) + return -EINVAL; + *(const void **)arg = &mlx4_flow_ops; + return 0; + case RTE_ETH_FILTER_FDIR: + DEBUG("%p: filter type FDIR is not supported by this PMD", + (void *)dev); + break; + default: + ERROR("%p: filter type (%d) not supported", + (void *)dev, filter_type); + break; + } + return -ret; +} + static const struct eth_dev_ops mlx4_dev_ops = { .dev_configure = mlx4_dev_configure, .dev_start = mlx4_dev_start, @@ -5249,6 +5156,7 @@ static const struct eth_dev_ops mlx4_dev_ops = { .mac_addr_add = mlx4_mac_addr_add, .mac_addr_set = mlx4_mac_addr_set, .mtu_set = mlx4_dev_set_mtu, + .filter_ctrl = mlx4_dev_filter_ctrl, }; /** @@ -5379,35 +5287,44 @@ mlx4_getenv_int(const char *name) static void mlx4_dev_link_status_handler(void *); static void -mlx4_dev_interrupt_handler(struct rte_intr_handle *, void *); +mlx4_dev_interrupt_handler(void *); /** - * Link status handler. + * Link/device status handler. * * @param priv * Pointer to private structure. * @param dev * Pointer to the rte_eth_dev structure. + * @param events + * Pointer to event flags holder. * * @return - * Nonzero if the callback process can be called immediately. + * Number of events */ static int -priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) +priv_dev_status_handler(struct priv *priv, struct rte_eth_dev *dev, + uint32_t *events) { struct ibv_async_event event; int port_change = 0; int ret = 0; + *events = 0; /* Read all message and acknowledge them. */ for (;;) { if (ibv_get_async_event(priv->ctx, &event)) break; - - if (event.event_type == IBV_EVENT_PORT_ACTIVE || - event.event_type == IBV_EVENT_PORT_ERR) + if ((event.event_type == IBV_EVENT_PORT_ACTIVE || + event.event_type == IBV_EVENT_PORT_ERR) && + (priv->intr_conf.lsc == 1)) { port_change = 1; - else + ret++; + } else if (event.event_type == IBV_EVENT_DEVICE_FATAL && + priv->intr_conf.rmv == 1) { + *events |= (1 << RTE_ETH_EVENT_INTR_RMV); + ret++; + } else DEBUG("event type %d on port %d not handled", event.event_type, event.element.port_num); ibv_ack_async_event(&event); @@ -5417,7 +5334,7 @@ priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) struct rte_eth_link *link = &dev->data->dev_link; priv->pending_alarm = 0; - mlx4_link_update_unlocked(dev, 0); + mlx4_link_update(dev, 0); if (((link->link_speed == 0) && link->link_status) || ((link->link_speed != 0) && !link->link_status)) { /* Inconsistent status, check again later. */ @@ -5425,8 +5342,9 @@ priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev) rte_eal_alarm_set(MLX4_ALARM_TIMEOUT_US, mlx4_dev_link_status_handler, dev); - } else - ret = 1; + } else { + *events |= (1 << RTE_ETH_EVENT_INTR_LSC); + } } return ret; } @@ -5442,13 +5360,14 @@ mlx4_dev_link_status_handler(void *arg) { struct rte_eth_dev *dev = arg; struct priv *priv = dev->data->dev_private; + uint32_t events; int ret; priv_lock(priv); assert(priv->pending_alarm == 1); - ret = priv_dev_link_status_handler(priv, dev); + ret = priv_dev_status_handler(priv, dev, &events); priv_unlock(priv); - if (ret) + if (ret > 0 && events & (1 << RTE_ETH_EVENT_INTR_LSC)) _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); } @@ -5461,18 +5380,31 @@ mlx4_dev_link_status_handler(void *arg) * Callback argument. */ static void -mlx4_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg) +mlx4_dev_interrupt_handler(void *cb_arg) { struct rte_eth_dev *dev = cb_arg; struct priv *priv = dev->data->dev_private; int ret; + uint32_t ev; + int i; - (void)intr_handle; priv_lock(priv); - ret = priv_dev_link_status_handler(priv, dev); + ret = priv_dev_status_handler(priv, dev, &ev); priv_unlock(priv); - if (ret) - _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); + if (ret > 0) { + for (i = RTE_ETH_EVENT_UNKNOWN; + i < RTE_ETH_EVENT_MAX; + i++) { + if (ev & (1 << i)) { + ev &= ~(1 << i); + _rte_eth_dev_callback_process(dev, i, NULL); + ret--; + } + } + if (ret) + WARN("%d event%s not processed", ret, + (ret > 1 ? "s were" : " was")); + } } /** @@ -5482,20 +5414,30 @@ mlx4_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg) * Pointer to private structure. * @param dev * Pointer to the rte_eth_dev structure. + * @return + * 0 on success, negative errno value on failure. */ -static void +static int priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) { - if (!dev->data->dev_conf.intr_conf.lsc) - return; - rte_intr_callback_unregister(&priv->intr_handle, - mlx4_dev_interrupt_handler, - dev); - if (priv->pending_alarm) - rte_eal_alarm_cancel(mlx4_dev_link_status_handler, dev); - priv->pending_alarm = 0; + int ret; + + if (priv->intr_conf.lsc || + priv->intr_conf.rmv) + return 0; + ret = rte_intr_callback_unregister(&priv->intr_handle, + mlx4_dev_interrupt_handler, + dev); + if (ret < 0) { + ERROR("rte_intr_callback_unregister failed with %d" + "%s%s%s", ret, + (errno ? " (errno: " : ""), + (errno ? strerror(errno) : ""), + (errno ? ")" : "")); + } priv->intr_handle.fd = 0; priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + return ret; } /** @@ -5505,30 +5447,229 @@ priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev) * Pointer to private structure. * @param dev * Pointer to the rte_eth_dev structure. + * @return + * 0 on success, negative errno value on failure. */ -static void -priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev) +static int +priv_dev_interrupt_handler_install(struct priv *priv, + struct rte_eth_dev *dev) { - int rc, flags; + int flags; + int rc; - if (!dev->data->dev_conf.intr_conf.lsc) - return; + /* Check whether the interrupt handler has already been installed + * for either type of interrupt + */ + if (priv->intr_conf.lsc && + priv->intr_conf.rmv && + priv->intr_handle.fd) + return 0; assert(priv->ctx->async_fd > 0); flags = fcntl(priv->ctx->async_fd, F_GETFL); rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK); if (rc < 0) { INFO("failed to change file descriptor async event queue"); dev->data->dev_conf.intr_conf.lsc = 0; + dev->data->dev_conf.intr_conf.rmv = 0; + return -errno; } else { priv->intr_handle.fd = priv->ctx->async_fd; priv->intr_handle.type = RTE_INTR_HANDLE_EXT; - rte_intr_callback_register(&priv->intr_handle, - mlx4_dev_interrupt_handler, - dev); + rc = rte_intr_callback_register(&priv->intr_handle, + mlx4_dev_interrupt_handler, + dev); + if (rc) { + ERROR("rte_intr_callback_register failed " + " (errno: %s)", strerror(errno)); + return rc; + } + } + return 0; +} + +/** + * Uninstall interrupt handler. + * + * @param priv + * Pointer to private structure. + * @param dev + * Pointer to the rte_eth_dev structure. + * @return + * 0 on success, negative value on error. + */ +static int +priv_dev_removal_interrupt_handler_uninstall(struct priv *priv, + struct rte_eth_dev *dev) +{ + if (dev->data->dev_conf.intr_conf.rmv) { + priv->intr_conf.rmv = 0; + return priv_dev_interrupt_handler_uninstall(priv, dev); + } + return 0; +} + +/** + * Uninstall interrupt handler. + * + * @param priv + * Pointer to private structure. + * @param dev + * Pointer to the rte_eth_dev structure. + * @return + * 0 on success, negative value on error, + */ +static int +priv_dev_link_interrupt_handler_uninstall(struct priv *priv, + struct rte_eth_dev *dev) +{ + int ret = 0; + + if (dev->data->dev_conf.intr_conf.lsc) { + priv->intr_conf.lsc = 0; + ret = priv_dev_interrupt_handler_uninstall(priv, dev); + if (ret) + return ret; + } + if (priv->pending_alarm) + if (rte_eal_alarm_cancel(mlx4_dev_link_status_handler, + dev)) { + ERROR("rte_eal_alarm_cancel failed " + " (errno: %s)", strerror(rte_errno)); + return -rte_errno; + } + priv->pending_alarm = 0; + return 0; +} + +/** + * Install link interrupt handler. + * + * @param priv + * Pointer to private structure. + * @param dev + * Pointer to the rte_eth_dev structure. + * @return + * 0 on success, negative value on error. + */ +static int +priv_dev_link_interrupt_handler_install(struct priv *priv, + struct rte_eth_dev *dev) +{ + int ret; + + if (dev->data->dev_conf.intr_conf.lsc) { + ret = priv_dev_interrupt_handler_install(priv, dev); + if (ret) + return ret; + priv->intr_conf.lsc = 1; + } + return 0; +} + +/** + * Install removal interrupt handler. + * + * @param priv + * Pointer to private structure. + * @param dev + * Pointer to the rte_eth_dev structure. + * @return + * 0 on success, negative value on error. + */ +static int +priv_dev_removal_interrupt_handler_install(struct priv *priv, + struct rte_eth_dev *dev) +{ + int ret; + + if (dev->data->dev_conf.intr_conf.rmv) { + ret = priv_dev_interrupt_handler_install(priv, dev); + if (ret) + return ret; + priv->intr_conf.rmv = 1; + } + return 0; +} + +/** + * Verify and store value for device argument. + * + * @param[in] key + * Key argument to verify. + * @param[in] val + * Value associated with key. + * @param out + * User data. + * + * @return + * 0 on success, negative errno value on failure. + */ +static int +mlx4_arg_parse(const char *key, const char *val, void *out) +{ + struct mlx4_conf *conf = out; + unsigned long tmp; + + errno = 0; + tmp = strtoul(val, NULL, 0); + if (errno) { + WARN("%s: \"%s\" is not a valid integer", key, val); + return -errno; + } + if (strcmp(MLX4_PMD_PORT_KVARG, key) == 0) { + if (tmp >= MLX4_PMD_MAX_PHYS_PORTS) { + ERROR("invalid port index %lu (max: %u)", + tmp, MLX4_PMD_MAX_PHYS_PORTS - 1); + return -EINVAL; + } + conf->active_ports |= 1 << tmp; + } else { + WARN("%s: unknown parameter", key); + return -EINVAL; } + return 0; } -static struct eth_driver mlx4_driver; +/** + * Parse device parameters. + * + * @param devargs + * Device arguments structure. + * + * @return + * 0 on success, negative errno value on failure. + */ +static int +mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf) +{ + struct rte_kvargs *kvlist; + unsigned int arg_count; + int ret = 0; + int i; + + if (devargs == NULL) + return 0; + kvlist = rte_kvargs_parse(devargs->args, pmd_mlx4_init_params); + if (kvlist == NULL) { + ERROR("failed to parse kvargs"); + return -EINVAL; + } + /* Process parameters. */ + for (i = 0; pmd_mlx4_init_params[i]; ++i) { + arg_count = rte_kvargs_count(kvlist, MLX4_PMD_PORT_KVARG); + while (arg_count-- > 0) { + ret = rte_kvargs_process(kvlist, MLX4_PMD_PORT_KVARG, + mlx4_arg_parse, conf); + if (ret != 0) + goto free_kvlist; + } + } +free_kvlist: + rte_kvargs_free(kvlist); + return ret; +} + +static struct rte_pci_driver mlx4_driver; /** * DPDK callback to register a PCI device. @@ -5552,12 +5693,15 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) int err = 0; struct ibv_context *attr_ctx = NULL; struct ibv_device_attr device_attr; + struct mlx4_conf conf = { + .active_ports = 0, + }; unsigned int vf; int idx; int i; (void)pci_drv; - assert(pci_drv == &mlx4_driver.pci_drv); + assert(pci_drv == &mlx4_driver); /* Get mlx4_dev[] index. */ idx = mlx4_dev_idx(&pci_dev->addr); if (idx == -1) { @@ -5571,10 +5715,8 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) list = ibv_get_device_list(&i); if (list == NULL) { assert(errno); - if (errno == ENOSYS) { - WARN("cannot list devices, is ib_uverbs loaded?"); - return 0; - } + if (errno == ENOSYS) + ERROR("cannot list devices, is ib_uverbs loaded?"); return -errno; } assert(i >= 0); @@ -5606,11 +5748,11 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) ibv_free_device_list(list); switch (err) { case 0: - WARN("cannot access device, is mlx4_ib loaded?"); - return 0; + ERROR("cannot access device, is mlx4_ib loaded?"); + return -ENODEV; case EINVAL: - WARN("cannot use device, are drivers up to date?"); - return 0; + ERROR("cannot use device, are drivers up to date?"); + return -EINVAL; } assert(err > 0); return -err; @@ -5622,6 +5764,15 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) goto error; INFO("%u port(s) detected", device_attr.phys_port_cnt); + if (mlx4_args(pci_dev->device.devargs, &conf)) { + ERROR("failed to process device arguments"); + goto error; + } + /* Use all ports when none are defined */ + if (conf.active_ports == 0) { + for (i = 0; i < MLX4_PMD_MAX_PHYS_PORTS; i++) + conf.active_ports |= 1 << i; + } for (i = 0; i < device_attr.phys_port_cnt; i++) { uint32_t port = i + 1; /* ports are indexed from one */ uint32_t test = (1 << i); @@ -5635,6 +5786,9 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) #endif /* HAVE_EXP_QUERY_DEVICE */ struct ether_addr mac; + /* If port is not active, skip. */ + if (!(conf.active_ports & (1 << i))) + continue; #ifdef HAVE_EXP_QUERY_DEVICE exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS; #ifdef RSS_SUPPORT @@ -5840,23 +5994,23 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) eth_dev->rx_pkt_burst = mlx4_rx_burst_secondary_setup; } else { eth_dev->data->dev_private = priv; - eth_dev->data->rx_mbuf_alloc_failed = 0; - eth_dev->data->mtu = ETHER_MTU; eth_dev->data->mac_addrs = priv->mac; } - eth_dev->pci_dev = pci_dev; + eth_dev->device = &pci_dev->device; rte_eth_copy_pci_info(eth_dev, pci_dev); - eth_dev->driver = &mlx4_driver; + eth_dev->device->driver = &mlx4_driver.driver; priv->dev = eth_dev; eth_dev->dev_ops = &mlx4_dev_ops; - TAILQ_INIT(ð_dev->link_intr_cbs); /* Bring Ethernet device up. */ DEBUG("forcing Ethernet interface up"); priv_set_flags(priv, ~IFF_UP, IFF_UP); + /* Update link status once if waiting for LSC. */ + if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) + mlx4_link_update(eth_dev, 0); continue; port_error: @@ -5910,16 +6064,14 @@ static const struct rte_pci_id mlx4_pci_id_map[] = { } }; -static struct eth_driver mlx4_driver = { - .pci_drv = { - .driver = { - .name = MLX4_DRIVER_NAME - }, - .id_table = mlx4_pci_id_map, - .probe = mlx4_pci_probe, - .drv_flags = RTE_PCI_DRV_INTR_LSC, +static struct rte_pci_driver mlx4_driver = { + .driver = { + .name = MLX4_DRIVER_NAME }, - .dev_private_size = sizeof(struct priv) + .id_table = mlx4_pci_id_map, + .probe = mlx4_pci_probe, + .drv_flags = RTE_PCI_DRV_INTR_LSC | + RTE_PCI_DRV_INTR_RMV, }; /** @@ -5938,8 +6090,10 @@ rte_mlx4_pmd_init(void) */ setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); ibv_fork_init(); - rte_eal_pci_register(&mlx4_driver.pci_drv); + rte_pci_register(&mlx4_driver); } RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__); RTE_PMD_REGISTER_PCI_TABLE(net_mlx4, mlx4_pci_id_map); +RTE_PMD_REGISTER_KMOD_DEP(net_mlx4, + "* ib_uverbs & mlx4_en & mlx4_core & mlx4_ib"); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 4c7505e2..9a3bae90 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -1,8 +1,8 @@ /*- * BSD LICENSE * - * Copyright 2012-2015 6WIND S.A. - * Copyright 2012 Mellanox. + * Copyright 2012-2017 6WIND S.A. + * Copyright 2012-2017 Mellanox. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -39,6 +39,33 @@ #include <limits.h> /* + * Runtime logging through RTE_LOG() is enabled when not in debugging mode. + * Intermediate LOG_*() macros add the required end-of-line characters. + */ +#ifndef NDEBUG +#define INFO(...) DEBUG(__VA_ARGS__) +#define WARN(...) DEBUG(__VA_ARGS__) +#define ERROR(...) DEBUG(__VA_ARGS__) +#else +#define LOG__(level, m, ...) \ + RTE_LOG(level, PMD, MLX4_DRIVER_NAME ": " m "%c", __VA_ARGS__) +#define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n') +#define INFO(...) LOG_(INFO, __VA_ARGS__) +#define WARN(...) LOG_(WARNING, __VA_ARGS__) +#define ERROR(...) LOG_(ERR, __VA_ARGS__) +#endif + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +/* * Maximum number of simultaneous MAC addresses supported. * * According to ConnectX's Programmer Reference Manual: @@ -54,6 +81,9 @@ /* Request send completion once in every 64 sends, might be less. */ #define MLX4_PMD_TX_PER_COMP_REQ 64 +/* Maximum number of physical ports. */ +#define MLX4_PMD_MAX_PHYS_PORTS 2 + /* Maximum number of Scatter/Gather Elements per Work Request. */ #ifndef MLX4_PMD_SGE_WR_N #define MLX4_PMD_SGE_WR_N 4 @@ -86,6 +116,9 @@ /* Alarm timeout. */ #define MLX4_ALARM_TIMEOUT_US 100000 +/* Port parameter. */ +#define MLX4_PMD_PORT_KVARG "port" + enum { PCI_VENDOR_ID_MELLANOX = 0x15b3, }; @@ -160,4 +193,165 @@ enum { #define claim_positive(...) (__VA_ARGS__) #endif /* NDEBUG */ +struct mlx4_rxq_stats { + unsigned int idx; /**< Mapping index. */ +#ifdef MLX4_PMD_SOFT_COUNTERS + uint64_t ipackets; /**< Total of successfully received packets. */ + uint64_t ibytes; /**< Total of successfully received bytes. */ +#endif + uint64_t idropped; /**< Total of packets dropped when RX ring full. */ + uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */ +}; + +/* RX element (scattered packets). */ +struct rxq_elt_sp { + struct ibv_recv_wr wr; /* Work Request. */ + struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */ + struct rte_mbuf *bufs[MLX4_PMD_SGE_WR_N]; /* SGEs buffers. */ +}; + +/* RX element. */ +struct rxq_elt { + struct ibv_recv_wr wr; /* Work Request. */ + struct ibv_sge sge; /* Scatter/Gather Element. */ + /* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */ +}; + +/* RX queue descriptor. */ +struct rxq { + struct priv *priv; /* Back pointer to private data. */ + struct rte_mempool *mp; /* Memory Pool for allocations. */ + struct ibv_mr *mr; /* Memory Region (for mp). */ + struct ibv_cq *cq; /* Completion Queue. */ + struct ibv_qp *qp; /* Queue Pair. */ + struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ + struct ibv_exp_cq_family *if_cq; /* CQ interface. */ + /* + * Each VLAN ID requires a separate flow steering rule. + */ + BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES); + struct ibv_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES][MLX4_MAX_VLAN_IDS]; + struct ibv_flow *promisc_flow; /* Promiscuous flow. */ + struct ibv_flow *allmulti_flow; /* Multicast flow. */ + unsigned int port_id; /* Port ID for incoming packets. */ + unsigned int elts_n; /* (*elts)[] length. */ + unsigned int elts_head; /* Current index in (*elts)[]. */ + union { + struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */ + struct rxq_elt (*no_sp)[]; /* RX elements. */ + } elts; + unsigned int sp:1; /* Use scattered RX elements. */ + unsigned int csum:1; /* Enable checksum offloading. */ + unsigned int csum_l2tun:1; /* Same for L2 tunnels. */ + struct mlx4_rxq_stats stats; /* RX queue counters. */ + unsigned int socket; /* CPU socket ID for allocations. */ + struct ibv_exp_res_domain *rd; /* Resource Domain. */ +}; + +/* TX element. */ +struct txq_elt { + struct rte_mbuf *buf; +}; + +struct mlx4_txq_stats { + unsigned int idx; /**< Mapping index. */ +#ifdef MLX4_PMD_SOFT_COUNTERS + uint64_t opackets; /**< Total of successfully sent packets. */ + uint64_t obytes; /**< Total of successfully sent bytes. */ +#endif + uint64_t odropped; /**< Total of packets not sent when TX ring full. */ +}; + +/* + * Linear buffer type. It is used when transmitting buffers with too many + * segments that do not fit the hardware queue (see max_send_sge). + * Extra segments are copied (linearized) in such buffers, replacing the + * last SGE during TX. + * The size is arbitrary but large enough to hold a jumbo frame with + * 8 segments considering mbuf.buf_len is about 2048 bytes. + */ +typedef uint8_t linear_t[16384]; + +/* TX queue descriptor. */ +struct txq { + struct priv *priv; /* Back pointer to private data. */ + struct { + const struct rte_mempool *mp; /* Cached Memory Pool. */ + struct ibv_mr *mr; /* Memory Region (for mp). */ + uint32_t lkey; /* mr->lkey */ + } mp2mr[MLX4_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ + struct ibv_cq *cq; /* Completion Queue. */ + struct ibv_qp *qp; /* Queue Pair. */ + struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ + struct ibv_exp_cq_family *if_cq; /* CQ interface. */ +#if MLX4_PMD_MAX_INLINE > 0 + uint32_t max_inline; /* Max inline send size <= MLX4_PMD_MAX_INLINE. */ +#endif + unsigned int elts_n; /* (*elts)[] length. */ + struct txq_elt (*elts)[]; /* TX elements. */ + unsigned int elts_head; /* Current index in (*elts)[]. */ + unsigned int elts_tail; /* First element awaiting completion. */ + unsigned int elts_comp; /* Number of completion requests. */ + unsigned int elts_comp_cd; /* Countdown for next completion request. */ + unsigned int elts_comp_cd_init; /* Initial value for countdown. */ + struct mlx4_txq_stats stats; /* TX queue counters. */ + linear_t (*elts_linear)[]; /* Linearized buffers. */ + struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */ + unsigned int socket; /* CPU socket ID for allocations. */ + struct ibv_exp_res_domain *rd; /* Resource Domain. */ +}; + +struct rte_flow; + +struct priv { + struct rte_eth_dev *dev; /* Ethernet device. */ + struct ibv_context *ctx; /* Verbs context. */ + struct ibv_device_attr device_attr; /* Device properties. */ + struct ibv_pd *pd; /* Protection Domain. */ + /* + * MAC addresses array and configuration bit-field. + * An extra entry that cannot be modified by the DPDK is reserved + * for broadcast frames (destination MAC address ff:ff:ff:ff:ff:ff). + */ + struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES]; + BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES); + /* VLAN filters. */ + struct { + unsigned int enabled:1; /* If enabled. */ + unsigned int id:12; /* VLAN ID (0-4095). */ + } vlan_filter[MLX4_MAX_VLAN_IDS]; /* VLAN filters table. */ + /* Device properties. */ + uint16_t mtu; /* Configured MTU. */ + uint8_t port; /* Physical port number. */ + unsigned int started:1; /* Device started, flows enabled. */ + unsigned int promisc:1; /* Device in promiscuous mode. */ + unsigned int allmulti:1; /* Device receives all multicast packets. */ + unsigned int hw_qpg:1; /* QP groups are supported. */ + unsigned int hw_tss:1; /* TSS is supported. */ + unsigned int hw_rss:1; /* RSS is supported. */ + unsigned int hw_csum:1; /* Checksum offload is supported. */ + unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */ + unsigned int rss:1; /* RSS is enabled. */ + unsigned int vf:1; /* This is a VF device. */ + unsigned int pending_alarm:1; /* An alarm is pending. */ +#ifdef INLINE_RECV + unsigned int inl_recv_size; /* Inline recv size */ +#endif + unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */ + /* RX/TX queues. */ + struct rxq rxq_parent; /* Parent queue when RSS is enabled. */ + unsigned int rxqs_n; /* RX queues array size. */ + unsigned int txqs_n; /* TX queues array size. */ + struct rxq *(*rxqs)[]; /* RX queues. */ + struct txq *(*txqs)[]; /* TX queues. */ + struct rte_intr_handle intr_handle; /* Interrupt handler. */ + struct rte_flow_drop *flow_drop_queue; /* Flow drop queue. */ + LIST_HEAD(mlx4_flows, rte_flow) flows; + struct rte_intr_conf intr_conf; /* Active interrupt configuration. */ + rte_spinlock_t lock; /* Lock for control functions. */ +}; + +void priv_lock(struct priv *priv); +void priv_unlock(struct priv *priv); + #endif /* RTE_PMD_MLX4_H_ */ diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c new file mode 100644 index 00000000..edfac038 --- /dev/null +++ b/drivers/net/mlx4/mlx4_flow.c @@ -0,0 +1,1090 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <assert.h> + +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> + +/* Generated configuration header. */ +#include "mlx4_autoconf.h" + +/* PMD headers. */ +#include "mlx4.h" +#include "mlx4_flow.h" + +/** Static initializer for items. */ +#define ITEMS(...) \ + (const enum rte_flow_item_type []){ \ + __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \ + } + +/** Structure to generate a simple graph of layers supported by the NIC. */ +struct mlx4_flow_items { + /** List of possible actions for these items. */ + const enum rte_flow_action_type *const actions; + /** Bit-masks corresponding to the possibilities for the item. */ + const void *mask; + /** + * Default bit-masks to use when item->mask is not provided. When + * \default_mask is also NULL, the full supported bit-mask (\mask) is + * used instead. + */ + const void *default_mask; + /** Bit-masks size in bytes. */ + const unsigned int mask_sz; + /** + * Check support for a given item. + * + * @param item[in] + * Item specification. + * @param mask[in] + * Bit-masks covering supported fields to compare with spec, + * last and mask in + * \item. + * @param size + * Bit-Mask size in bytes. + * + * @return + * 0 on success, negative value otherwise. + */ + int (*validate)(const struct rte_flow_item *item, + const uint8_t *mask, unsigned int size); + /** + * Conversion function from rte_flow to NIC specific flow. + * + * @param item + * rte_flow item to convert. + * @param default_mask + * Default bit-masks to use when item->mask is not provided. + * @param data + * Internal structure to store the conversion. + * + * @return + * 0 on success, negative value otherwise. + */ + int (*convert)(const struct rte_flow_item *item, + const void *default_mask, + void *data); + /** Size in bytes of the destination structure. */ + const unsigned int dst_sz; + /** List of possible following items. */ + const enum rte_flow_item_type *const items; +}; + +struct rte_flow_drop { + struct ibv_qp *qp; /**< Verbs queue pair. */ + struct ibv_cq *cq; /**< Verbs completion queue. */ +}; + +/** Valid action for this PMD. */ +static const enum rte_flow_action_type valid_actions[] = { + RTE_FLOW_ACTION_TYPE_DROP, + RTE_FLOW_ACTION_TYPE_QUEUE, + RTE_FLOW_ACTION_TYPE_END, +}; + +/** + * Convert Ethernet item to Verbs specification. + * + * @param item[in] + * Item specification. + * @param default_mask[in] + * Default bit-masks to use when item->mask is not provided. + * @param data[in, out] + * User structure. + */ +static int +mlx4_flow_create_eth(const struct rte_flow_item *item, + const void *default_mask, + void *data) +{ + const struct rte_flow_item_eth *spec = item->spec; + const struct rte_flow_item_eth *mask = item->mask; + struct mlx4_flow *flow = (struct mlx4_flow *)data; + struct ibv_flow_spec_eth *eth; + const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth); + unsigned int i; + + ++flow->ibv_attr->num_of_specs; + flow->ibv_attr->priority = 2; + eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + *eth = (struct ibv_flow_spec_eth) { + .type = IBV_FLOW_SPEC_ETH, + .size = eth_size, + }; + if (!spec) { + flow->ibv_attr->type = IBV_FLOW_ATTR_ALL_DEFAULT; + return 0; + } + if (!mask) + mask = default_mask; + memcpy(eth->val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN); + memcpy(eth->val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN); + memcpy(eth->mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN); + memcpy(eth->mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN); + /* Remove unwanted bits from values. */ + for (i = 0; i < ETHER_ADDR_LEN; ++i) { + eth->val.dst_mac[i] &= eth->mask.dst_mac[i]; + eth->val.src_mac[i] &= eth->mask.src_mac[i]; + } + return 0; +} + +/** + * Convert VLAN item to Verbs specification. + * + * @param item[in] + * Item specification. + * @param default_mask[in] + * Default bit-masks to use when item->mask is not provided. + * @param data[in, out] + * User structure. + */ +static int +mlx4_flow_create_vlan(const struct rte_flow_item *item, + const void *default_mask, + void *data) +{ + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = item->mask; + struct mlx4_flow *flow = (struct mlx4_flow *)data; + struct ibv_flow_spec_eth *eth; + const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth); + + eth = (void *)((uintptr_t)flow->ibv_attr + flow->offset - eth_size); + if (!spec) + return 0; + if (!mask) + mask = default_mask; + eth->val.vlan_tag = spec->tci; + eth->mask.vlan_tag = mask->tci; + eth->val.vlan_tag &= eth->mask.vlan_tag; + return 0; +} + +/** + * Convert IPv4 item to Verbs specification. + * + * @param item[in] + * Item specification. + * @param default_mask[in] + * Default bit-masks to use when item->mask is not provided. + * @param data[in, out] + * User structure. + */ +static int +mlx4_flow_create_ipv4(const struct rte_flow_item *item, + const void *default_mask, + void *data) +{ + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 *mask = item->mask; + struct mlx4_flow *flow = (struct mlx4_flow *)data; + struct ibv_flow_spec_ipv4 *ipv4; + unsigned int ipv4_size = sizeof(struct ibv_flow_spec_ipv4); + + ++flow->ibv_attr->num_of_specs; + flow->ibv_attr->priority = 1; + ipv4 = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + *ipv4 = (struct ibv_flow_spec_ipv4) { + .type = IBV_FLOW_SPEC_IPV4, + .size = ipv4_size, + }; + if (!spec) + return 0; + ipv4->val = (struct ibv_flow_ipv4_filter) { + .src_ip = spec->hdr.src_addr, + .dst_ip = spec->hdr.dst_addr, + }; + if (!mask) + mask = default_mask; + ipv4->mask = (struct ibv_flow_ipv4_filter) { + .src_ip = mask->hdr.src_addr, + .dst_ip = mask->hdr.dst_addr, + }; + /* Remove unwanted bits from values. */ + ipv4->val.src_ip &= ipv4->mask.src_ip; + ipv4->val.dst_ip &= ipv4->mask.dst_ip; + return 0; +} + +/** + * Convert UDP item to Verbs specification. + * + * @param item[in] + * Item specification. + * @param default_mask[in] + * Default bit-masks to use when item->mask is not provided. + * @param data[in, out] + * User structure. + */ +static int +mlx4_flow_create_udp(const struct rte_flow_item *item, + const void *default_mask, + void *data) +{ + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = item->mask; + struct mlx4_flow *flow = (struct mlx4_flow *)data; + struct ibv_flow_spec_tcp_udp *udp; + unsigned int udp_size = sizeof(struct ibv_flow_spec_tcp_udp); + + ++flow->ibv_attr->num_of_specs; + flow->ibv_attr->priority = 0; + udp = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + *udp = (struct ibv_flow_spec_tcp_udp) { + .type = IBV_FLOW_SPEC_UDP, + .size = udp_size, + }; + if (!spec) + return 0; + udp->val.dst_port = spec->hdr.dst_port; + udp->val.src_port = spec->hdr.src_port; + if (!mask) + mask = default_mask; + udp->mask.dst_port = mask->hdr.dst_port; + udp->mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + udp->val.src_port &= udp->mask.src_port; + udp->val.dst_port &= udp->mask.dst_port; + return 0; +} + +/** + * Convert TCP item to Verbs specification. + * + * @param item[in] + * Item specification. + * @param default_mask[in] + * Default bit-masks to use when item->mask is not provided. + * @param data[in, out] + * User structure. + */ +static int +mlx4_flow_create_tcp(const struct rte_flow_item *item, + const void *default_mask, + void *data) +{ + const struct rte_flow_item_tcp *spec = item->spec; + const struct rte_flow_item_tcp *mask = item->mask; + struct mlx4_flow *flow = (struct mlx4_flow *)data; + struct ibv_flow_spec_tcp_udp *tcp; + unsigned int tcp_size = sizeof(struct ibv_flow_spec_tcp_udp); + + ++flow->ibv_attr->num_of_specs; + flow->ibv_attr->priority = 0; + tcp = (void *)((uintptr_t)flow->ibv_attr + flow->offset); + *tcp = (struct ibv_flow_spec_tcp_udp) { + .type = IBV_FLOW_SPEC_TCP, + .size = tcp_size, + }; + if (!spec) + return 0; + tcp->val.dst_port = spec->hdr.dst_port; + tcp->val.src_port = spec->hdr.src_port; + if (!mask) + mask = default_mask; + tcp->mask.dst_port = mask->hdr.dst_port; + tcp->mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + tcp->val.src_port &= tcp->mask.src_port; + tcp->val.dst_port &= tcp->mask.dst_port; + return 0; +} + +/** + * Check support for a given item. + * + * @param item[in] + * Item specification. + * @param mask[in] + * Bit-masks covering supported fields to compare with spec, last and mask in + * \item. + * @param size + * Bit-Mask size in bytes. + * + * @return + * 0 on success, negative value otherwise. + */ +static int +mlx4_flow_item_validate(const struct rte_flow_item *item, + const uint8_t *mask, unsigned int size) +{ + int ret = 0; + + if (!item->spec && (item->mask || item->last)) + return -1; + if (item->spec && !item->mask) { + unsigned int i; + const uint8_t *spec = item->spec; + + for (i = 0; i < size; ++i) + if ((spec[i] | mask[i]) != mask[i]) + return -1; + } + if (item->last && !item->mask) { + unsigned int i; + const uint8_t *spec = item->last; + + for (i = 0; i < size; ++i) + if ((spec[i] | mask[i]) != mask[i]) + return -1; + } + if (item->spec && item->last) { + uint8_t spec[size]; + uint8_t last[size]; + const uint8_t *apply = mask; + unsigned int i; + + if (item->mask) + apply = item->mask; + for (i = 0; i < size; ++i) { + spec[i] = ((const uint8_t *)item->spec)[i] & apply[i]; + last[i] = ((const uint8_t *)item->last)[i] & apply[i]; + } + ret = memcmp(spec, last, size); + } + return ret; +} + +static int +mlx4_flow_validate_eth(const struct rte_flow_item *item, + const uint8_t *mask, unsigned int size) +{ + if (item->mask) { + const struct rte_flow_item_eth *mask = item->mask; + + if (mask->dst.addr_bytes[0] != 0xff || + mask->dst.addr_bytes[1] != 0xff || + mask->dst.addr_bytes[2] != 0xff || + mask->dst.addr_bytes[3] != 0xff || + mask->dst.addr_bytes[4] != 0xff || + mask->dst.addr_bytes[5] != 0xff) + return -1; + } + return mlx4_flow_item_validate(item, mask, size); +} + +static int +mlx4_flow_validate_vlan(const struct rte_flow_item *item, + const uint8_t *mask, unsigned int size) +{ + if (item->mask) { + const struct rte_flow_item_vlan *mask = item->mask; + + if (mask->tci != 0 && + ntohs(mask->tci) != 0x0fff) + return -1; + } + return mlx4_flow_item_validate(item, mask, size); +} + +static int +mlx4_flow_validate_ipv4(const struct rte_flow_item *item, + const uint8_t *mask, unsigned int size) +{ + if (item->mask) { + const struct rte_flow_item_ipv4 *mask = item->mask; + + if (mask->hdr.src_addr != 0 && + mask->hdr.src_addr != 0xffffffff) + return -1; + if (mask->hdr.dst_addr != 0 && + mask->hdr.dst_addr != 0xffffffff) + return -1; + } + return mlx4_flow_item_validate(item, mask, size); +} + +static int +mlx4_flow_validate_udp(const struct rte_flow_item *item, + const uint8_t *mask, unsigned int size) +{ + if (item->mask) { + const struct rte_flow_item_udp *mask = item->mask; + + if (mask->hdr.src_port != 0 && + mask->hdr.src_port != 0xffff) + return -1; + if (mask->hdr.dst_port != 0 && + mask->hdr.dst_port != 0xffff) + return -1; + } + return mlx4_flow_item_validate(item, mask, size); +} + +static int +mlx4_flow_validate_tcp(const struct rte_flow_item *item, + const uint8_t *mask, unsigned int size) +{ + if (item->mask) { + const struct rte_flow_item_tcp *mask = item->mask; + + if (mask->hdr.src_port != 0 && + mask->hdr.src_port != 0xffff) + return -1; + if (mask->hdr.dst_port != 0 && + mask->hdr.dst_port != 0xffff) + return -1; + } + return mlx4_flow_item_validate(item, mask, size); +} + +/** Graph of supported items and associated actions. */ +static const struct mlx4_flow_items mlx4_flow_items[] = { + [RTE_FLOW_ITEM_TYPE_END] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH), + }, + [RTE_FLOW_ITEM_TYPE_ETH] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_VLAN, + RTE_FLOW_ITEM_TYPE_IPV4), + .actions = valid_actions, + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + .default_mask = &rte_flow_item_eth_mask, + .mask_sz = sizeof(struct rte_flow_item_eth), + .validate = mlx4_flow_validate_eth, + .convert = mlx4_flow_create_eth, + .dst_sz = sizeof(struct ibv_flow_spec_eth), + }, + [RTE_FLOW_ITEM_TYPE_VLAN] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4), + .actions = valid_actions, + .mask = &(const struct rte_flow_item_vlan){ + /* rte_flow_item_vlan_mask is invalid for mlx4. */ +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + .tci = 0x0fff, +#else + .tci = 0xff0f, +#endif + }, + .mask_sz = sizeof(struct rte_flow_item_vlan), + .validate = mlx4_flow_validate_vlan, + .convert = mlx4_flow_create_vlan, + .dst_sz = 0, + }, + [RTE_FLOW_ITEM_TYPE_IPV4] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, + RTE_FLOW_ITEM_TYPE_TCP), + .actions = valid_actions, + .mask = &(const struct rte_flow_item_ipv4){ + .hdr = { + .src_addr = -1, + .dst_addr = -1, + }, + }, + .default_mask = &rte_flow_item_ipv4_mask, + .mask_sz = sizeof(struct rte_flow_item_ipv4), + .validate = mlx4_flow_validate_ipv4, + .convert = mlx4_flow_create_ipv4, + .dst_sz = sizeof(struct ibv_flow_spec_ipv4), + }, + [RTE_FLOW_ITEM_TYPE_UDP] = { + .actions = valid_actions, + .mask = &(const struct rte_flow_item_udp){ + .hdr = { + .src_port = -1, + .dst_port = -1, + }, + }, + .default_mask = &rte_flow_item_udp_mask, + .mask_sz = sizeof(struct rte_flow_item_udp), + .validate = mlx4_flow_validate_udp, + .convert = mlx4_flow_create_udp, + .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), + }, + [RTE_FLOW_ITEM_TYPE_TCP] = { + .actions = valid_actions, + .mask = &(const struct rte_flow_item_tcp){ + .hdr = { + .src_port = -1, + .dst_port = -1, + }, + }, + .default_mask = &rte_flow_item_tcp_mask, + .mask_sz = sizeof(struct rte_flow_item_tcp), + .validate = mlx4_flow_validate_tcp, + .convert = mlx4_flow_create_tcp, + .dst_sz = sizeof(struct ibv_flow_spec_tcp_udp), + }, +}; + +/** + * Validate a flow supported by the NIC. + * + * @param priv + * Pointer to private structure. + * @param[in] attr + * Flow rule attributes. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] error + * Perform verbose error reporting if not NULL. + * @param[in, out] flow + * Flow structure to update. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +priv_flow_validate(struct priv *priv, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error, + struct mlx4_flow *flow) +{ + const struct mlx4_flow_items *cur_item = mlx4_flow_items; + struct mlx4_flow_action action = { + .queue = 0, + .drop = 0, + }; + + (void)priv; + if (attr->group) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, + "groups are not supported"); + return -rte_errno; + } + if (attr->priority) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, + "priorities are not supported"); + return -rte_errno; + } + if (attr->egress) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, + "egress is not supported"); + return -rte_errno; + } + if (!attr->ingress) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "only ingress is supported"); + return -rte_errno; + } + /* Go over items list. */ + for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) { + const struct mlx4_flow_items *token = NULL; + unsigned int i; + int err; + + if (items->type == RTE_FLOW_ITEM_TYPE_VOID) + continue; + /* + * The nic can support patterns with NULL eth spec only + * if eth is a single item in a rule. + */ + if (!items->spec && + items->type == RTE_FLOW_ITEM_TYPE_ETH) { + const struct rte_flow_item *next = items + 1; + + if (next->type != RTE_FLOW_ITEM_TYPE_END) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "the rule requires" + " an Ethernet spec"); + return -rte_errno; + } + } + for (i = 0; + cur_item->items && + cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END; + ++i) { + if (cur_item->items[i] == items->type) { + token = &mlx4_flow_items[items->type]; + break; + } + } + if (!token) + goto exit_item_not_supported; + cur_item = token; + err = cur_item->validate(items, + (const uint8_t *)cur_item->mask, + cur_item->mask_sz); + if (err) + goto exit_item_not_supported; + if (flow->ibv_attr && cur_item->convert) { + err = cur_item->convert(items, + (cur_item->default_mask ? + cur_item->default_mask : + cur_item->mask), + flow); + if (err) + goto exit_item_not_supported; + } + flow->offset += cur_item->dst_sz; + } + /* Go over actions list */ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { + if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { + continue; + } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { + action.drop = 1; + } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { + const struct rte_flow_action_queue *queue = + (const struct rte_flow_action_queue *) + actions->conf; + + if (!queue || (queue->index > (priv->rxqs_n - 1))) + goto exit_action_not_supported; + action.queue = 1; + } else { + goto exit_action_not_supported; + } + } + if (!action.queue && !action.drop) { + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "no valid action"); + return -rte_errno; + } + return 0; +exit_item_not_supported: + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + items, "item not supported"); + return -rte_errno; +exit_action_not_supported: + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + actions, "action not supported"); + return -rte_errno; +} + +/** + * Validate a flow supported by the NIC. + * + * @see rte_flow_validate() + * @see rte_flow_ops + */ +int +mlx4_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + int ret; + struct mlx4_flow flow = { .offset = sizeof(struct ibv_flow_attr) }; + + priv_lock(priv); + ret = priv_flow_validate(priv, attr, items, actions, error, &flow); + priv_unlock(priv); + return ret; +} + +/** + * Destroy a drop queue. + * + * @param priv + * Pointer to private structure. + */ +static void +mlx4_flow_destroy_drop_queue(struct priv *priv) +{ + if (priv->flow_drop_queue) { + struct rte_flow_drop *fdq = priv->flow_drop_queue; + + priv->flow_drop_queue = NULL; + claim_zero(ibv_destroy_qp(fdq->qp)); + claim_zero(ibv_destroy_cq(fdq->cq)); + rte_free(fdq); + } +} + +/** + * Create a single drop queue for all drop flows. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, negative value otherwise. + */ +static int +mlx4_flow_create_drop_queue(struct priv *priv) +{ + struct ibv_qp *qp; + struct ibv_cq *cq; + struct rte_flow_drop *fdq; + + fdq = rte_calloc(__func__, 1, sizeof(*fdq), 0); + if (!fdq) { + ERROR("Cannot allocate memory for drop struct"); + goto err; + } + cq = ibv_exp_create_cq(priv->ctx, 1, NULL, NULL, 0, + &(struct ibv_exp_cq_init_attr){ + .comp_mask = 0, + }); + if (!cq) { + ERROR("Cannot create drop CQ"); + goto err_create_cq; + } + qp = ibv_exp_create_qp(priv->ctx, + &(struct ibv_exp_qp_init_attr){ + .send_cq = cq, + .recv_cq = cq, + .cap = { + .max_recv_wr = 1, + .max_recv_sge = 1, + }, + .qp_type = IBV_QPT_RAW_PACKET, + .comp_mask = + IBV_EXP_QP_INIT_ATTR_PD | + IBV_EXP_QP_INIT_ATTR_PORT, + .pd = priv->pd, + .port_num = priv->port, + }); + if (!qp) { + ERROR("Cannot create drop QP"); + goto err_create_qp; + } + *fdq = (struct rte_flow_drop){ + .qp = qp, + .cq = cq, + }; + priv->flow_drop_queue = fdq; + return 0; +err_create_qp: + claim_zero(ibv_destroy_cq(cq)); +err_create_cq: + rte_free(fdq); +err: + return -1; +} + +/** + * Complete flow rule creation. + * + * @param priv + * Pointer to private structure. + * @param ibv_attr + * Verbs flow attributes. + * @param action + * Target action structure. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A flow if the rule could be created. + */ +static struct rte_flow * +priv_flow_create_action_queue(struct priv *priv, + struct ibv_flow_attr *ibv_attr, + struct mlx4_flow_action *action, + struct rte_flow_error *error) +{ + struct ibv_qp *qp; + struct rte_flow *rte_flow; + + assert(priv->pd); + assert(priv->ctx); + rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow), 0); + if (!rte_flow) { + rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "cannot allocate flow memory"); + return NULL; + } + if (action->drop) { + qp = priv->flow_drop_queue->qp; + } else { + struct rxq *rxq = (*priv->rxqs)[action->queue_id]; + + qp = rxq->qp; + rte_flow->qp = qp; + } + rte_flow->ibv_attr = ibv_attr; + rte_flow->ibv_flow = ibv_create_flow(qp, rte_flow->ibv_attr); + if (!rte_flow->ibv_flow) { + rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "flow rule creation failure"); + goto error; + } + return rte_flow; + +error: + rte_free(rte_flow); + return NULL; +} + +/** + * Convert a flow. + * + * @param priv + * Pointer to private structure. + * @param[in] attr + * Flow rule attributes. + * @param[in] items + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A flow on success, NULL otherwise. + */ +static struct rte_flow * +priv_flow_create(struct priv *priv, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct rte_flow *rte_flow; + struct mlx4_flow_action action; + struct mlx4_flow flow = { .offset = sizeof(struct ibv_flow_attr), }; + int err; + + err = priv_flow_validate(priv, attr, items, actions, error, &flow); + if (err) + return NULL; + flow.ibv_attr = rte_malloc(__func__, flow.offset, 0); + if (!flow.ibv_attr) { + rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "cannot allocate ibv_attr memory"); + return NULL; + } + flow.offset = sizeof(struct ibv_flow_attr); + *flow.ibv_attr = (struct ibv_flow_attr){ + .comp_mask = 0, + .type = IBV_FLOW_ATTR_NORMAL, + .size = sizeof(struct ibv_flow_attr), + .priority = attr->priority, + .num_of_specs = 0, + .port = priv->port, + .flags = 0, + }; + claim_zero(priv_flow_validate(priv, attr, items, actions, + error, &flow)); + action = (struct mlx4_flow_action){ + .queue = 0, + .drop = 0, + }; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { + if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { + continue; + } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { + action.queue = 1; + action.queue_id = + ((const struct rte_flow_action_queue *) + actions->conf)->index; + } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { + action.drop = 1; + } else { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, "unsupported action"); + goto exit; + } + } + rte_flow = priv_flow_create_action_queue(priv, flow.ibv_attr, + &action, error); + if (rte_flow) + return rte_flow; +exit: + rte_free(flow.ibv_attr); + return NULL; +} + +/** + * Create a flow. + * + * @see rte_flow_create() + * @see rte_flow_ops + */ +struct rte_flow * +mlx4_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + struct rte_flow *flow; + + priv_lock(priv); + flow = priv_flow_create(priv, attr, items, actions, error); + if (flow) { + LIST_INSERT_HEAD(&priv->flows, flow, next); + DEBUG("Flow created %p", (void *)flow); + } + priv_unlock(priv); + return flow; +} + +/** + * Destroy a flow. + * + * @param priv + * Pointer to private structure. + * @param[in] flow + * Flow to destroy. + */ +static void +priv_flow_destroy(struct priv *priv, struct rte_flow *flow) +{ + (void)priv; + LIST_REMOVE(flow, next); + if (flow->ibv_flow) + claim_zero(ibv_destroy_flow(flow->ibv_flow)); + rte_free(flow->ibv_attr); + DEBUG("Flow destroyed %p", (void *)flow); + rte_free(flow); +} + +/** + * Destroy a flow. + * + * @see rte_flow_destroy() + * @see rte_flow_ops + */ +int +mlx4_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + + (void)error; + priv_lock(priv); + priv_flow_destroy(priv, flow); + priv_unlock(priv); + return 0; +} + +/** + * Destroy all flows. + * + * @param priv + * Pointer to private structure. + */ +static void +priv_flow_flush(struct priv *priv) +{ + while (!LIST_EMPTY(&priv->flows)) { + struct rte_flow *flow; + + flow = LIST_FIRST(&priv->flows); + priv_flow_destroy(priv, flow); + } +} + +/** + * Destroy all flows. + * + * @see rte_flow_flush() + * @see rte_flow_ops + */ +int +mlx4_flow_flush(struct rte_eth_dev *dev, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + + (void)error; + priv_lock(priv); + priv_flow_flush(priv); + priv_unlock(priv); + return 0; +} + +/** + * Remove all flows. + * + * Called by dev_stop() to remove all flows. + * + * @param priv + * Pointer to private structure. + */ +void +mlx4_priv_flow_stop(struct priv *priv) +{ + struct rte_flow *flow; + + for (flow = LIST_FIRST(&priv->flows); + flow; + flow = LIST_NEXT(flow, next)) { + claim_zero(ibv_destroy_flow(flow->ibv_flow)); + flow->ibv_flow = NULL; + DEBUG("Flow %p removed", (void *)flow); + } + mlx4_flow_destroy_drop_queue(priv); +} + +/** + * Add all flows. + * + * @param priv + * Pointer to private structure. + * + * @return + * 0 on success, a errno value otherwise and rte_errno is set. + */ +int +mlx4_priv_flow_start(struct priv *priv) +{ + int ret; + struct ibv_qp *qp; + struct rte_flow *flow; + + ret = mlx4_flow_create_drop_queue(priv); + if (ret) + return -1; + for (flow = LIST_FIRST(&priv->flows); + flow; + flow = LIST_NEXT(flow, next)) { + qp = flow->qp ? flow->qp : priv->flow_drop_queue->qp; + flow->ibv_flow = ibv_create_flow(qp, flow->ibv_attr); + if (!flow->ibv_flow) { + DEBUG("Flow %p cannot be applied", (void *)flow); + rte_errno = EINVAL; + return rte_errno; + } + DEBUG("Flow %p applied", (void *)flow); + } + return 0; +} diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h new file mode 100644 index 00000000..12a293e4 --- /dev/null +++ b/drivers/net/mlx4/mlx4_flow.h @@ -0,0 +1,102 @@ +/*- + * BSD LICENSE + * + * Copyright 2017 6WIND S.A. + * Copyright 2017 Mellanox. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RTE_PMD_MLX4_FLOW_H_ +#define RTE_PMD_MLX4_FLOW_H_ + +#include <stddef.h> +#include <stdint.h> +#include <sys/queue.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_byteorder.h> + +#include "mlx4.h" + +struct rte_flow { + LIST_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */ + struct ibv_flow *ibv_flow; /**< Verbs flow. */ + struct ibv_flow_attr *ibv_attr; /**< Pointer to Verbs attributes. */ + struct ibv_qp *qp; /**< Verbs queue pair. */ +}; + +int +mlx4_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); + +struct rte_flow * +mlx4_flow_create(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); + +int +mlx4_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error); + +int +mlx4_flow_flush(struct rte_eth_dev *dev, + struct rte_flow_error *error); + +/** Structure to pass to the conversion function. */ +struct mlx4_flow { + struct ibv_flow_attr *ibv_attr; /**< Verbs attribute. */ + unsigned int offset; /**< Offset in bytes in the ibv_attr buffer. */ +}; + +struct mlx4_flow_action { + uint32_t drop:1; /**< Target is a drop queue. */ + uint32_t queue:1; /**< Target is a receive queue. */ + uint32_t queue_id; /**< Identifier of the queue. */ +}; + +int mlx4_priv_flow_start(struct priv *priv); +void mlx4_priv_flow_stop(struct priv *priv); + +#endif /* RTE_PMD_MLX4_FLOW_H_ */ |