diff options
author | Luca Boccassi <luca.boccassi@gmail.com> | 2018-11-12 16:14:45 +0000 |
---|---|---|
committer | Luca Boccassi <luca.boccassi@gmail.com> | 2018-11-12 16:15:06 +0000 |
commit | 88fab00d4402af240c1b7cc2566133aece115488 (patch) | |
tree | 54525f2b8784dd20ce6886b429ef85d24df04532 /drivers | |
parent | 8d01b9cd70a67cdafd5b965a70420c3bd7fb3f82 (diff) |
New upstream version 18.11-rc2upstream/18.11-rc2
Change-Id: I43ca4edd0747b2dfc38c574ebf3c0aac17d7392c
Signed-off-by: Luca Boccassi <luca.boccassi@gmail.com>
Diffstat (limited to 'drivers')
53 files changed, 5289 insertions, 1162 deletions
diff --git a/drivers/bus/dpaa/base/fman/fman.c b/drivers/bus/dpaa/base/fman/fman.c index bdb70042..06762e0f 100644 --- a/drivers/bus/dpaa/base/fman/fman.c +++ b/drivers/bus/dpaa/base/fman/fman.c @@ -13,6 +13,7 @@ #include <fman.h> #include <of.h> #include <rte_dpaa_logs.h> +#include <rte_string_fns.h> #define QMI_PORT_REGS_OFFSET 0x400 @@ -183,7 +184,7 @@ fman_if_init(const struct device_node *dpa_node) } memset(__if, 0, sizeof(*__if)); INIT_LIST_HEAD(&__if->__if.bpool_list); - strncpy(__if->node_path, dpa_node->full_name, PATH_MAX - 1); + strlcpy(__if->node_path, dpa_node->full_name, PATH_MAX - 1); __if->node_path[PATH_MAX - 1] = '\0'; /* Obtain the MAC node used by this interface except macless */ diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c index 45c24ef7..c99d523f 100644 --- a/drivers/bus/pci/linux/pci.c +++ b/drivers/bus/pci/linux/pci.c @@ -349,11 +349,36 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) if (ret < 0) { rte_pci_insert_device(dev2, dev); } else { /* already registered */ - dev2->kdrv = dev->kdrv; - dev2->max_vfs = dev->max_vfs; - pci_name_set(dev2); - memmove(dev2->mem_resource, dev->mem_resource, - sizeof(dev->mem_resource)); + if (!rte_dev_is_probed(&dev2->device)) { + dev2->kdrv = dev->kdrv; + dev2->max_vfs = dev->max_vfs; + pci_name_set(dev2); + memmove(dev2->mem_resource, + dev->mem_resource, + sizeof(dev->mem_resource)); + } else { + /** + * If device is plugged and driver is + * probed already, (This happens when + * we call rte_dev_probe which will + * scan all device on the bus) we don't + * need to do anything here unless... + **/ + if (dev2->kdrv != dev->kdrv || + dev2->max_vfs != dev->max_vfs) + /* + * This should not happens. + * But it is still possible if + * we unbind a device from + * vfio or uio before hotplug + * remove and rebind it with + * a different configure. + * So we just print out the + * error as an alarm. + */ + RTE_LOG(ERR, EAL, "Unexpected device scan at %s!\n", + filename); + } free(dev); } return 0; @@ -590,7 +615,16 @@ pci_one_device_iommu_support_va(struct rte_pci_device *dev) mgaw = ((vtd_cap_reg & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; - return rte_eal_check_dma_mask(mgaw) == 0 ? true : false; + /* + * Assuming there is no limitation by now. We can not know at this point + * because the memory has not been initialized yet. Setting the dma mask + * will force a check once memory initialization is done. We can not do + * a fallback to IOVA PA now, but if the dma check fails, the error + * message should advice for using '--iova-mode pa' if IOVA VA is the + * current mode. + */ + rte_mem_set_dma_mask(mgaw); + return true; } #elif defined(RTE_ARCH_PPC_64) static bool @@ -679,6 +713,7 @@ int rte_pci_read_config(const struct rte_pci_device *device, switch (device->kdrv) { case RTE_KDRV_IGB_UIO: + case RTE_KDRV_UIO_GENERIC: return pci_uio_read_config(intr_handle, buf, len, offset); #ifdef VFIO_PRESENT case RTE_KDRV_VFIO: @@ -702,6 +737,7 @@ int rte_pci_write_config(const struct rte_pci_device *device, switch (device->kdrv) { case RTE_KDRV_IGB_UIO: + case RTE_KDRV_UIO_GENERIC: return pci_uio_write_config(intr_handle, buf, len, offset); #ifdef VFIO_PRESENT case RTE_KDRV_VFIO: diff --git a/drivers/bus/vmbus/rte_bus_vmbus.h b/drivers/bus/vmbus/rte_bus_vmbus.h index 2839fef5..4cf73ce8 100644 --- a/drivers/bus/vmbus/rte_bus_vmbus.h +++ b/drivers/bus/vmbus/rte_bus_vmbus.h @@ -407,8 +407,7 @@ void rte_vmbus_unregister(struct rte_vmbus_driver *driver); /** Helper for VMBUS device registration from driver instance */ #define RTE_PMD_REGISTER_VMBUS(nm, vmbus_drv) \ - RTE_INIT(vmbusinitfn_ ##nm); \ - static void vmbusinitfn_ ##nm(void) \ + RTE_INIT(vmbusinitfn_ ##nm) \ { \ (vmbus_drv).driver.name = RTE_STR(nm); \ rte_vmbus_register(&vmbus_drv); \ diff --git a/drivers/compress/isal/isal_compress_pmd.c b/drivers/compress/isal/isal_compress_pmd.c index e943336b..9f1e9688 100644 --- a/drivers/compress/isal/isal_compress_pmd.c +++ b/drivers/compress/isal/isal_compress_pmd.c @@ -314,24 +314,23 @@ chained_mbuf_decompression(struct rte_comp_op *op, struct isal_comp_qp *qp) ret = isal_inflate(qp->state); - /* Check for first segment, offset needs to be accounted for */ - if (remaining_data == op->src.length) { - consumed_data = src->data_len - qp->state->avail_in - - src_remaining_offset; - } else - consumed_data = src->data_len - qp->state->avail_in; - - op->consumed += consumed_data; - remaining_data -= consumed_data; - if (ret != ISAL_DECOMP_OK) { ISAL_PMD_LOG(ERR, "Decompression operation failed\n"); op->status = RTE_COMP_OP_STATUS_ERROR; return ret; } + /* Check for first segment, offset needs to be accounted for */ + if (remaining_data == op->src.length) { + consumed_data = src->data_len - src_remaining_offset; + } else + consumed_data = src->data_len; + if (qp->state->avail_in == 0 && op->consumed != op->src.length) { + op->consumed += consumed_data; + remaining_data -= consumed_data; + if (src->next != NULL) { src = src->next; qp->state->next_in = @@ -460,8 +459,9 @@ process_isal_deflate(struct rte_comp_op *op, struct isal_comp_qp *qp, return ret; } } - op->consumed = qp->stream->total_in; - op->produced = qp->stream->total_out; + op->consumed = qp->stream->total_in; + op->produced = qp->stream->total_out; + isal_deflate_reset(qp->stream); return ret; } @@ -538,6 +538,7 @@ process_isal_inflate(struct rte_comp_op *op, struct isal_comp_qp *qp) op->consumed = op->src.length - qp->state->avail_in; } op->produced = qp->state->total_out; + isal_inflate_reset(qp->state); return ret; } diff --git a/drivers/compress/octeontx/otx_zip_pmd.c b/drivers/compress/octeontx/otx_zip_pmd.c index 67ff5066..a1651b22 100644 --- a/drivers/compress/octeontx/otx_zip_pmd.c +++ b/drivers/compress/octeontx/otx_zip_pmd.c @@ -647,10 +647,7 @@ static struct rte_pci_driver octtx_zip_pmd = { RTE_PMD_REGISTER_PCI(COMPRESSDEV_NAME_ZIP_PMD, octtx_zip_pmd); RTE_PMD_REGISTER_PCI_TABLE(COMPRESSDEV_NAME_ZIP_PMD, pci_id_octtx_zipvf_table); -RTE_INIT(octtx_zip_init_log); - -static void -octtx_zip_init_log(void) +RTE_INIT(octtx_zip_init_log) { octtx_zip_logtype_driver = rte_log_register("pmd.compress.octeontx"); if (octtx_zip_logtype_driver >= 0) diff --git a/drivers/compress/qat/qat_comp.c b/drivers/compress/qat/qat_comp.c index d70c5949..27547428 100644 --- a/drivers/compress/qat/qat_comp.c +++ b/drivers/compress/qat/qat_comp.c @@ -141,6 +141,14 @@ qat_comp_process_response(void **op, uint8_t *resp) resp_msg->comn_resp.comn_status)) != ICP_QAT_FW_COMN_STATUS_FLAG_OK) { + if (unlikely((ICP_QAT_FW_COMN_RESP_XLAT_STAT_GET( + resp_msg->comn_resp.comn_status) != + ICP_QAT_FW_COMN_STATUS_FLAG_OK) && + (qat_xform->qat_comp_request_type + == QAT_COMP_REQUEST_DYNAMIC_COMP_STATELESS))) + QAT_DP_LOG(ERR, "QAT intermediate buffer may be too " + "small for output, try configuring a larger size"); + rx_op->status = RTE_COMP_OP_STATUS_ERROR; rx_op->debug_status = *((uint16_t *)(&resp_msg->comn_resp.comn_error)); diff --git a/drivers/compress/qat/qat_comp_pmd.c b/drivers/compress/qat/qat_comp_pmd.c index 01dd7361..ea930772 100644 --- a/drivers/compress/qat/qat_comp_pmd.c +++ b/drivers/compress/qat/qat_comp_pmd.c @@ -165,11 +165,14 @@ qat_comp_setup_inter_buffers(struct qat_comp_dev_private *comp_dev, } /* Create a memzone to hold intermediate buffers and associated - * meta-data needed by the firmware. The memzone contains: + * meta-data needed by the firmware. The memzone contains 3 parts: * - a list of num_im_sgls physical pointers to sgls - * - the num_im_sgl sgl structures, each pointing to 2 flat buffers - * - the flat buffers: num_im_sgl * 2 - * where num_im_sgls depends on the hardware generation of the device + * - the num_im_sgl sgl structures, each pointing to + * QAT_NUM_BUFS_IN_IM_SGL flat buffers + * - the flat buffers: num_im_sgl * QAT_NUM_BUFS_IN_IM_SGL + * buffers, each of buff_size + * num_im_sgls depends on the hardware generation of the device + * buff_size comes from the user via the config file */ size_of_ptr_array = num_im_sgls * sizeof(phys_addr_t); @@ -202,30 +205,31 @@ qat_comp_setup_inter_buffers(struct qat_comp_dev_private *comp_dev, offset_of_sgls + i * sizeof(struct qat_inter_sgl); struct qat_inter_sgl *sgl = (struct qat_inter_sgl *)(mz_start + curr_sgl_offset); + int lb; array_of_pointers->pointer[i] = mz_start_phys + curr_sgl_offset; sgl->num_bufs = QAT_NUM_BUFS_IN_IM_SGL; sgl->num_mapped_bufs = 0; sgl->resrvd = 0; - sgl->buffers[0].addr = mz_start_phys + offset_of_flat_buffs + - ((i * QAT_NUM_BUFS_IN_IM_SGL) * buff_size); - sgl->buffers[0].len = buff_size; - sgl->buffers[0].resrvd = 0; - sgl->buffers[1].addr = mz_start_phys + offset_of_flat_buffs + - (((i * QAT_NUM_BUFS_IN_IM_SGL) + 1) * buff_size); - sgl->buffers[1].len = buff_size; - sgl->buffers[1].resrvd = 0; #if QAT_IM_BUFFER_DEBUG QAT_LOG(DEBUG, " : phys addr of sgl[%i] in array_of_pointers" - "= 0x%"PRIx64, i, array_of_pointers->pointer[i]); + " = 0x%"PRIx64, i, array_of_pointers->pointer[i]); QAT_LOG(DEBUG, " : virt address of sgl[%i] = %p", i, sgl); - QAT_LOG(DEBUG, " : sgl->buffers[0].addr = 0x%"PRIx64", len=%d", - sgl->buffers[0].addr, sgl->buffers[0].len); - QAT_LOG(DEBUG, " : sgl->buffers[1].addr = 0x%"PRIx64", len=%d", - sgl->buffers[1].addr, sgl->buffers[1].len); +#endif + for (lb = 0; lb < QAT_NUM_BUFS_IN_IM_SGL; lb++) { + sgl->buffers[lb].addr = + mz_start_phys + offset_of_flat_buffs + + (((i * QAT_NUM_BUFS_IN_IM_SGL) + lb) * buff_size); + sgl->buffers[lb].len = buff_size; + sgl->buffers[lb].resrvd = 0; +#if QAT_IM_BUFFER_DEBUG + QAT_LOG(DEBUG, + " : sgl->buffers[%d].addr = 0x%"PRIx64", len=%d", + lb, sgl->buffers[lb].addr, sgl->buffers[lb].len); #endif } + } #if QAT_IM_BUFFER_DEBUG QAT_DP_HEXDUMP_LOG(DEBUG, "IM buffer memzone start:", mz_start, offset_of_flat_buffs + 32); diff --git a/drivers/compress/zlib/zlib_pmd.c b/drivers/compress/zlib/zlib_pmd.c index 7d6871b1..5a4d47d4 100644 --- a/drivers/compress/zlib/zlib_pmd.c +++ b/drivers/compress/zlib/zlib_pmd.c @@ -425,10 +425,8 @@ static struct rte_vdev_driver zlib_pmd_drv = { }; RTE_PMD_REGISTER_VDEV(COMPRESSDEV_NAME_ZLIB_PMD, zlib_pmd_drv); -RTE_INIT(zlib_init_log); -static void -zlib_init_log(void) +RTE_INIT(zlib_init_log) { zlib_logtype_driver = rte_log_register("pmd.compress.zlib"); if (zlib_logtype_driver >= 0) diff --git a/drivers/crypto/caam_jr/caam_jr_uio.c b/drivers/crypto/caam_jr/caam_jr_uio.c index c07d9db0..d94101c2 100644 --- a/drivers/crypto/caam_jr/caam_jr_uio.c +++ b/drivers/crypto/caam_jr/caam_jr_uio.c @@ -332,7 +332,7 @@ free_job_ring(uint32_t uio_fd) struct uio_job_ring *job_ring = NULL; int i; - if (!job_ring->uio_fd) + if (!uio_fd) return; for (i = 0; i < MAX_SEC_JOB_RINGS; i++) { diff --git a/drivers/crypto/openssl/rte_openssl_pmd.c b/drivers/crypto/openssl/rte_openssl_pmd.c index 003116dc..11ea0d19 100644 --- a/drivers/crypto/openssl/rte_openssl_pmd.c +++ b/drivers/crypto/openssl/rte_openssl_pmd.c @@ -1843,6 +1843,9 @@ process_openssl_rsa_op(struct rte_crypto_op *cop, struct rte_crypto_asym_op *op = cop->asym; RSA *rsa = sess->u.r.rsa; uint32_t pad = (op->rsa.pad); + uint8_t *tmp; + + cop->status = RTE_CRYPTO_OP_STATUS_SUCCESS; switch (pad) { case RTE_CRYPTO_RSA_PKCS1_V1_5_BT0: @@ -1895,9 +1898,15 @@ process_openssl_rsa_op(struct rte_crypto_op *cop, break; case RTE_CRYPTO_ASYM_OP_VERIFY: + tmp = rte_malloc(NULL, op->rsa.sign.length, 0); + if (tmp == NULL) { + OPENSSL_LOG(ERR, "Memory allocation failed"); + cop->status = RTE_CRYPTO_OP_STATUS_ERROR; + break; + } ret = RSA_public_decrypt(op->rsa.sign.length, op->rsa.sign.data, - op->rsa.sign.data, + tmp, rsa, pad); @@ -1905,13 +1914,12 @@ process_openssl_rsa_op(struct rte_crypto_op *cop, "Length of public_decrypt %d " "length of message %zd\n", ret, op->rsa.message.length); - - if (memcmp(op->rsa.sign.data, op->rsa.message.data, - op->rsa.message.length)) { - OPENSSL_LOG(ERR, - "RSA sign Verification failed"); - return -1; + if ((ret <= 0) || (memcmp(tmp, op->rsa.message.data, + op->rsa.message.length))) { + OPENSSL_LOG(ERR, "RSA sign Verification failed"); + cop->status = RTE_CRYPTO_OP_STATUS_ERROR; } + rte_free(tmp); break; default: diff --git a/drivers/crypto/scheduler/scheduler_pmd.c b/drivers/crypto/scheduler/scheduler_pmd.c index 20198ccb..a1632a2b 100644 --- a/drivers/crypto/scheduler/scheduler_pmd.c +++ b/drivers/crypto/scheduler/scheduler_pmd.c @@ -369,7 +369,7 @@ parse_name_arg(const char *key __rte_unused, return -EINVAL; } - strncpy(params->name, value, RTE_CRYPTODEV_NAME_MAX_LEN); + strlcpy(params->name, value, RTE_CRYPTODEV_NAME_MAX_LEN); return 0; } diff --git a/drivers/net/avf/base/avf_register.h b/drivers/net/avf/base/avf_register.h index ba5a9f3f..adb98958 100644 --- a/drivers/net/avf/base/avf_register.h +++ b/drivers/net/avf/base/avf_register.h @@ -76,7 +76,7 @@ POSSIBILITY OF SUCH DAMAGE. #define AVF_ARQLEN1_ARQCRIT_SHIFT 30 #define AVF_ARQLEN1_ARQCRIT_MASK AVF_MASK(0x1, AVF_ARQLEN1_ARQCRIT_SHIFT) #define AVF_ARQLEN1_ARQENABLE_SHIFT 31 -#define AVF_ARQLEN1_ARQENABLE_MASK AVF_MASK(0x1, AVF_ARQLEN1_ARQENABLE_SHIFT) +#define AVF_ARQLEN1_ARQENABLE_MASK AVF_MASK(0x1U, AVF_ARQLEN1_ARQENABLE_SHIFT) #define AVF_ARQT1 0x00007000 /* Reset: EMPR */ #define AVF_ARQT1_ARQT_SHIFT 0 #define AVF_ARQT1_ARQT_MASK AVF_MASK(0x3FF, AVF_ARQT1_ARQT_SHIFT) @@ -99,7 +99,7 @@ POSSIBILITY OF SUCH DAMAGE. #define AVF_ATQLEN1_ATQCRIT_SHIFT 30 #define AVF_ATQLEN1_ATQCRIT_MASK AVF_MASK(0x1, AVF_ATQLEN1_ATQCRIT_SHIFT) #define AVF_ATQLEN1_ATQENABLE_SHIFT 31 -#define AVF_ATQLEN1_ATQENABLE_MASK AVF_MASK(0x1, AVF_ATQLEN1_ATQENABLE_SHIFT) +#define AVF_ATQLEN1_ATQENABLE_MASK AVF_MASK(0x1U, AVF_ATQLEN1_ATQENABLE_SHIFT) #define AVF_ATQT1 0x00008400 /* Reset: EMPR */ #define AVF_ATQT1_ATQT_SHIFT 0 #define AVF_ATQT1_ATQT_MASK AVF_MASK(0x3FF, AVF_ATQT1_ATQT_SHIFT) diff --git a/drivers/net/bnxt/bnxt_rxr.c b/drivers/net/bnxt/bnxt_rxr.c index c7bc8848..1bfc63d9 100644 --- a/drivers/net/bnxt/bnxt_rxr.c +++ b/drivers/net/bnxt/bnxt_rxr.c @@ -102,25 +102,6 @@ static inline void bnxt_reuse_rx_mbuf(struct bnxt_rx_ring_info *rxr, rxr->rx_prod = prod; } -#ifdef BNXT_DEBUG -static void bnxt_reuse_ag_mbuf(struct bnxt_rx_ring_info *rxr, uint16_t cons, - struct rte_mbuf *mbuf) -{ - uint16_t prod = rxr->ag_prod; - struct bnxt_sw_rx_bd *prod_rx_buf; - struct rx_prod_pkt_bd *prod_bd, *cons_bd; - - prod_rx_buf = &rxr->ag_buf_ring[prod]; - - prod_rx_buf->mbuf = mbuf; - - prod_bd = &rxr->ag_desc_ring[prod]; - cons_bd = &rxr->ag_desc_ring[cons]; - - prod_bd->address = cons_bd->addr; -} -#endif - static inline struct rte_mbuf *bnxt_consume_rx_buf(struct bnxt_rx_ring_info *rxr, uint16_t cons) @@ -377,9 +358,6 @@ static int bnxt_rx_pkt(struct rte_mbuf **rx_pkt, uint32_t tmp_raw_cons = *raw_cons; uint16_t cons, prod, cp_cons = RING_CMP(cpr->cp_ring_struct, tmp_raw_cons); -#ifdef BNXT_DEBUG - uint16_t ag_cons; -#endif struct rte_mbuf *mbuf; int rc = 0; uint8_t agg_buf = 0; @@ -482,8 +460,6 @@ static int bnxt_rx_pkt(struct rte_mbuf **rx_pkt, if (rxcmp1->errors_v2 & RX_CMP_L2_ERRORS) { /* Re-install the mbuf back to the rx ring */ bnxt_reuse_rx_mbuf(rxr, cons, mbuf); - if (agg_buf) - bnxt_reuse_ag_mbuf(rxr, ag_cons, mbuf); rc = -EIO; goto next_rx; diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c index 156f31c6..1a6d8e4d 100644 --- a/drivers/net/bonding/rte_eth_bond_pmd.c +++ b/drivers/net/bonding/rte_eth_bond_pmd.c @@ -3216,8 +3216,6 @@ bond_probe(struct rte_vdev_device *dev) internals = rte_eth_devices[port_id].data->dev_private; internals->kvlist = kvlist; - rte_eth_dev_probing_finish(&rte_eth_devices[port_id]); - if (rte_kvargs_count(kvlist, PMD_BOND_AGG_MODE_KVARG) == 1) { if (rte_kvargs_process(kvlist, PMD_BOND_AGG_MODE_KVARG, @@ -3230,12 +3228,12 @@ bond_probe(struct rte_vdev_device *dev) } if (internals->mode == BONDING_MODE_8023AD) - rte_eth_bond_8023ad_agg_selection_set(port_id, - agg_mode); + internals->mode4.agg_selection = agg_mode; } else { - rte_eth_bond_8023ad_agg_selection_set(port_id, AGG_STABLE); + internals->mode4.agg_selection = AGG_STABLE; } + rte_eth_dev_probing_finish(&rte_eth_devices[port_id]); RTE_BOND_LOG(INFO, "Create bonded device %s on port %d in mode %u on " "socket %u.", name, port_id, bonding_mode, socket_id); return 0; diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c index 25ff5f68..ab0a80e1 100644 --- a/drivers/net/e1000/igb_rxtx.c +++ b/drivers/net/e1000/igb_rxtx.c @@ -50,6 +50,10 @@ #endif /* Bit Mask to indicate what bits required for building TX context */ #define IGB_TX_OFFLOAD_MASK ( \ + PKT_TX_OUTER_IPV6 | \ + PKT_TX_OUTER_IPV4 | \ + PKT_TX_IPV6 | \ + PKT_TX_IPV4 | \ PKT_TX_VLAN_PKT | \ PKT_TX_IP_CKSUM | \ PKT_TX_L4_MASK | \ diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c index 0c0ed930..abe1e7bd 100644 --- a/drivers/net/ena/ena_ethdev.c +++ b/drivers/net/ena/ena_ethdev.c @@ -509,6 +509,8 @@ err: static void ena_close(struct rte_eth_dev *dev) { + struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev); + struct rte_intr_handle *intr_handle = &pci_dev->intr_handle; struct ena_adapter *adapter = (struct ena_adapter *)(dev->data->dev_private); @@ -518,6 +520,25 @@ static void ena_close(struct rte_eth_dev *dev) ena_rx_queue_release_all(dev); ena_tx_queue_release_all(dev); + + rte_free(adapter->drv_stats); + adapter->drv_stats = NULL; + + rte_intr_disable(intr_handle); + rte_intr_callback_unregister(intr_handle, + ena_interrupt_handler_rte, + adapter); + + /* + * Pass the information to the rte_eth_dev_close() that it should also + * release the private port resources. + */ + dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; + /* + * MAC is not allocated dynamically. Setting NULL should prevent from + * release of the resource in the rte_eth_dev_release_port(). + */ + dev->data->mac_addrs = NULL; } static int @@ -1683,8 +1704,6 @@ err: static int eth_ena_dev_uninit(struct rte_eth_dev *eth_dev) { - struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev); - struct rte_intr_handle *intr_handle = &pci_dev->intr_handle; struct ena_adapter *adapter = (struct ena_adapter *)(eth_dev->data->dev_private); @@ -1699,14 +1718,6 @@ static int eth_ena_dev_uninit(struct rte_eth_dev *eth_dev) eth_dev->tx_pkt_burst = NULL; eth_dev->tx_pkt_prepare = NULL; - rte_free(adapter->drv_stats); - adapter->drv_stats = NULL; - - rte_intr_disable(intr_handle); - rte_intr_callback_unregister(intr_handle, - ena_interrupt_handler_rte, - adapter); - adapter->state = ENA_ADAPTER_STATE_FREE; return 0; diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c index e81c3f3b..c3869de3 100644 --- a/drivers/net/enic/enic_main.c +++ b/drivers/net/enic/enic_main.c @@ -518,7 +518,7 @@ static void enic_prep_wq_for_simple_tx(struct enic *enic, uint16_t queue_idx) * The 'strong' version is in enic_rxtx_vec_avx2.c. This weak version is used * used when that file is not compiled. */ -bool __attribute__((weak)) +__rte_weak bool enic_use_vector_rx_handler(__rte_unused struct enic *enic) { return false; diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 8bfa2517..e1152ff0 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -69,7 +69,7 @@ I40E_TX_IEEE1588_TMST) #define I40E_TX_OFFLOAD_NOTSUP_MASK \ - (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_MASK) + ~(PKT_TX_OFFLOAD_MASK & I40E_TX_OFFLOAD_MASK) static inline void i40e_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union i40e_rx_desc *rxdp) @@ -1741,6 +1741,11 @@ i40e_dev_rx_queue_setup_runtime(struct rte_eth_dev *dev, ad->rx_bulk_alloc_allowed = false; i40e_set_rx_function(dev); return 0; + } else if (ad->rx_vec_allowed && !rte_is_power_of_2(rxq->nb_rx_desc)) { + PMD_DRV_LOG(ERR, "Vector mode is allowed, but descriptor" + " number %d of queue %d isn't power of 2", + rxq->nb_rx_desc, rxq->queue_id); + return -EINVAL; } /* check bulk alloc conflict */ diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h b/drivers/net/i40e/i40e_rxtx_vec_common.h index f00f6d64..0e6ffa00 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_common.h +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h @@ -192,8 +192,13 @@ static inline int i40e_rx_vec_dev_conf_condition_check_default(struct rte_eth_dev *dev) { #ifndef RTE_LIBRTE_IEEE1588 + struct i40e_adapter *ad = + I40E_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private); struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode; struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf; + struct i40e_rx_queue *rxq; + uint16_t desc, i; + bool first_queue; /* no fdir support */ if (fconf->mode != RTE_FDIR_MODE_NONE) @@ -207,6 +212,39 @@ i40e_rx_vec_dev_conf_condition_check_default(struct rte_eth_dev *dev) if (rxmode->offloads & DEV_RX_OFFLOAD_VLAN_EXTEND) return -1; + /** + * Vector mode is allowed only when number of Rx queue + * descriptor is power of 2. + */ + if (!dev->data->dev_started) { + first_queue = true; + for (i = 0; i < dev->data->nb_rx_queues; i++) { + rxq = dev->data->rx_queues[i]; + if (!rxq) + continue; + desc = rxq->nb_rx_desc; + if (first_queue) + ad->rx_vec_allowed = + rte_is_power_of_2(desc); + else + ad->rx_vec_allowed = + ad->rx_vec_allowed ? + rte_is_power_of_2(desc) : + ad->rx_vec_allowed; + first_queue = false; + } + } else { + /* Only check the first queue's descriptor number */ + for (i = 0; i < dev->data->nb_rx_queues; i++) { + rxq = dev->data->rx_queues[i]; + if (!rxq) + continue; + desc = rxq->nb_rx_desc; + ad->rx_vec_allowed = rte_is_power_of_2(desc); + break; + } + } + return 0; #else RTE_SET_USED(dev); diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index 269595b7..c9e82d51 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -220,6 +220,8 @@ static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev); static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev); static void ixgbe_dev_interrupt_handler(void *param); static void ixgbe_dev_interrupt_delayed_handler(void *param); +static void ixgbe_dev_setup_link_alarm_handler(void *param); + static int ixgbe_add_rar(struct rte_eth_dev *dev, struct ether_addr *mac_addr, uint32_t index, uint32_t pool); static void ixgbe_remove_rar(struct rte_eth_dev *dev, uint32_t index); @@ -1303,7 +1305,7 @@ eth_ixgbe_dev_uninit(struct rte_eth_dev *eth_dev) PMD_INIT_FUNC_TRACE(); if (rte_eal_process_type() != RTE_PROC_PRIMARY) - return -EPERM; + return 0; hw = IXGBE_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private); @@ -1702,7 +1704,7 @@ eth_ixgbevf_dev_uninit(struct rte_eth_dev *eth_dev) PMD_INIT_FUNC_TRACE(); if (rte_eal_process_type() != RTE_PROC_PRIMARY) - return -EPERM; + return 0; hw = IXGBE_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private); @@ -2793,6 +2795,8 @@ ixgbe_dev_stop(struct rte_eth_dev *dev) PMD_INIT_FUNC_TRACE(); + rte_eal_alarm_cancel(ixgbe_dev_setup_link_alarm_handler, dev); + /* disable interrupts */ ixgbe_disable_intr(hw); @@ -3971,6 +3975,25 @@ out: return ret_val; } +static void +ixgbe_dev_setup_link_alarm_handler(void *param) +{ + struct rte_eth_dev *dev = (struct rte_eth_dev *)param; + struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private); + struct ixgbe_interrupt *intr = + IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private); + u32 speed; + bool autoneg = false; + + speed = hw->phy.autoneg_advertised; + if (!speed) + ixgbe_get_link_capabilities(hw, &speed, &autoneg); + + ixgbe_setup_link(hw, speed, true); + + intr->flags &= ~IXGBE_FLAG_NEED_LINK_CONFIG; +} + /* return 0 means link status changed, -1 means not changed */ int ixgbe_dev_link_update_share(struct rte_eth_dev *dev, @@ -3983,9 +4006,7 @@ ixgbe_dev_link_update_share(struct rte_eth_dev *dev, IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private); int link_up; int diag; - u32 speed = 0; int wait = 1; - bool autoneg = false; memset(&link, 0, sizeof(link)); link.link_status = ETH_LINK_DOWN; @@ -3995,13 +4016,8 @@ ixgbe_dev_link_update_share(struct rte_eth_dev *dev, hw->mac.get_link_status = true; - if ((intr->flags & IXGBE_FLAG_NEED_LINK_CONFIG) && - ixgbe_get_media_type(hw) == ixgbe_media_type_fiber) { - speed = hw->phy.autoneg_advertised; - if (!speed) - ixgbe_get_link_capabilities(hw, &speed, &autoneg); - ixgbe_setup_link(hw, speed, true); - } + if (intr->flags & IXGBE_FLAG_NEED_LINK_CONFIG) + return rte_eth_linkstatus_set(dev, &link); /* check if it needs to wait to complete, if lsc interrupt is enabled */ if (wait_to_complete == 0 || dev->data->dev_conf.intr_conf.lsc != 0) @@ -4019,11 +4035,14 @@ ixgbe_dev_link_update_share(struct rte_eth_dev *dev, } if (link_up == 0) { - intr->flags |= IXGBE_FLAG_NEED_LINK_CONFIG; + if (ixgbe_get_media_type(hw) == ixgbe_media_type_fiber) { + intr->flags |= IXGBE_FLAG_NEED_LINK_CONFIG; + rte_eal_alarm_set(10, + ixgbe_dev_setup_link_alarm_handler, dev); + } return rte_eth_linkstatus_set(dev, &link); } - intr->flags &= ~IXGBE_FLAG_NEED_LINK_CONFIG; link.link_status = ETH_LINK_UP; link.link_duplex = ETH_LINK_FULL_DUPLEX; @@ -5128,6 +5147,8 @@ ixgbevf_dev_stop(struct rte_eth_dev *dev) PMD_INIT_FUNC_TRACE(); + rte_eal_alarm_cancel(ixgbe_dev_setup_link_alarm_handler, dev); + ixgbevf_intr_disable(dev); hw->adapter_stopped = 1; diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index fecb57c1..7a50bccd 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -51,6 +51,7 @@ CFLAGS += -D_DEFAULT_SOURCE CFLAGS += -D_XOPEN_SOURCE=600 CFLAGS += $(WERROR_FLAGS) CFLAGS += -Wno-strict-prototypes +CFLAGS += $(shell pkg-config --cflags libmnl) ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y) CFLAGS += -DMLX5_GLUE='"$(LIB_GLUE)"' CFLAGS += -DMLX5_GLUE_VERSION='"$(LIB_GLUE_VERSION)"' @@ -59,7 +60,7 @@ LDLIBS += -ldl else LDLIBS += -libverbs -lmlx5 endif -LDLIBS += -lmnl +LDLIBS += $(shell pkg-config --libs libmnl) LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs LDLIBS += -lrte_bus_pci @@ -137,9 +138,14 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_IBV_MLX5_MOD_CQE_128B_PAD \ + infiniband/mlx5dv.h \ + enum MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_IBV_FLOW_DV_SUPPORT \ infiniband/mlx5dv.h \ - enum MLX5DV_FLOW_ACTION_TAG \ + func mlx5dv_create_flow_action_packet_reformat \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ HAVE_ETHTOOL_LINK_MODE_25G \ @@ -212,6 +218,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum IFLA_PHYS_PORT_NAME \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_IFLA_VXLAN_COLLECT_METADATA \ + linux/if_link.h \ + enum IFLA_VXLAN_COLLECT_METADATA \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_TCA_CHAIN \ linux/rtnetlink.h \ enum TCA_CHAIN \ @@ -372,6 +383,86 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum TCA_VLAN_PUSH_VLAN_PRIORITY \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_KEY_ID \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_KEY_ID \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_IPV4_SRC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_IPV4_DST \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_IPV4_DST_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_IPV6_SRC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_IPV6_DST \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_IPV6_DST_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_UDP_SRC_PORT \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_UDP_DST_PORT \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_ACT_TUNNEL_KEY \ + linux/tc_act/tc_tunnel_key.h \ + define TCA_ACT_TUNNEL_KEY \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT \ + linux/tc_act/tc_tunnel_key.h \ + enum TCA_TUNNEL_KEY_ENC_DST_PORT \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_TUNNEL_KEY_NO_CSUM \ + linux/tc_act/tc_tunnel_key.h \ + enum TCA_TUNNEL_KEY_NO_CSUM \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_TC_ACT_PEDIT \ linux/tc_act/tc_pedit.h \ enum TCA_PEDIT_KEY_EX_HDR_TYPE_UDP \ diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build index e8cbe3ee..28938db0 100644 --- a/drivers/net/mlx5/meson.build +++ b/drivers/net/mlx5/meson.build @@ -96,8 +96,10 @@ if build 'MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED' ], [ 'HAVE_IBV_MLX5_MOD_CQE_128B_COMP', 'infiniband/mlx5dv.h', 'MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP' ], + [ 'HAVE_IBV_MLX5_MOD_CQE_128B_PAD', 'infiniband/mlx5dv.h', + 'MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD' ], [ 'HAVE_IBV_FLOW_DV_SUPPORT', 'infiniband/mlx5dv.h', - 'MLX5DV_FLOW_ACTION_TAG' ], + 'mlx5dv_create_flow_action_packet_reformat' ], [ 'HAVE_IBV_DEVICE_MPLS_SUPPORT', 'infiniband/verbs.h', 'IBV_FLOW_SPEC_MPLS' ], [ 'HAVE_IBV_WQ_FLAG_RX_END_PADDING', 'infiniband/verbs.h', @@ -128,6 +130,8 @@ if build 'IFLA_PHYS_SWITCH_ID' ], [ 'HAVE_IFLA_PHYS_PORT_NAME', 'linux/if_link.h', 'IFLA_PHYS_PORT_NAME' ], + [ 'HAVE_IFLA_VXLAN_COLLECT_METADATA', 'linux/if_link.h', + 'IFLA_VXLAN_COLLECT_METADATA' ], [ 'HAVE_TCA_CHAIN', 'linux/rtnetlink.h', 'TCA_CHAIN' ], [ 'HAVE_TCA_FLOWER_ACT', 'linux/pkt_cls.h', @@ -192,6 +196,38 @@ if build 'TC_ACT_GOTO_CHAIN' ], [ 'HAVE_TC_ACT_VLAN', 'linux/tc_act/tc_vlan.h', 'TCA_VLAN_PUSH_VLAN_PRIORITY' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_KEY_ID', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_KEY_ID' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_IPV4_SRC' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_IPV4_DST' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_IPV4_DST_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_IPV6_SRC' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_IPV6_DST' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_IPV6_DST_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_UDP_SRC_PORT' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_UDP_DST_PORT' ], + [ 'HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK' ], + [ 'HAVE_TC_ACT_TUNNEL_KEY', 'linux/tc_act/tc_tunnel_key.h', + 'TCA_ACT_TUNNEL_KEY' ], + [ 'HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT', 'linux/tc_act/tc_tunnel_key.h', + 'TCA_TUNNEL_KEY_ENC_DST_PORT' ], + [ 'HAVE_TCA_TUNNEL_KEY_NO_CSUM', 'linux/tc_act/tc_tunnel_key.h', + 'TCA_TUNNEL_KEY_NO_CSUM' ], [ 'HAVE_TC_ACT_PEDIT', 'linux/tc_act/tc_pedit.h', 'TCA_PEDIT_KEY_EX_HDR_TYPE_UDP' ], [ 'HAVE_RDMA_NL_NLDEV', 'rdma/rdma_netlink.h', diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index a277b573..ed1fcfc7 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -51,6 +51,9 @@ /* Device parameter to enable RX completion queue compression. */ #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" +/* Device parameter to enable RX completion entry padding to 128B. */ +#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en" + /* Device parameter to enable Multi-Packet Rx queue. */ #define MLX5_RX_MPRQ_EN "mprq_en" @@ -72,6 +75,12 @@ */ #define MLX5_TXQS_MIN_INLINE "txqs_min_inline" +/* + * Device parameter to configure the number of TX queues threshold for + * enabling vectorized Tx. + */ +#define MLX5_TXQS_MAX_VEC "txqs_max_vec" + /* Device parameter to enable multi-packet send WQEs. */ #define MLX5_TXQ_MPW_EN "txq_mpw_en" @@ -390,6 +399,7 @@ const struct eth_dev_ops mlx5_dev_ops = { .filter_ctrl = mlx5_dev_filter_ctrl, .rx_descriptor_status = mlx5_rx_descriptor_status, .tx_descriptor_status = mlx5_tx_descriptor_status, + .rx_queue_count = mlx5_rx_queue_count, .rx_queue_intr_enable = mlx5_rx_intr_enable, .rx_queue_intr_disable = mlx5_rx_intr_disable, .is_removed = mlx5_is_removed, @@ -479,6 +489,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque) } if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) { config->cqe_comp = !!tmp; + } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) { + config->cqe_pad = !!tmp; } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) { config->mprq.enabled = !!tmp; } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) { @@ -491,6 +503,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque) config->txq_inline = tmp; } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { config->txqs_inline = tmp; + } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) { + config->txqs_vec = tmp; } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { config->mps = !!tmp; } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { @@ -531,12 +545,14 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) { const char **params = (const char *[]){ MLX5_RXQ_CQE_COMP_EN, + MLX5_RXQ_CQE_PAD_EN, MLX5_RX_MPRQ_EN, MLX5_RX_MPRQ_LOG_STRIDE_NUM, MLX5_RX_MPRQ_MAX_MEMCPY_LEN, MLX5_RXQS_MIN_MPRQ, MLX5_TXQ_INLINE, MLX5_TXQS_MIN_INLINE, + MLX5_TXQS_MAX_VEC, MLX5_TXQ_MPW_EN, MLX5_TXQ_MPW_HDR_DSEG_EN, MLX5_TXQ_MAX_INLINE_LEN, @@ -698,8 +714,8 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev) * Backing DPDK device. * @param ibv_dev * Verbs device. - * @param vf - * If nonzero, enable VF-specific features. + * @param config + * Device configuration parameters. * @param[in] switch_info * Switch properties of Ethernet device. * @@ -713,7 +729,7 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev) static struct rte_eth_dev * mlx5_dev_spawn(struct rte_device *dpdk_dev, struct ibv_device *ibv_dev, - int vf, + struct mlx5_dev_config config, const struct mlx5_switch_info *switch_info) { struct ibv_context *ctx; @@ -721,28 +737,12 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, struct ibv_port_attr port_attr; struct ibv_pd *pd = NULL; struct mlx5dv_context dv_attr = { .comp_mask = 0 }; - struct mlx5_dev_config config = { - .vf = !!vf, - .mps = MLX5_ARG_UNSET, - .tx_vec_en = 1, - .rx_vec_en = 1, - .mpw_hdr_dseg = 0, - .txq_inline = MLX5_ARG_UNSET, - .txqs_inline = MLX5_ARG_UNSET, - .inline_max_packet_sz = MLX5_ARG_UNSET, - .vf_nl_en = 1, - .mprq = { - .enabled = 0, - .stride_num_n = MLX5_MPRQ_STRIDE_NUM_N, - .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, - .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, - }, - }; struct rte_eth_dev *eth_dev = NULL; struct priv *priv = NULL; int err = 0; unsigned int mps; unsigned int cqe_comp; + unsigned int cqe_pad = 0; unsigned int tunnel_en = 0; unsigned int mpls_en = 0; unsigned int swp = 0; @@ -863,6 +863,11 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, else cqe_comp = 1; config.cqe_comp = cqe_comp; +#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD + /* Whether device supports 128B Rx CQE padding. */ + cqe_pad = RTE_CACHE_LINE_SIZE == 128 && + (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); +#endif #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { tunnel_en = ((dv_attr.tunnel_offloads_caps & @@ -1079,6 +1084,12 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, DRV_LOG(WARNING, "Rx CQE compression isn't supported"); config.cqe_comp = 0; } + if (config.cqe_pad && !cqe_pad) { + DRV_LOG(WARNING, "Rx CQE padding isn't supported"); + config.cqe_pad = 0; + } else if (config.cqe_pad) { + DRV_LOG(INFO, "Rx CQE padding is enabled"); + } if (config.mprq.enabled && mprq) { if (config.mprq.stride_num_n > mprq_max_stride_num_n || config.mprq.stride_num_n < mprq_min_stride_num_n) { @@ -1157,7 +1168,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, eth_dev->dev_ops = &mlx5_dev_ops; /* Register MAC address. */ claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); - if (vf && config.vf_nl_en) + if (config.vf && config.vf_nl_en) mlx5_nl_mac_addr_sync(eth_dev); priv->tcf_context = mlx5_flow_tcf_context_create(); if (!priv->tcf_context) { @@ -1326,7 +1337,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, { struct ibv_device **ibv_list; unsigned int n = 0; - int vf; + struct mlx5_dev_config dev_config; int ret; assert(pci_drv == &mlx5_driver); @@ -1424,21 +1435,46 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, */ if (n) qsort(list, n, sizeof(*list), mlx5_dev_spawn_data_cmp); + /* Default configuration. */ + dev_config = (struct mlx5_dev_config){ + .mps = MLX5_ARG_UNSET, + .tx_vec_en = 1, + .rx_vec_en = 1, + .txq_inline = MLX5_ARG_UNSET, + .txqs_inline = MLX5_ARG_UNSET, + .txqs_vec = MLX5_ARG_UNSET, + .inline_max_packet_sz = MLX5_ARG_UNSET, + .vf_nl_en = 1, + .mprq = { + .enabled = 0, /* Disabled by default. */ + .stride_num_n = MLX5_MPRQ_STRIDE_NUM_N, + .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, + .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, + }, + }; + /* Device speicific configuration. */ switch (pci_dev->id.device_id) { + case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF: + dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS_BLUEFIELD; + break; case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: - vf = 1; + dev_config.vf = 1; break; default: - vf = 0; + break; } + /* Set architecture-dependent default value if unset. */ + if (dev_config.txqs_vec == MLX5_ARG_UNSET) + dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS; for (i = 0; i != n; ++i) { uint32_t restore; - list[i].eth_dev = mlx5_dev_spawn - (&pci_dev->device, list[i].ibv_dev, vf, &list[i].info); + list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device, + list[i].ibv_dev, dev_config, + &list[i].info); if (!list[i].eth_dev) { if (rte_errno != EBUSY && rte_errno != EEXIST) break; diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 74d87c05..bc500b2b 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -115,6 +115,7 @@ struct mlx5_dev_config { /* Whether tunnel stateless offloads are supported. */ unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */ unsigned int cqe_comp:1; /* CQE compression is enabled. */ + unsigned int cqe_pad:1; /* CQE padding is enabled. */ unsigned int tso:1; /* Whether TSO is supported. */ unsigned int tx_vec_en:1; /* Tx vector is enabled. */ unsigned int rx_vec_en:1; /* Rx vector is enabled. */ @@ -139,6 +140,7 @@ struct mlx5_dev_config { unsigned int ind_table_max_size; /* Maximum indirection table size. */ int txq_inline; /* Maximum packet size for inlining. */ int txqs_inline; /* Queue number threshold for inlining. */ + int txqs_vec; /* Queue number threshold for vectorized Tx. */ int inline_max_packet_sz; /* Max packet size for inlining. */ }; @@ -219,6 +221,7 @@ struct priv { /* Verbs Indirection tables. */ LIST_HEAD(ind_tables, mlx5_ind_table_ibv) ind_tbls; LIST_HEAD(matchers, mlx5_flow_dv_matcher) matchers; + LIST_HEAD(encap_decap, mlx5_flow_dv_encap_decap_resource) encaps_decaps; uint32_t link_speed_capa; /* Link speed capabilities. */ struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */ int primary_socket; /* Unix socket for primary process. */ diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index f2a16795..bfe66558 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -60,8 +60,13 @@ /* Maximum Packet headers size (L2+L3+L4) for TSO. */ #define MLX5_MAX_TSO_HEADER 192 -/* Default minimum number of Tx queues for vectorized Tx. */ -#define MLX5_VPMD_MIN_TXQS 4 +/* Default maximum number of Tx queues for vectorized Tx. */ +#if defined(RTE_ARCH_ARM64) +#define MLX5_VPMD_MAX_TXQS 8 +#else +#define MLX5_VPMD_MAX_TXQS 4 +#endif +#define MLX5_VPMD_MAX_TXQS_BLUEFIELD 16 /* Threshold of buffer replenishment for vectorized Rx. */ #define MLX5_VPMD_RXQ_RPLNSH_THRESH(n) \ diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index 280af0ab..3c2ac4b3 100644 --- a/drivers/net/mlx5/mlx5_flow.c +++ b/drivers/net/mlx5/mlx5_flow.c @@ -239,7 +239,6 @@ static const struct rte_flow_ops mlx5_flow_ops = { /* Convert FDIR request to Generic flow. */ struct mlx5_fdir { struct rte_flow_attr attr; - struct rte_flow_action actions[2]; struct rte_flow_item items[4]; struct rte_flow_item_eth l2; struct rte_flow_item_eth l2_mask; @@ -259,6 +258,7 @@ struct mlx5_fdir { struct rte_flow_item_udp udp; struct rte_flow_item_tcp tcp; } l4_mask; + struct rte_flow_action actions[2]; struct rte_flow_action_queue queue; }; @@ -275,7 +275,7 @@ static const uint32_t priority_map_5[][MLX5_PRIORITY_MAP_MAX] = { /* Tunnel information. */ struct mlx5_flow_tunnel_info { - uint32_t tunnel; /**< Tunnel bit (see MLX5_FLOW_*). */ + uint64_t tunnel; /**< Tunnel bit (see MLX5_FLOW_*). */ uint32_t ptype; /**< Tunnel Ptype (see RTE_PTYPE_*). */ }; @@ -912,7 +912,13 @@ mlx5_flow_validate_action_rss(const struct rte_flow_action *action, RTE_FLOW_ERROR_TYPE_ACTION_CONF, &rss->level, "tunnel RSS is not supported"); - if (rss->key_len < MLX5_RSS_HASH_KEY_LEN) + /* allow RSS key_len 0 in case of NULL (default) RSS key. */ + if (rss->key_len == 0 && rss->key != NULL) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key length 0"); + if (rss->key_len > 0 && rss->key_len < MLX5_RSS_HASH_KEY_LEN) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION_CONF, &rss->key_len, @@ -1046,15 +1052,13 @@ mlx5_flow_validate_item_eth(const struct rte_flow_item *item, }; int ret; int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t ethm = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; - if (item_flags & MLX5_FLOW_LAYER_OUTER_L2) + if (item_flags & ethm) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item, - "3 levels of l2 are not supported"); - if ((item_flags & MLX5_FLOW_LAYER_INNER_L2) && !tunnel) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, item, - "2 L2 without tunnel are not supported"); + "multiple L2 layers not supported"); if (!mask) mask = &rte_flow_item_eth_mask; ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, @@ -1079,7 +1083,7 @@ mlx5_flow_validate_item_eth(const struct rte_flow_item *item, */ int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, - int64_t item_flags, + uint64_t item_flags, struct rte_flow_error *error) { const struct rte_flow_item_vlan *spec = item->spec; @@ -1091,17 +1095,17 @@ mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, uint16_t vlan_tag = 0; const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); int ret; - const uint32_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | + const uint64_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | MLX5_FLOW_LAYER_INNER_L4) : (MLX5_FLOW_LAYER_OUTER_L3 | MLX5_FLOW_LAYER_OUTER_L4); - const uint32_t vlanm = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : + const uint64_t vlanm = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : MLX5_FLOW_LAYER_OUTER_VLAN; if (item_flags & vlanm) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, - "VLAN layer already configured"); + "multiple VLAN layers not supported"); else if ((item_flags & l34m) != 0) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, @@ -1145,7 +1149,7 @@ mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, */ int mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, - int64_t item_flags, + uint64_t item_flags, struct rte_flow_error *error) { const struct rte_flow_item_ipv4 *mask = item->mask; @@ -1158,15 +1162,17 @@ mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, }, }; const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; int ret; - if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : - MLX5_FLOW_LAYER_OUTER_L3)) + if (item_flags & l3m) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item, "multiple L3 layers not supported"); - else if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : - MLX5_FLOW_LAYER_OUTER_L4)) + else if (item_flags & l4m) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 cannot follow an L4 layer."); @@ -1214,15 +1220,17 @@ mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, }, }; const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; int ret; - if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : - MLX5_FLOW_LAYER_OUTER_L3)) + if (item_flags & l3m) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item, "multiple L3 layers not supported"); - else if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : - MLX5_FLOW_LAYER_OUTER_L4)) + else if (item_flags & l4m) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 cannot follow an L4 layer."); @@ -1273,6 +1281,10 @@ mlx5_flow_validate_item_udp(const struct rte_flow_item *item, { const struct rte_flow_item_udp *mask = item->mask; const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; int ret; if (target_protocol != 0xff && target_protocol != IPPROTO_UDP) @@ -1280,16 +1292,14 @@ mlx5_flow_validate_item_udp(const struct rte_flow_item *item, RTE_FLOW_ERROR_TYPE_ITEM, item, "protocol filtering not compatible" " with UDP layer"); - if (!(item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : - MLX5_FLOW_LAYER_OUTER_L3))) + if (!(item_flags & l3m)) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 is mandatory to filter on L4"); - if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : - MLX5_FLOW_LAYER_OUTER_L4)) + if (item_flags & l4m) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, - "L4 layer is already present"); + "multiple L4 layers not supported"); if (!mask) mask = &rte_flow_item_udp_mask; ret = mlx5_flow_item_acceptable @@ -1325,6 +1335,10 @@ mlx5_flow_validate_item_tcp(const struct rte_flow_item *item, { const struct rte_flow_item_tcp *mask = item->mask; const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + const uint64_t l3m = tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3; + const uint64_t l4m = tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4; int ret; assert(flow_mask); @@ -1333,16 +1347,14 @@ mlx5_flow_validate_item_tcp(const struct rte_flow_item *item, RTE_FLOW_ERROR_TYPE_ITEM, item, "protocol filtering not compatible" " with TCP layer"); - if (!(item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : - MLX5_FLOW_LAYER_OUTER_L3))) + if (!(item_flags & l3m)) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 is mandatory to filter on L4"); - if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : - MLX5_FLOW_LAYER_OUTER_L4)) + if (item_flags & l4m) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, - "L4 layer is already present"); + "multiple L4 layers not supported"); if (!mask) mask = &rte_flow_item_tcp_mask; ret = mlx5_flow_item_acceptable @@ -1387,7 +1399,8 @@ mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, if (item_flags & MLX5_FLOW_LAYER_TUNNEL) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item, - "a tunnel is already present"); + "multiple tunnel layers not" + " supported"); /* * Verify only UDPv4 is present as defined in * https://tools.ietf.org/html/rfc7348 @@ -1473,7 +1486,8 @@ mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, if (item_flags & MLX5_FLOW_LAYER_TUNNEL) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item, - "a tunnel is already present"); + "multiple tunnel layers not" + " supported"); /* * Verify only UDPv4 is present as defined in * https://tools.ietf.org/html/rfc7348 @@ -1556,7 +1570,8 @@ mlx5_flow_validate_item_gre(const struct rte_flow_item *item, if (item_flags & MLX5_FLOW_LAYER_TUNNEL) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item, - "a tunnel is already present"); + "multiple tunnel layers not" + " supported"); if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3)) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item, @@ -1610,11 +1625,13 @@ mlx5_flow_validate_item_mpls(const struct rte_flow_item *item __rte_unused, RTE_FLOW_ERROR_TYPE_ITEM, item, "protocol filtering not compatible" " with MPLS layer"); - if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + /* Multi-tunnel isn't allowed but MPLS over GRE is an exception. */ + if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) && + !(item_flags & MLX5_FLOW_LAYER_GRE)) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item, - "a tunnel is already" - " present"); + "multiple tunnel layers not" + " supported"); if (!mask) mask = &rte_flow_item_mpls_mask; ret = mlx5_flow_item_acceptable @@ -1646,8 +1663,6 @@ static struct mlx5_flow * flow_null_prepare(const struct rte_flow_attr *attr __rte_unused, const struct rte_flow_item items[] __rte_unused, const struct rte_flow_action actions[] __rte_unused, - uint64_t *item_flags __rte_unused, - uint64_t *action_flags __rte_unused, struct rte_flow_error *error __rte_unused) { rte_errno = ENOTSUP; @@ -1775,16 +1790,19 @@ flow_drv_validate(struct rte_eth_dev *dev, * calculates the size of memory required for device flow, allocates the memory, * initializes the device flow and returns the pointer. * + * @note + * This function initializes device flow structure such as dv, tcf or verbs in + * struct mlx5_flow. However, it is caller's responsibility to initialize the + * rest. For example, adding returning device flow to flow->dev_flow list and + * setting backward reference to the flow should be done out of this function. + * layers field is not filled either. + * * @param[in] attr * Pointer to the flow attributes. * @param[in] items * Pointer to the list of items. * @param[in] actions * Pointer to the list of actions. - * @param[out] item_flags - * Pointer to bit mask of all items detected. - * @param[out] action_flags - * Pointer to bit mask of all actions detected. * @param[out] error * Pointer to the error structure. * @@ -1792,12 +1810,10 @@ flow_drv_validate(struct rte_eth_dev *dev, * Pointer to device flow on success, otherwise NULL and rte_ernno is set. */ static inline struct mlx5_flow * -flow_drv_prepare(struct rte_flow *flow, +flow_drv_prepare(const struct rte_flow *flow, const struct rte_flow_attr *attr, const struct rte_flow_item items[], const struct rte_flow_action actions[], - uint64_t *item_flags, - uint64_t *action_flags, struct rte_flow_error *error) { const struct mlx5_flow_driver_ops *fops; @@ -1805,8 +1821,7 @@ flow_drv_prepare(struct rte_flow *flow, assert(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); fops = flow_get_drv_ops(type); - return fops->prepare(attr, items, actions, item_flags, action_flags, - error); + return fops->prepare(attr, items, actions, error); } /** @@ -1815,6 +1830,12 @@ flow_drv_prepare(struct rte_flow *flow, * translates a generic flow into a driver flow. flow_drv_prepare() must * precede. * + * @note + * dev_flow->layers could be filled as a result of parsing during translation + * if needed by flow_drv_apply(). dev_flow->flow->actions can also be filled + * if necessary. As a flow can have multiple dev_flows by RSS flow expansion, + * flow->actions could be overwritten even though all the expanded dev_flows + * have the same actions. * * @param[in] dev * Pointer to the rte dev structure. @@ -1878,7 +1899,7 @@ flow_drv_apply(struct rte_eth_dev *dev, struct rte_flow *flow, * Flow driver remove API. This abstracts calling driver specific functions. * Parent flow (rte_flow) should have driver type (drv_type). It removes a flow * on device. All the resources of the flow should be freed by calling - * flow_dv_destroy(). + * flow_drv_destroy(). * * @param[in] dev * Pointer to Ethernet device. @@ -2009,8 +2030,6 @@ flow_list_create(struct rte_eth_dev *dev, struct mlx5_flows *list, { struct rte_flow *flow = NULL; struct mlx5_flow *dev_flow; - uint64_t action_flags = 0; - uint64_t item_flags = 0; const struct rte_flow_action_rss *rss; union { struct rte_flow_expand_rss buf; @@ -2053,16 +2072,10 @@ flow_list_create(struct rte_eth_dev *dev, struct mlx5_flows *list, } for (i = 0; i < buf->entries; ++i) { dev_flow = flow_drv_prepare(flow, attr, buf->entry[i].pattern, - actions, &item_flags, &action_flags, - error); + actions, error); if (!dev_flow) goto error; dev_flow->flow = flow; - dev_flow->layers = item_flags; - /* Store actions once as expanded flows have same actions. */ - if (i == 0) - flow->actions = action_flags; - assert(flow->actions == action_flags); LIST_INSERT_HEAD(&flow->dev_flows, dev_flow, next); ret = flow_drv_translate(dev, dev_flow, attr, buf->entry[i].pattern, @@ -2127,6 +2140,7 @@ flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list, */ if (dev->data->dev_started) flow_rxq_flags_trim(dev, flow); + rte_free(flow->fdir); rte_free(flow); } @@ -2444,7 +2458,7 @@ mlx5_flow_query(struct rte_eth_dev *dev, * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_fdir_filter_convert(struct rte_eth_dev *dev, +flow_fdir_filter_convert(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter, struct mlx5_fdir *attributes) { @@ -2616,6 +2630,69 @@ mlx5_fdir_filter_convert(struct rte_eth_dev *dev, return 0; } +#define FLOW_FDIR_CMP(f1, f2, fld) \ + memcmp(&(f1)->fld, &(f2)->fld, sizeof(f1->fld)) + +/** + * Compare two FDIR flows. If items and actions are identical, the two flows are + * regarded as same. + * + * @param dev + * Pointer to Ethernet device. + * @param f1 + * FDIR flow to compare. + * @param f2 + * FDIR flow to compare. + * + * @return + * Zero on match, 1 otherwise. + */ +static int +flow_fdir_cmp(const struct mlx5_fdir *f1, const struct mlx5_fdir *f2) +{ + if (FLOW_FDIR_CMP(f1, f2, attr) || + FLOW_FDIR_CMP(f1, f2, l2) || + FLOW_FDIR_CMP(f1, f2, l2_mask) || + FLOW_FDIR_CMP(f1, f2, l3) || + FLOW_FDIR_CMP(f1, f2, l3_mask) || + FLOW_FDIR_CMP(f1, f2, l4) || + FLOW_FDIR_CMP(f1, f2, l4_mask) || + FLOW_FDIR_CMP(f1, f2, actions[0])) + return 1; + if (f1->actions[0].type == RTE_FLOW_ACTION_TYPE_QUEUE && + FLOW_FDIR_CMP(f1, f2, queue)) + return 1; + return 0; +} + +/** + * Search device flow list to find out a matched FDIR flow. + * + * @param dev + * Pointer to Ethernet device. + * @param fdir_flow + * FDIR flow to lookup. + * + * @return + * Pointer of flow if found, NULL otherwise. + */ +static struct rte_flow * +flow_fdir_filter_lookup(struct rte_eth_dev *dev, struct mlx5_fdir *fdir_flow) +{ + struct priv *priv = dev->data->dev_private; + struct rte_flow *flow = NULL; + + assert(fdir_flow); + TAILQ_FOREACH(flow, &priv->flows, next) { + if (flow->fdir && !flow_fdir_cmp(flow->fdir, fdir_flow)) { + DRV_LOG(DEBUG, "port %u found FDIR flow %p", + dev->data->port_id, (void *)flow); + break; + } + } + return flow; +} + /** * Add new flow director filter and store it in list. * @@ -2628,32 +2705,38 @@ mlx5_fdir_filter_convert(struct rte_eth_dev *dev, * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_fdir_filter_add(struct rte_eth_dev *dev, +flow_fdir_filter_add(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter) { struct priv *priv = dev->data->dev_private; - struct mlx5_fdir attributes = { - .attr.group = 0, - .l2_mask = { - .dst.addr_bytes = "\x00\x00\x00\x00\x00\x00", - .src.addr_bytes = "\x00\x00\x00\x00\x00\x00", - .type = 0, - }, - }; - struct rte_flow_error error; + struct mlx5_fdir *fdir_flow; struct rte_flow *flow; int ret; - ret = mlx5_fdir_filter_convert(dev, fdir_filter, &attributes); + fdir_flow = rte_zmalloc(__func__, sizeof(*fdir_flow), 0); + if (!fdir_flow) { + rte_errno = ENOMEM; + return -rte_errno; + } + ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow); if (ret) - return ret; - flow = flow_list_create(dev, &priv->flows, &attributes.attr, - attributes.items, attributes.actions, &error); + goto error; + flow = flow_fdir_filter_lookup(dev, fdir_flow); if (flow) { - DRV_LOG(DEBUG, "port %u FDIR created %p", dev->data->port_id, - (void *)flow); - return 0; + rte_errno = EEXIST; + goto error; } + flow = flow_list_create(dev, &priv->flows, &fdir_flow->attr, + fdir_flow->items, fdir_flow->actions, NULL); + if (!flow) + goto error; + assert(!flow->fdir); + flow->fdir = fdir_flow; + DRV_LOG(DEBUG, "port %u created FDIR flow %p", + dev->data->port_id, (void *)flow); + return 0; +error: + rte_free(fdir_flow); return -rte_errno; } @@ -2669,12 +2752,28 @@ mlx5_fdir_filter_add(struct rte_eth_dev *dev, * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_fdir_filter_delete(struct rte_eth_dev *dev __rte_unused, - const struct rte_eth_fdir_filter *fdir_filter - __rte_unused) +flow_fdir_filter_delete(struct rte_eth_dev *dev, + const struct rte_eth_fdir_filter *fdir_filter) { - rte_errno = ENOTSUP; - return -rte_errno; + struct priv *priv = dev->data->dev_private; + struct rte_flow *flow; + struct mlx5_fdir fdir_flow = { + .attr.group = 0, + }; + int ret; + + ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow); + if (ret) + return -rte_errno; + flow = flow_fdir_filter_lookup(dev, &fdir_flow); + if (!flow) { + rte_errno = ENOENT; + return -rte_errno; + } + flow_list_destroy(dev, &priv->flows, flow); + DRV_LOG(DEBUG, "port %u deleted FDIR flow %p", + dev->data->port_id, (void *)flow); + return 0; } /** @@ -2689,15 +2788,15 @@ mlx5_fdir_filter_delete(struct rte_eth_dev *dev __rte_unused, * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_fdir_filter_update(struct rte_eth_dev *dev, +flow_fdir_filter_update(struct rte_eth_dev *dev, const struct rte_eth_fdir_filter *fdir_filter) { int ret; - ret = mlx5_fdir_filter_delete(dev, fdir_filter); + ret = flow_fdir_filter_delete(dev, fdir_filter); if (ret) return ret; - return mlx5_fdir_filter_add(dev, fdir_filter); + return flow_fdir_filter_add(dev, fdir_filter); } /** @@ -2707,7 +2806,7 @@ mlx5_fdir_filter_update(struct rte_eth_dev *dev, * Pointer to Ethernet device. */ static void -mlx5_fdir_filter_flush(struct rte_eth_dev *dev) +flow_fdir_filter_flush(struct rte_eth_dev *dev) { struct priv *priv = dev->data->dev_private; @@ -2723,7 +2822,7 @@ mlx5_fdir_filter_flush(struct rte_eth_dev *dev) * Resulting flow director information. */ static void -mlx5_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info) +flow_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info) { struct rte_eth_fdir_masks *mask = &dev->data->dev_conf.fdir_conf.mask; @@ -2753,7 +2852,7 @@ mlx5_fdir_info_get(struct rte_eth_dev *dev, struct rte_eth_fdir_info *fdir_info) * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -mlx5_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op, +flow_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op, void *arg) { enum rte_fdir_mode fdir_mode = @@ -2770,16 +2869,16 @@ mlx5_fdir_ctrl_func(struct rte_eth_dev *dev, enum rte_filter_op filter_op, } switch (filter_op) { case RTE_ETH_FILTER_ADD: - return mlx5_fdir_filter_add(dev, arg); + return flow_fdir_filter_add(dev, arg); case RTE_ETH_FILTER_UPDATE: - return mlx5_fdir_filter_update(dev, arg); + return flow_fdir_filter_update(dev, arg); case RTE_ETH_FILTER_DELETE: - return mlx5_fdir_filter_delete(dev, arg); + return flow_fdir_filter_delete(dev, arg); case RTE_ETH_FILTER_FLUSH: - mlx5_fdir_filter_flush(dev); + flow_fdir_filter_flush(dev); break; case RTE_ETH_FILTER_INFO: - mlx5_fdir_info_get(dev, arg); + flow_fdir_info_get(dev, arg); break; default: DRV_LOG(DEBUG, "port %u unknown operation %u", @@ -2820,7 +2919,7 @@ mlx5_dev_filter_ctrl(struct rte_eth_dev *dev, *(const void **)arg = &mlx5_flow_ops; return 0; case RTE_ETH_FILTER_FDIR: - return mlx5_fdir_ctrl_func(dev, filter_op, arg); + return flow_fdir_ctrl_func(dev, filter_op, arg); default: DRV_LOG(ERR, "port %u filter type (%d) not supported", dev->data->port_id, filter_type); diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h index 61299d66..51ab47fe 100644 --- a/drivers/net/mlx5/mlx5_flow.h +++ b/drivers/net/mlx5/mlx5_flow.h @@ -92,10 +92,24 @@ #define MLX5_FLOW_ACTION_DEC_TTL (1u << 19) #define MLX5_FLOW_ACTION_SET_MAC_SRC (1u << 20) #define MLX5_FLOW_ACTION_SET_MAC_DST (1u << 21) +#define MLX5_FLOW_ACTION_VXLAN_ENCAP (1u << 22) +#define MLX5_FLOW_ACTION_VXLAN_DECAP (1u << 23) +#define MLX5_FLOW_ACTION_NVGRE_ENCAP (1u << 24) +#define MLX5_FLOW_ACTION_NVGRE_DECAP (1u << 25) +#define MLX5_FLOW_ACTION_RAW_ENCAP (1u << 26) +#define MLX5_FLOW_ACTION_RAW_DECAP (1u << 27) #define MLX5_FLOW_FATE_ACTIONS \ (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | MLX5_FLOW_ACTION_RSS) +#define MLX5_FLOW_ENCAP_ACTIONS (MLX5_FLOW_ACTION_VXLAN_ENCAP | \ + MLX5_FLOW_ACTION_NVGRE_ENCAP | \ + MLX5_FLOW_ACTION_RAW_ENCAP) + +#define MLX5_FLOW_DECAP_ACTIONS (MLX5_FLOW_ACTION_VXLAN_DECAP | \ + MLX5_FLOW_ACTION_NVGRE_DECAP | \ + MLX5_FLOW_ACTION_RAW_DECAP) + #ifndef IPPROTO_MPLS #define IPPROTO_MPLS 137 #endif @@ -156,6 +170,7 @@ struct mlx5_flow_dv_match_params { }; #define MLX5_DV_MAX_NUMBER_OF_ACTIONS 8 +#define MLX5_ENCAP_MAX_LEN 132 /* Matcher structure. */ struct mlx5_flow_dv_matcher { @@ -169,6 +184,19 @@ struct mlx5_flow_dv_matcher { struct mlx5_flow_dv_match_params mask; /**< Matcher mask. */ }; +/* Encap/decap resource structure. */ +struct mlx5_flow_dv_encap_decap_resource { + LIST_ENTRY(mlx5_flow_dv_encap_decap_resource) next; + /* Pointer to next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + struct ibv_flow_action *verbs_action; + /**< Verbs encap/decap action object. */ + uint8_t buf[MLX5_ENCAP_MAX_LEN]; + size_t size; + uint8_t reformat_type; + uint8_t ft_type; +}; + /* DV flows structure. */ struct mlx5_flow_dv { uint64_t hash_fields; /**< Fields that participate in the hash. */ @@ -177,6 +205,8 @@ struct mlx5_flow_dv { struct mlx5_flow_dv_matcher *matcher; /**< Cache to matcher. */ struct mlx5_flow_dv_match_params value; /**< Holds the value that the packet is compared to. */ + struct mlx5_flow_dv_encap_decap_resource *encap_decap; + /**< Pointer to encap/decap resource in cache. */ struct ibv_flow *flow; /**< Installed flow. */ #ifdef HAVE_IBV_FLOW_DV_SUPPORT struct mlx5dv_flow_action_attr actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS]; @@ -189,6 +219,15 @@ struct mlx5_flow_dv { struct mlx5_flow_tcf { struct nlmsghdr *nlh; struct tcmsg *tcm; + union { /**< Tunnel encap/decap descriptor. */ + struct flow_tcf_tunnel_hdr *tunnel; + struct flow_tcf_vxlan_decap *vxlan_decap; + struct flow_tcf_vxlan_encap *vxlan_encap; + }; + uint32_t applied:1; /**< Whether rule is currently applied. */ +#ifndef NDEBUG + uint32_t nlsize; /**< Size of NL message buffer for debug check. */ +#endif }; /* Verbs specification header. */ @@ -253,7 +292,9 @@ struct rte_flow { /**< Device flows that are part of the flow. */ uint64_t actions; /**< Bit-fields of detected actions, see MLX5_FLOW_ACTION_*. */ + struct mlx5_fdir *fdir; /**< Pointer to associated FDIR if any. */ }; + typedef int (*mlx5_flow_validate_t)(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, const struct rte_flow_item items[], @@ -261,8 +302,7 @@ typedef int (*mlx5_flow_validate_t)(struct rte_eth_dev *dev, struct rte_flow_error *error); typedef struct mlx5_flow *(*mlx5_flow_prepare_t) (const struct rte_flow_attr *attr, const struct rte_flow_item items[], - const struct rte_flow_action actions[], uint64_t *item_flags, - uint64_t *action_flags, struct rte_flow_error *error); + const struct rte_flow_action actions[], struct rte_flow_error *error); typedef int (*mlx5_flow_translate_t)(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, const struct rte_flow_attr *attr, @@ -336,7 +376,7 @@ int mlx5_flow_validate_item_gre(const struct rte_flow_item *item, uint8_t target_protocol, struct rte_flow_error *error); int mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, - int64_t item_flags, + uint64_t item_flags, struct rte_flow_error *error); int mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, uint64_t item_flags, @@ -355,7 +395,7 @@ int mlx5_flow_validate_item_udp(const struct rte_flow_item *item, uint8_t target_protocol, struct rte_flow_error *error); int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, - int64_t item_flags, + uint64_t item_flags, struct rte_flow_error *error); int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, uint64_t item_flags, diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c index 8f729f44..79096153 100644 --- a/drivers/net/mlx5/mlx5_flow_dv.c +++ b/drivers/net/mlx5/mlx5_flow_dv.c @@ -25,6 +25,7 @@ #include <rte_flow_driver.h> #include <rte_malloc.h> #include <rte_ip.h> +#include <rte_gre.h> #include "mlx5.h" #include "mlx5_defs.h" @@ -96,6 +97,613 @@ flow_dv_validate_item_meta(struct rte_eth_dev *dev, } /** + * Validate the L2 encap action. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the encap action. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_l2_encap(uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + if (action_flags & MLX5_FLOW_ACTION_DROP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't drop and encap in same flow"); + if (action_flags & (MLX5_FLOW_ENCAP_ACTIONS | MLX5_FLOW_DECAP_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can only have a single encap or" + " decap action in a flow"); + if (attr->ingress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "encap action not supported for " + "ingress"); + return 0; +} + +/** + * Validate the L2 decap action. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_l2_decap(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (action_flags & MLX5_FLOW_ACTION_DROP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't drop and decap in same flow"); + if (action_flags & (MLX5_FLOW_ENCAP_ACTIONS | MLX5_FLOW_DECAP_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can only have a single encap or" + " decap action in a flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, + "decap action not supported for " + "egress"); + return 0; +} + +/** + * Validate the raw encap action. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the encap action. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_raw_encap(uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); + if (action_flags & MLX5_FLOW_ACTION_DROP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't drop and encap in same flow"); + if (action_flags & MLX5_FLOW_ENCAP_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can only have a single encap" + " action in a flow"); + /* encap without preceding decap is not supported for ingress */ + if (attr->ingress && !(action_flags & MLX5_FLOW_ACTION_RAW_DECAP)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "encap action not supported for " + "ingress"); + return 0; +} + +/** + * Validate the raw decap action. + * + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action + * Pointer to the encap action. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_action_raw_decap(uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (action_flags & MLX5_FLOW_ACTION_DROP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't drop and decap in same flow"); + if (action_flags & MLX5_FLOW_ENCAP_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have encap action before" + " decap action"); + if (action_flags & MLX5_FLOW_DECAP_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can only have a single decap" + " action in a flow"); + /* decap action is valid on egress only if it is followed by encap */ + if (attr->egress) { + for (; action->type != RTE_FLOW_ACTION_TYPE_END && + action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP; + action++) { + } + if (action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, "decap action not supported" + " for egress"); + } + return 0; +} + + +/** + * Find existing encap/decap resource or create and register a new one. + * + * @param dev[in, out] + * Pointer to rte_eth_dev structure. + * @param[in, out] resource + * Pointer to encap/decap resource. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. + */ +static int +flow_dv_encap_decap_resource_register + (struct rte_eth_dev *dev, + struct mlx5_flow_dv_encap_decap_resource *resource, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_dv_encap_decap_resource *cache_resource; + + /* Lookup a matching resource from cache. */ + LIST_FOREACH(cache_resource, &priv->encaps_decaps, next) { + if (resource->reformat_type == cache_resource->reformat_type && + resource->ft_type == cache_resource->ft_type && + resource->size == cache_resource->size && + !memcmp((const void *)resource->buf, + (const void *)cache_resource->buf, + resource->size)) { + DRV_LOG(DEBUG, "encap/decap resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + rte_atomic32_inc(&cache_resource->refcnt); + dev_flow->dv.encap_decap = cache_resource; + return 0; + } + } + /* Register new encap/decap resource. */ + cache_resource = rte_calloc(__func__, 1, sizeof(*cache_resource), 0); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + *cache_resource = *resource; + cache_resource->verbs_action = + mlx5_glue->dv_create_flow_action_packet_reformat + (priv->ctx, cache_resource->size, + (cache_resource->size ? cache_resource->buf : NULL), + cache_resource->reformat_type, + cache_resource->ft_type); + if (!cache_resource->verbs_action) { + rte_free(cache_resource); + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create action"); + } + rte_atomic32_init(&cache_resource->refcnt); + rte_atomic32_inc(&cache_resource->refcnt); + LIST_INSERT_HEAD(&priv->encaps_decaps, cache_resource, next); + dev_flow->dv.encap_decap = cache_resource; + DRV_LOG(DEBUG, "new encap/decap resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + return 0; +} + +/** + * Get the size of specific rte_flow_item_type + * + * @param[in] item_type + * Tested rte_flow_item_type. + * + * @return + * sizeof struct item_type, 0 if void or irrelevant. + */ +static size_t +flow_dv_get_item_len(const enum rte_flow_item_type item_type) +{ + size_t retval; + + switch (item_type) { + case RTE_FLOW_ITEM_TYPE_ETH: + retval = sizeof(struct rte_flow_item_eth); + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + retval = sizeof(struct rte_flow_item_vlan); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + retval = sizeof(struct rte_flow_item_ipv4); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + retval = sizeof(struct rte_flow_item_ipv6); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + retval = sizeof(struct rte_flow_item_udp); + break; + case RTE_FLOW_ITEM_TYPE_TCP: + retval = sizeof(struct rte_flow_item_tcp); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + retval = sizeof(struct rte_flow_item_vxlan); + break; + case RTE_FLOW_ITEM_TYPE_GRE: + retval = sizeof(struct rte_flow_item_gre); + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + retval = sizeof(struct rte_flow_item_nvgre); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + retval = sizeof(struct rte_flow_item_vxlan_gpe); + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + retval = sizeof(struct rte_flow_item_mpls); + break; + case RTE_FLOW_ITEM_TYPE_VOID: /* Fall through. */ + default: + retval = 0; + break; + } + return retval; +} + +#define MLX5_ENCAP_IPV4_VERSION 0x40 +#define MLX5_ENCAP_IPV4_IHL_MIN 0x05 +#define MLX5_ENCAP_IPV4_TTL_DEF 0x40 +#define MLX5_ENCAP_IPV6_VTC_FLOW 0x60000000 +#define MLX5_ENCAP_IPV6_HOP_LIMIT 0xff +#define MLX5_ENCAP_VXLAN_FLAGS 0x08000000 +#define MLX5_ENCAP_VXLAN_GPE_FLAGS 0x04 + +/** + * Convert the encap action data from list of rte_flow_item to raw buffer + * + * @param[in] items + * Pointer to rte_flow_item objects list. + * @param[out] buf + * Pointer to the output buffer. + * @param[out] size + * Pointer to the output buffer size. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_convert_encap_data(const struct rte_flow_item *items, uint8_t *buf, + size_t *size, struct rte_flow_error *error) +{ + struct ether_hdr *eth = NULL; + struct vlan_hdr *vlan = NULL; + struct ipv4_hdr *ipv4 = NULL; + struct ipv6_hdr *ipv6 = NULL; + struct udp_hdr *udp = NULL; + struct vxlan_hdr *vxlan = NULL; + struct vxlan_gpe_hdr *vxlan_gpe = NULL; + struct gre_hdr *gre = NULL; + size_t len; + size_t temp_size = 0; + + if (!items) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "invalid empty data"); + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + len = flow_dv_get_item_len(items->type); + if (len + temp_size > MLX5_ENCAP_MAX_LEN) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "items total size is too big" + " for encap action"); + rte_memcpy((void *)&buf[temp_size], items->spec, len); + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_ETH: + eth = (struct ether_hdr *)&buf[temp_size]; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + vlan = (struct vlan_hdr *)&buf[temp_size]; + if (!eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "eth header not found"); + if (!eth->ether_type) + eth->ether_type = RTE_BE16(ETHER_TYPE_VLAN); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ipv4 = (struct ipv4_hdr *)&buf[temp_size]; + if (!vlan && !eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "neither eth nor vlan" + " header found"); + if (vlan && !vlan->eth_proto) + vlan->eth_proto = RTE_BE16(ETHER_TYPE_IPv4); + else if (eth && !eth->ether_type) + eth->ether_type = RTE_BE16(ETHER_TYPE_IPv4); + if (!ipv4->version_ihl) + ipv4->version_ihl = MLX5_ENCAP_IPV4_VERSION | + MLX5_ENCAP_IPV4_IHL_MIN; + if (!ipv4->time_to_live) + ipv4->time_to_live = MLX5_ENCAP_IPV4_TTL_DEF; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ipv6 = (struct ipv6_hdr *)&buf[temp_size]; + if (!vlan && !eth) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "neither eth nor vlan" + " header found"); + if (vlan && !vlan->eth_proto) + vlan->eth_proto = RTE_BE16(ETHER_TYPE_IPv6); + else if (eth && !eth->ether_type) + eth->ether_type = RTE_BE16(ETHER_TYPE_IPv6); + if (!ipv6->vtc_flow) + ipv6->vtc_flow = + RTE_BE32(MLX5_ENCAP_IPV6_VTC_FLOW); + if (!ipv6->hop_limits) + ipv6->hop_limits = MLX5_ENCAP_IPV6_HOP_LIMIT; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + udp = (struct udp_hdr *)&buf[temp_size]; + if (!ipv4 && !ipv6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "ip header not found"); + if (ipv4 && !ipv4->next_proto_id) + ipv4->next_proto_id = IPPROTO_UDP; + else if (ipv6 && !ipv6->proto) + ipv6->proto = IPPROTO_UDP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + vxlan = (struct vxlan_hdr *)&buf[temp_size]; + if (!udp) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "udp header not found"); + if (!udp->dst_port) + udp->dst_port = RTE_BE16(MLX5_UDP_PORT_VXLAN); + if (!vxlan->vx_flags) + vxlan->vx_flags = + RTE_BE32(MLX5_ENCAP_VXLAN_FLAGS); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + vxlan_gpe = (struct vxlan_gpe_hdr *)&buf[temp_size]; + if (!udp) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "udp header not found"); + if (!vxlan_gpe->proto) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "next protocol not found"); + if (!udp->dst_port) + udp->dst_port = + RTE_BE16(MLX5_UDP_PORT_VXLAN_GPE); + if (!vxlan_gpe->vx_flags) + vxlan_gpe->vx_flags = + MLX5_ENCAP_VXLAN_GPE_FLAGS; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + case RTE_FLOW_ITEM_TYPE_NVGRE: + gre = (struct gre_hdr *)&buf[temp_size]; + if (!gre->proto) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "next protocol not found"); + if (!ipv4 && !ipv6) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "ip header not found"); + if (ipv4 && !ipv4->next_proto_id) + ipv4->next_proto_id = IPPROTO_GRE; + else if (ipv6 && !ipv6->proto) + ipv6->proto = IPPROTO_GRE; + break; + case RTE_FLOW_ITEM_TYPE_VOID: + break; + default: + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + (void *)items->type, + "unsupported item type"); + break; + } + temp_size += len; + } + *size = temp_size; + return 0; +} + +/** + * Convert L2 encap action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action + * Pointer to action structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_l2_encap(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + const struct rte_flow_item *encap_data; + const struct rte_flow_action_raw_encap *raw_encap_data; + struct mlx5_flow_dv_encap_decap_resource res = { + .reformat_type = + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL, + .ft_type = MLX5DV_FLOW_TABLE_TYPE_NIC_TX, + }; + + if (action->type == RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + raw_encap_data = + (const struct rte_flow_action_raw_encap *)action->conf; + res.size = raw_encap_data->size; + memcpy(res.buf, raw_encap_data->data, res.size); + } else { + if (action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP) + encap_data = + ((const struct rte_flow_action_vxlan_encap *) + action->conf)->definition; + else + encap_data = + ((const struct rte_flow_action_nvgre_encap *) + action->conf)->definition; + if (flow_dv_convert_encap_data(encap_data, res.buf, + &res.size, error)) + return -rte_errno; + } + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create L2 encap action"); + return 0; +} + +/** + * Convert L2 decap action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_l2_decap(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_dv_encap_decap_resource res = { + .size = 0, + .reformat_type = + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2, + .ft_type = MLX5DV_FLOW_TABLE_TYPE_NIC_RX, + }; + + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create L2 decap action"); + return 0; +} + +/** + * Convert raw decap/encap (L3 tunnel) action to DV specification. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in] action + * Pointer to action structure. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_create_action_raw_encap(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_raw_encap *encap_data; + struct mlx5_flow_dv_encap_decap_resource res; + + encap_data = (const struct rte_flow_action_raw_encap *)action->conf; + res.size = encap_data->size; + memcpy(res.buf, encap_data->data, res.size); + res.reformat_type = attr->egress ? + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL : + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + res.ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : + MLX5DV_FLOW_TABLE_TYPE_NIC_RX; + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "can't create encap action"); + return 0; +} + +/** * Verify the @p attributes will be correctly understood by the NIC and store * them in the @p flow if everything is correct. * @@ -339,6 +947,49 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, action_flags |= MLX5_FLOW_ACTION_COUNT; ++actions_n; break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + ret = flow_dv_validate_action_l2_encap(action_flags, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP ? + MLX5_FLOW_ACTION_VXLAN_ENCAP : + MLX5_FLOW_ACTION_NVGRE_ENCAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: + ret = flow_dv_validate_action_l2_decap(action_flags, + attr, error); + if (ret < 0) + return ret; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_VXLAN_DECAP ? + MLX5_FLOW_ACTION_VXLAN_DECAP : + MLX5_FLOW_ACTION_NVGRE_DECAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + ret = flow_dv_validate_action_raw_encap(action_flags, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_RAW_ENCAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + ret = flow_dv_validate_action_raw_decap(action_flags, + actions, attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_RAW_DECAP; + ++actions_n; + break; default: return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, @@ -363,10 +1014,6 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, * Pointer to the list of items. * @param[in] actions * Pointer to the list of actions. - * @param[out] item_flags - * Pointer to bit mask of all items detected. - * @param[out] action_flags - * Pointer to bit mask of all actions detected. * @param[out] error * Pointer to the error structure. * @@ -378,8 +1025,6 @@ static struct mlx5_flow * flow_dv_prepare(const struct rte_flow_attr *attr __rte_unused, const struct rte_flow_item items[] __rte_unused, const struct rte_flow_action actions[] __rte_unused, - uint64_t *item_flags __rte_unused, - uint64_t *action_flags __rte_unused, struct rte_flow_error *error) { uint32_t size = sizeof(struct mlx5_flow); @@ -951,161 +1596,6 @@ flow_dv_translate_item_meta(void *matcher, void *key, } } -/** - * Update the matcher and the value based the selected item. - * - * @param[in, out] matcher - * Flow matcher. - * @param[in, out] key - * Flow matcher value. - * @param[in] item - * Flow pattern to translate. - * @param[in, out] dev_flow - * Pointer to the mlx5_flow. - * @param[in] inner - * Item is inner pattern. - */ -static void -flow_dv_create_item(void *matcher, void *key, - const struct rte_flow_item *item, - struct mlx5_flow *dev_flow, - int inner) -{ - struct mlx5_flow_dv_matcher *tmatcher = matcher; - - switch (item->type) { - case RTE_FLOW_ITEM_TYPE_ETH: - flow_dv_translate_item_eth(tmatcher->mask.buf, key, item, - inner); - tmatcher->priority = MLX5_PRIORITY_MAP_L2; - break; - case RTE_FLOW_ITEM_TYPE_VLAN: - flow_dv_translate_item_vlan(tmatcher->mask.buf, key, item, - inner); - break; - case RTE_FLOW_ITEM_TYPE_IPV4: - flow_dv_translate_item_ipv4(tmatcher->mask.buf, key, item, - inner); - tmatcher->priority = MLX5_PRIORITY_MAP_L3; - dev_flow->dv.hash_fields |= - mlx5_flow_hashfields_adjust(dev_flow, inner, - MLX5_IPV4_LAYER_TYPES, - MLX5_IPV4_IBV_RX_HASH); - break; - case RTE_FLOW_ITEM_TYPE_IPV6: - flow_dv_translate_item_ipv6(tmatcher->mask.buf, key, item, - inner); - tmatcher->priority = MLX5_PRIORITY_MAP_L3; - dev_flow->dv.hash_fields |= - mlx5_flow_hashfields_adjust(dev_flow, inner, - MLX5_IPV6_LAYER_TYPES, - MLX5_IPV6_IBV_RX_HASH); - break; - case RTE_FLOW_ITEM_TYPE_TCP: - flow_dv_translate_item_tcp(tmatcher->mask.buf, key, item, - inner); - tmatcher->priority = MLX5_PRIORITY_MAP_L4; - dev_flow->dv.hash_fields |= - mlx5_flow_hashfields_adjust(dev_flow, inner, - ETH_RSS_TCP, - (IBV_RX_HASH_SRC_PORT_TCP | - IBV_RX_HASH_DST_PORT_TCP)); - break; - case RTE_FLOW_ITEM_TYPE_UDP: - flow_dv_translate_item_udp(tmatcher->mask.buf, key, item, - inner); - tmatcher->priority = MLX5_PRIORITY_MAP_L4; - dev_flow->verbs.hash_fields |= - mlx5_flow_hashfields_adjust(dev_flow, inner, - ETH_RSS_UDP, - (IBV_RX_HASH_SRC_PORT_UDP | - IBV_RX_HASH_DST_PORT_UDP)); - break; - case RTE_FLOW_ITEM_TYPE_GRE: - flow_dv_translate_item_gre(tmatcher->mask.buf, key, item, - inner); - break; - case RTE_FLOW_ITEM_TYPE_NVGRE: - flow_dv_translate_item_nvgre(tmatcher->mask.buf, key, item, - inner); - break; - case RTE_FLOW_ITEM_TYPE_VXLAN: - case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: - flow_dv_translate_item_vxlan(tmatcher->mask.buf, key, item, - inner); - break; - case RTE_FLOW_ITEM_TYPE_META: - flow_dv_translate_item_meta(tmatcher->mask.buf, key, item); - break; - default: - break; - } -} - -/** - * Store the requested actions in an array. - * - * @param[in] action - * Flow action to translate. - * @param[in, out] dev_flow - * Pointer to the mlx5_flow. - */ -static void -flow_dv_create_action(const struct rte_flow_action *action, - struct mlx5_flow *dev_flow) -{ - const struct rte_flow_action_queue *queue; - const struct rte_flow_action_rss *rss; - int actions_n = dev_flow->dv.actions_n; - struct rte_flow *flow = dev_flow->flow; - - switch (action->type) { - case RTE_FLOW_ACTION_TYPE_VOID: - break; - case RTE_FLOW_ACTION_TYPE_FLAG: - dev_flow->dv.actions[actions_n].type = MLX5DV_FLOW_ACTION_TAG; - dev_flow->dv.actions[actions_n].tag_value = - mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT); - actions_n++; - flow->actions |= MLX5_FLOW_ACTION_FLAG; - break; - case RTE_FLOW_ACTION_TYPE_MARK: - dev_flow->dv.actions[actions_n].type = MLX5DV_FLOW_ACTION_TAG; - dev_flow->dv.actions[actions_n].tag_value = - mlx5_flow_mark_set - (((const struct rte_flow_action_mark *) - (action->conf))->id); - flow->actions |= MLX5_FLOW_ACTION_MARK; - actions_n++; - break; - case RTE_FLOW_ACTION_TYPE_DROP: - dev_flow->dv.actions[actions_n].type = MLX5DV_FLOW_ACTION_DROP; - flow->actions |= MLX5_FLOW_ACTION_DROP; - break; - case RTE_FLOW_ACTION_TYPE_QUEUE: - queue = action->conf; - flow->rss.queue_num = 1; - (*flow->queue)[0] = queue->index; - flow->actions |= MLX5_FLOW_ACTION_QUEUE; - break; - case RTE_FLOW_ACTION_TYPE_RSS: - rss = action->conf; - if (flow->queue) - memcpy((*flow->queue), rss->queue, - rss->queue_num * sizeof(uint16_t)); - flow->rss.queue_num = rss->queue_num; - memcpy(flow->key, rss->key, MLX5_RSS_HASH_KEY_LEN); - flow->rss.types = rss->types; - flow->rss.level = rss->level; - /* Added to array only in apply since we need the QP */ - flow->actions |= MLX5_FLOW_ACTION_RSS; - break; - default: - break; - } - dev_flow->dv.actions_n = actions_n; -} - static uint32_t matcher_zero[MLX5_ST_SZ_DW(fte_match_param)] = { 0 }; #define HEADER_IS_ZERO(match_criteria, headers) \ @@ -1203,10 +1693,12 @@ flow_dv_matcher_register(struct rte_eth_dev *dev, dv_attr.flags |= IBV_FLOW_ATTR_FLAGS_EGRESS; cache_matcher->matcher_object = mlx5_glue->dv_create_flow_matcher(priv->ctx, &dv_attr); - if (!cache_matcher->matcher_object) + if (!cache_matcher->matcher_object) { + rte_free(cache_matcher); return rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, "cannot create matcher"); + } rte_atomic32_inc(&cache_matcher->refcnt); LIST_INSERT_HEAD(&priv->matchers, cache_matcher, next); dev_flow->dv.matcher = cache_matcher; @@ -1217,7 +1709,6 @@ flow_dv_matcher_register(struct rte_eth_dev *dev, return 0; } - /** * Fill the flow with DV spec. * @@ -1242,37 +1733,264 @@ flow_dv_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, const struct rte_flow_attr *attr, const struct rte_flow_item items[], - const struct rte_flow_action actions[] __rte_unused, + const struct rte_flow_action actions[], struct rte_flow_error *error) { struct priv *priv = dev->data->dev_private; + struct rte_flow *flow = dev_flow->flow; + uint64_t item_flags = 0; + uint64_t action_flags = 0; uint64_t priority = attr->priority; struct mlx5_flow_dv_matcher matcher = { .mask = { .size = sizeof(matcher.mask.buf), }, }; - void *match_value = dev_flow->dv.value.buf; - int tunnel = 0; + int actions_n = 0; if (priority == MLX5_FLOW_PRIO_RSVD) priority = priv->config.flow_prio - 1; for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { - tunnel = !!(dev_flow->layers & MLX5_FLOW_LAYER_TUNNEL); - flow_dv_create_item(&matcher, match_value, items, dev_flow, - tunnel); + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + void *match_mask = matcher.mask.buf; + void *match_value = dev_flow->dv.value.buf; + + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_ETH: + flow_dv_translate_item_eth(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L2; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + flow_dv_translate_item_vlan(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L2; + item_flags |= tunnel ? (MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + flow_dv_translate_item_ipv4(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L3; + dev_flow->dv.hash_fields |= + mlx5_flow_hashfields_adjust + (dev_flow, tunnel, + MLX5_IPV4_LAYER_TYPES, + MLX5_IPV4_IBV_RX_HASH); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + flow_dv_translate_item_ipv6(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L3; + dev_flow->dv.hash_fields |= + mlx5_flow_hashfields_adjust + (dev_flow, tunnel, + MLX5_IPV6_LAYER_TYPES, + MLX5_IPV6_IBV_RX_HASH); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + flow_dv_translate_item_tcp(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L4; + dev_flow->dv.hash_fields |= + mlx5_flow_hashfields_adjust + (dev_flow, tunnel, ETH_RSS_TCP, + IBV_RX_HASH_SRC_PORT_TCP | + IBV_RX_HASH_DST_PORT_TCP); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + flow_dv_translate_item_udp(match_mask, match_value, + items, tunnel); + matcher.priority = MLX5_PRIORITY_MAP_L4; + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust + (dev_flow, tunnel, ETH_RSS_UDP, + IBV_RX_HASH_SRC_PORT_UDP | + IBV_RX_HASH_DST_PORT_UDP); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + flow_dv_translate_item_gre(match_mask, match_value, + items, tunnel); + item_flags |= MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + flow_dv_translate_item_nvgre(match_mask, match_value, + items, tunnel); + item_flags |= MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + flow_dv_translate_item_vxlan(match_mask, match_value, + items, tunnel); + item_flags |= MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + flow_dv_translate_item_vxlan(match_mask, match_value, + items, tunnel); + item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_META: + flow_dv_translate_item_meta(match_mask, match_value, + items); + item_flags |= MLX5_FLOW_ITEM_METADATA; + break; + default: + break; + } } + dev_flow->layers = item_flags; + /* Register matcher. */ matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf, - matcher.mask.size); - if (priority == MLX5_FLOW_PRIO_RSVD) - priority = priv->config.flow_prio - 1; + matcher.mask.size); matcher.priority = mlx5_flow_adjust_priority(dev, priority, matcher.priority); matcher.egress = attr->egress; if (flow_dv_matcher_register(dev, &matcher, dev_flow, error)) return -rte_errno; - for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) - flow_dv_create_action(actions, dev_flow); + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; + const struct rte_flow_action *action = actions; + const uint8_t *rss_key; + + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + dev_flow->dv.actions[actions_n].type = + MLX5DV_FLOW_ACTION_TAG; + dev_flow->dv.actions[actions_n].tag_value = + mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT); + actions_n++; + action_flags |= MLX5_FLOW_ACTION_FLAG; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + dev_flow->dv.actions[actions_n].type = + MLX5DV_FLOW_ACTION_TAG; + dev_flow->dv.actions[actions_n].tag_value = + mlx5_flow_mark_set + (((const struct rte_flow_action_mark *) + (actions->conf))->id); + actions_n++; + action_flags |= MLX5_FLOW_ACTION_MARK; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + dev_flow->dv.actions[actions_n].type = + MLX5DV_FLOW_ACTION_DROP; + action_flags |= MLX5_FLOW_ACTION_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + queue = actions->conf; + flow->rss.queue_num = 1; + (*flow->queue)[0] = queue->index; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + rss = actions->conf; + if (flow->queue) + memcpy((*flow->queue), rss->queue, + rss->queue_num * sizeof(uint16_t)); + flow->rss.queue_num = rss->queue_num; + /* NULL RSS key indicates default RSS key. */ + rss_key = !rss->key ? rss_hash_default_key : rss->key; + memcpy(flow->key, rss_key, MLX5_RSS_HASH_KEY_LEN); + /* RSS type 0 indicates default RSS type ETH_RSS_IP. */ + flow->rss.types = !rss->types ? ETH_RSS_IP : rss->types; + flow->rss.level = rss->level; + action_flags |= MLX5_FLOW_ACTION_RSS; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: + if (flow_dv_create_action_l2_encap(dev, actions, + dev_flow, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n].type = + MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION; + dev_flow->dv.actions[actions_n].action = + dev_flow->dv.encap_decap->verbs_action; + actions_n++; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP ? + MLX5_FLOW_ACTION_VXLAN_ENCAP : + MLX5_FLOW_ACTION_NVGRE_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: + if (flow_dv_create_action_l2_decap(dev, dev_flow, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n].type = + MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION; + dev_flow->dv.actions[actions_n].action = + dev_flow->dv.encap_decap->verbs_action; + actions_n++; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_VXLAN_DECAP ? + MLX5_FLOW_ACTION_VXLAN_DECAP : + MLX5_FLOW_ACTION_NVGRE_DECAP; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + /* Handle encap with preceding decap. */ + if (action_flags & MLX5_FLOW_ACTION_RAW_DECAP) { + if (flow_dv_create_action_raw_encap + (dev, actions, dev_flow, attr, error)) + return -rte_errno; + dev_flow->dv.actions[actions_n].type = + MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION; + dev_flow->dv.actions[actions_n].action = + dev_flow->dv.encap_decap->verbs_action; + } else { + /* Handle encap without preceding decap. */ + if (flow_dv_create_action_l2_encap(dev, actions, + dev_flow, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n].type = + MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION; + dev_flow->dv.actions[actions_n].action = + dev_flow->dv.encap_decap->verbs_action; + } + actions_n++; + action_flags |= MLX5_FLOW_ACTION_RAW_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + /* Check if this decap is followed by encap. */ + for (; action->type != RTE_FLOW_ACTION_TYPE_END && + action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP; + action++) { + } + /* Handle decap only if it isn't followed by encap. */ + if (action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + if (flow_dv_create_action_l2_decap(dev, + dev_flow, + error)) + return -rte_errno; + dev_flow->dv.actions[actions_n].type = + MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION; + dev_flow->dv.actions[actions_n].action = + dev_flow->dv.encap_decap->verbs_action; + actions_n++; + } + /* If decap is followed by encap, handle it at encap. */ + action_flags |= MLX5_FLOW_ACTION_RAW_DECAP; + break; + default: + break; + } + } + dev_flow->dv.actions_n = actions_n; + flow->actions = action_flags; return 0; } @@ -1403,6 +2121,37 @@ flow_dv_matcher_release(struct rte_eth_dev *dev, } /** + * Release an encap/decap resource. + * + * @param flow + * Pointer to mlx5_flow. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_encap_decap_resource_release(struct mlx5_flow *flow) +{ + struct mlx5_flow_dv_encap_decap_resource *cache_resource = + flow->dv.encap_decap; + + assert(cache_resource->verbs_action); + DRV_LOG(DEBUG, "encap/decap resource %p: refcnt %d--", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); + if (rte_atomic32_dec_and_test(&cache_resource->refcnt)) { + claim_zero(mlx5_glue->destroy_flow_action + (cache_resource->verbs_action)); + LIST_REMOVE(cache_resource, next); + rte_free(cache_resource); + DRV_LOG(DEBUG, "encap/decap resource %p: removed", + (void *)cache_resource); + return 0; + } + return 1; +} + +/** * Remove the flow from the NIC but keeps it in memory. * * @param[in] dev @@ -1457,6 +2206,8 @@ flow_dv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) LIST_REMOVE(dev_flow, next); if (dev_flow->dv.matcher) flow_dv_matcher_release(dev, dev_flow); + if (dev_flow->dv.encap_decap) + flow_dv_encap_decap_resource_release(dev_flow); rte_free(dev_flow); } } diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c index 719fb106..fb817b23 100644 --- a/drivers/net/mlx5/mlx5_flow_tcf.c +++ b/drivers/net/mlx5/mlx5_flow_tcf.c @@ -113,6 +113,39 @@ struct tc_pedit_sel { #endif /* HAVE_TC_ACT_VLAN */ +#ifdef HAVE_TC_ACT_TUNNEL_KEY + +#include <linux/tc_act/tc_tunnel_key.h> + +#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT +#define TCA_TUNNEL_KEY_ENC_DST_PORT 9 +#endif + +#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM +#define TCA_TUNNEL_KEY_NO_CSUM 10 +#endif + +#else /* HAVE_TC_ACT_TUNNEL_KEY */ + +#define TCA_ACT_TUNNEL_KEY 17 +#define TCA_TUNNEL_KEY_ACT_SET 1 +#define TCA_TUNNEL_KEY_ACT_RELEASE 2 +#define TCA_TUNNEL_KEY_PARMS 2 +#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3 +#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4 +#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5 +#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6 +#define TCA_TUNNEL_KEY_ENC_KEY_ID 7 +#define TCA_TUNNEL_KEY_ENC_DST_PORT 9 +#define TCA_TUNNEL_KEY_NO_CSUM 10 + +struct tc_tunnel_key { + tc_gen; + int t_action; +}; + +#endif /* HAVE_TC_ACT_TUNNEL_KEY */ + /* Normally found in linux/netlink.h. */ #ifndef NETLINK_CAP_ACK #define NETLINK_CAP_ACK 10 @@ -211,6 +244,45 @@ struct tc_pedit_sel { #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25 #endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID +#define TCA_FLOWER_KEY_ENC_KEY_ID 26 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC +#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK +#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST +#define TCA_FLOWER_KEY_ENC_IPV4_DST 29 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK +#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC +#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK +#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST +#define TCA_FLOWER_KEY_ENC_IPV6_DST 33 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK +#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT +#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK +#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT +#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK +#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46 +#endif #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS #define TCA_FLOWER_KEY_TCP_FLAGS 71 #endif @@ -241,6 +313,28 @@ struct tc_pedit_sel { #define TCA_ACT_MAX_PRIO 32 #endif +/** UDP port range of VXLAN devices created by driver. */ +#define MLX5_VXLAN_PORT_MIN 30000 +#define MLX5_VXLAN_PORT_MAX 60000 +#define MLX5_VXLAN_DEVICE_PFX "vmlx_" + +/** Tunnel action type, used for @p type in header structure. */ +enum flow_tcf_tunact_type { + FLOW_TCF_TUNACT_VXLAN_DECAP, + FLOW_TCF_TUNACT_VXLAN_ENCAP, +}; + +/** Flags used for @p mask in tunnel action encap descriptors. */ +#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0) +#define FLOW_TCF_ENCAP_ETH_DST (1u << 1) +#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2) +#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3) +#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4) +#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5) +#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6) +#define FLOW_TCF_ENCAP_UDP_DST (1u << 7) +#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8) + /** * Structure for holding netlink context. * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE. @@ -254,6 +348,100 @@ struct mlx5_flow_tcf_context { uint8_t *buf; /* Message buffer. */ }; +/** + * Neigh rule structure. The neigh rule is applied via Netlink to + * outer tunnel iface in order to provide destination MAC address + * for the VXLAN encapsultion. The neigh rule is implicitly related + * to the Flow itself and can be shared by multiple Flows. + */ +struct tcf_neigh_rule { + LIST_ENTRY(tcf_neigh_rule) next; + uint32_t refcnt; + struct ether_addr eth; + uint16_t mask; + union { + struct { + rte_be32_t dst; + } ipv4; + struct { + uint8_t dst[IPV6_ADDR_LEN]; + } ipv6; + }; +}; + +/** + * Local rule structure. The local rule is applied via Netlink to + * outer tunnel iface in order to provide local and peer IP addresses + * of the VXLAN tunnel for encapsulation. The local rule is implicitly + * related to the Flow itself and can be shared by multiple Flows. + */ +struct tcf_local_rule { + LIST_ENTRY(tcf_local_rule) next; + uint32_t refcnt; + uint16_t mask; + union { + struct { + rte_be32_t dst; + rte_be32_t src; + } ipv4; + struct { + uint8_t dst[IPV6_ADDR_LEN]; + uint8_t src[IPV6_ADDR_LEN]; + } ipv6; + }; +}; + +/** VXLAN virtual netdev. */ +struct tcf_vtep { + LIST_ENTRY(tcf_vtep) next; + LIST_HEAD(, tcf_neigh_rule) neigh; + LIST_HEAD(, tcf_local_rule) local; + uint32_t refcnt; + unsigned int ifindex; /**< Own interface index. */ + unsigned int ifouter; /**< Index of device attached to. */ + uint16_t port; + uint8_t created; +}; + +/** Tunnel descriptor header, common for all tunnel types. */ +struct flow_tcf_tunnel_hdr { + uint32_t type; /**< Tunnel action type. */ + struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */ + unsigned int ifindex_org; /**< Original dst/src interface */ + unsigned int *ifindex_ptr; /**< Interface ptr in message. */ +}; + +struct flow_tcf_vxlan_decap { + struct flow_tcf_tunnel_hdr hdr; + uint16_t udp_port; +}; + +struct flow_tcf_vxlan_encap { + struct flow_tcf_tunnel_hdr hdr; + uint32_t mask; + struct { + struct ether_addr dst; + struct ether_addr src; + } eth; + union { + struct { + rte_be32_t dst; + rte_be32_t src; + } ipv4; + struct { + uint8_t dst[IPV6_ADDR_LEN]; + uint8_t src[IPV6_ADDR_LEN]; + } ipv6; + }; +struct { + rte_be16_t src; + rte_be16_t dst; + } udp; + struct { + uint8_t vni[3]; + } vxlan; +}; + /** Structure used when extracting the values of a flow counters * from a netlink message. */ @@ -271,6 +459,7 @@ static const union { struct rte_flow_item_ipv6 ipv6; struct rte_flow_item_tcp tcp; struct rte_flow_item_udp udp; + struct rte_flow_item_vxlan vxlan; } flow_tcf_mask_empty; /** Supported masks for known item types. */ @@ -282,6 +471,7 @@ static const struct { struct rte_flow_item_ipv6 ipv6; struct rte_flow_item_tcp tcp; struct rte_flow_item_udp udp; + struct rte_flow_item_vxlan vxlan; } flow_tcf_mask_supported = { .port_id = { .id = 0xffffffff, @@ -319,6 +509,9 @@ static const struct { .src_port = RTE_BE16(0xffff), .dst_port = RTE_BE16(0xffff), }, + .vxlan = { + .vni = "\xff\xff\xff", + }, }; #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr)) @@ -337,7 +530,15 @@ struct flow_tcf_ptoi { /* Due to a limitation on driver/FW. */ #define MLX5_TCF_GROUP_ID_MAX 3 -#define MLX5_TCF_GROUP_PRIORITY_MAX 14 + +/* + * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel. + * Priority in rte_flow attribute starts from 0 and is added by 1 in + * translation. This is subject to be changed to determine the max priority + * based on trial-and-error like Verbs driver once the restriction is lifted or + * the range is extended. + */ +#define MLX5_TCF_GROUP_PRIORITY_MAX 15 #define MLX5_TCF_FATE_ACTIONS \ (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \ @@ -347,6 +548,9 @@ struct flow_tcf_ptoi { (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \ MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP) +#define MLX5_TCF_VXLAN_ACTIONS \ + (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP) + #define MLX5_TCF_PEDIT_ACTIONS \ (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \ MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \ @@ -895,19 +1099,13 @@ flow_tcf_validate_attributes(const struct rte_flow_attr *attr, "group ID larger than " RTE_STR(MLX5_TCF_GROUP_ID_MAX) " isn't supported"); - else if (attr->group > 0 && - attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX) + else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, attr, - "lowest priority level is " + "priority more than " RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX) - " when group is configured"); - else if (attr->priority > 0xfffe) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, - attr, - "lowest priority level is 0xfffe"); + " is not supported"); if (!attr->ingress) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, @@ -920,6 +1118,665 @@ flow_tcf_validate_attributes(const struct rte_flow_attr *attr, } /** + * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch. + * The routine checks the L2 fields to be used in encapsulation header. + * + * @param[in] item + * Pointer to the item structure. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + **/ +static int +flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item, + struct rte_flow_error *error) +{ + const struct rte_flow_item_eth *spec = item->spec; + const struct rte_flow_item_eth *mask = item->mask; + + if (!spec) { + /* + * Specification for L2 addresses can be empty + * because these ones are optional and not + * required directly by tc rule. Kernel tries + * to resolve these ones on its own + */ + return 0; + } + if (!mask) { + /* If mask is not specified use the default one. */ + mask = &rte_flow_item_eth_mask; + } + if (memcmp(&mask->dst, + &flow_tcf_mask_empty.eth.dst, + sizeof(flow_tcf_mask_empty.eth.dst))) { + if (memcmp(&mask->dst, + &rte_flow_item_eth_mask.dst, + sizeof(rte_flow_item_eth_mask.dst))) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"eth.dst\" field"); + } + if (memcmp(&mask->src, + &flow_tcf_mask_empty.eth.src, + sizeof(flow_tcf_mask_empty.eth.src))) { + if (memcmp(&mask->src, + &rte_flow_item_eth_mask.src, + sizeof(rte_flow_item_eth_mask.src))) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"eth.src\" field"); + } + if (mask->type != RTE_BE16(0x0000)) { + if (mask->type != RTE_BE16(0xffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"eth.type\" field"); + DRV_LOG(WARNING, + "outer ethernet type field" + " cannot be forced for vxlan" + " encapsulation, parameter ignored"); + } + return 0; +} + +/** + * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch. + * The routine checks the IPv4 fields to be used in encapsulation header. + * + * @param[in] item + * Pointer to the item structure. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + **/ +static int +flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 *mask = item->mask; + + if (!spec) { + /* + * Specification for IP addresses cannot be empty + * because it is required by tunnel_key parameter. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "NULL outer ipv4 address" + " specification for vxlan" + " encapsulation"); + } + if (!mask) + mask = &rte_flow_item_ipv4_mask; + if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) { + if (mask->hdr.dst_addr != RTE_BE32(0xffffffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv4.hdr.dst_addr\" field" + " for vxlan encapsulation"); + /* More IPv4 address validations can be put here. */ + } else { + /* + * Kernel uses the destination IP address to determine + * the routing path and obtain the MAC destination + * address, so IP destination address must be + * specified in the tc rule. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "outer ipv4 destination address" + " must be specified for" + " vxlan encapsulation"); + } + if (mask->hdr.src_addr != RTE_BE32(0x00000000)) { + if (mask->hdr.src_addr != RTE_BE32(0xffffffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv4.hdr.src_addr\" field" + " for vxlan encapsulation"); + /* More IPv4 address validations can be put here. */ + } else { + /* + * Kernel uses the source IP address to select the + * interface for egress encapsulated traffic, so + * it must be specified in the tc rule. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "outer ipv4 source address" + " must be specified for" + " vxlan encapsulation"); + } + return 0; +} + +/** + * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch. + * The routine checks the IPv6 fields to be used in encapsulation header. + * + * @param[in] item + * Pointer to the item structure. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + **/ +static int +flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv6 *spec = item->spec; + const struct rte_flow_item_ipv6 *mask = item->mask; + + if (!spec) { + /* + * Specification for IP addresses cannot be empty + * because it is required by tunnel_key parameter. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "NULL outer ipv6 address" + " specification for" + " vxlan encapsulation"); + } + if (!mask) + mask = &rte_flow_item_ipv6_mask; + if (memcmp(&mask->hdr.dst_addr, + &flow_tcf_mask_empty.ipv6.hdr.dst_addr, + IPV6_ADDR_LEN)) { + if (memcmp(&mask->hdr.dst_addr, + &rte_flow_item_ipv6_mask.hdr.dst_addr, + IPV6_ADDR_LEN)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv6.hdr.dst_addr\" field" + " for vxlan encapsulation"); + /* More IPv6 address validations can be put here. */ + } else { + /* + * Kernel uses the destination IP address to determine + * the routing path and obtain the MAC destination + * address (heigh or gate), so IP destination address + * must be specified within the tc rule. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "outer ipv6 destination address" + " must be specified for" + " vxlan encapsulation"); + } + if (memcmp(&mask->hdr.src_addr, + &flow_tcf_mask_empty.ipv6.hdr.src_addr, + IPV6_ADDR_LEN)) { + if (memcmp(&mask->hdr.src_addr, + &rte_flow_item_ipv6_mask.hdr.src_addr, + IPV6_ADDR_LEN)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv6.hdr.src_addr\" field" + " for vxlan encapsulation"); + /* More L3 address validation can be put here. */ + } else { + /* + * Kernel uses the source IP address to select the + * interface for egress encapsulated traffic, so + * it must be specified in the tc rule. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "outer L3 source address" + " must be specified for" + " vxlan encapsulation"); + } + return 0; +} + +/** + * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch. + * The routine checks the UDP fields to be used in encapsulation header. + * + * @param[in] item + * Pointer to the item structure. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + **/ +static int +flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item, + struct rte_flow_error *error) +{ + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = item->mask; + + if (!spec) { + /* + * Specification for UDP ports cannot be empty + * because it is required by tunnel_key parameter. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "NULL UDP port specification " + " for vxlan encapsulation"); + } + if (!mask) + mask = &rte_flow_item_udp_mask; + if (mask->hdr.dst_port != RTE_BE16(0x0000)) { + if (mask->hdr.dst_port != RTE_BE16(0xffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"udp.hdr.dst_port\" field" + " for vxlan encapsulation"); + if (!spec->hdr.dst_port) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "outer UDP remote port cannot be" + " 0 for vxlan encapsulation"); + } else { + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "outer UDP remote port" + " must be specified for" + " vxlan encapsulation"); + } + if (mask->hdr.src_port != RTE_BE16(0x0000)) { + if (mask->hdr.src_port != RTE_BE16(0xffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"udp.hdr.src_port\" field" + " for vxlan encapsulation"); + DRV_LOG(WARNING, + "outer UDP source port cannot be" + " forced for vxlan encapsulation," + " parameter ignored"); + } + return 0; +} + +/** + * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch. + * The routine checks the VNIP fields to be used in encapsulation header. + * + * @param[in] item + * Pointer to the item structure. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + **/ +static int +flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item, + struct rte_flow_error *error) +{ + const struct rte_flow_item_vxlan *spec = item->spec; + const struct rte_flow_item_vxlan *mask = item->mask; + + if (!spec) { + /* Outer VNI is required by tunnel_key parameter. */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "NULL VNI specification" + " for vxlan encapsulation"); + } + if (!mask) + mask = &rte_flow_item_vxlan_mask; + if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2]) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "outer VNI must be specified " + "for vxlan encapsulation"); + if (mask->vni[0] != 0xff || + mask->vni[1] != 0xff || + mask->vni[2] != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"vxlan.vni\" field"); + + if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2]) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "vxlan vni cannot be 0"); + return 0; +} + +/** + * Validate VXLAN_ENCAP action item list for E-Switch. + * The routine checks items to be used in encapsulation header. + * + * @param[in] action + * Pointer to the VXLAN_ENCAP action structure. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + **/ +static int +flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action, + struct rte_flow_error *error) +{ + const struct rte_flow_item *items; + int ret; + uint32_t item_flags = 0; + + if (!action->conf) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Missing vxlan tunnel" + " action configuration"); + items = ((const struct rte_flow_action_vxlan_encap *) + action->conf)->definition; + if (!items) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Missing vxlan tunnel" + " encapsulation parameters"); + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_validate_item_eth(items, item_flags, + error); + if (ret < 0) + return ret; + ret = flow_tcf_validate_vxlan_encap_eth(items, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L2; + break; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ret = mlx5_flow_validate_item_ipv4(items, item_flags, + error); + if (ret < 0) + return ret; + ret = flow_tcf_validate_vxlan_encap_ipv4(items, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ret = mlx5_flow_validate_item_ipv6(items, item_flags, + error); + if (ret < 0) + return ret; + ret = flow_tcf_validate_vxlan_encap_ipv6(items, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_validate_item_udp(items, item_flags, + 0xFF, error); + if (ret < 0) + return ret; + ret = flow_tcf_validate_vxlan_encap_udp(items, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + ret = mlx5_flow_validate_item_vxlan(items, + item_flags, error); + if (ret < 0) + return ret; + ret = flow_tcf_validate_vxlan_encap_vni(items, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_VXLAN; + break; + default: + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, items, + "vxlan encap item not supported"); + } + } + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "no outer IP layer found" + " for vxlan encapsulation"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "no outer UDP layer found" + " for vxlan encapsulation"); + if (!(item_flags & MLX5_FLOW_LAYER_VXLAN)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "no VXLAN VNI found" + " for vxlan encapsulation"); + return 0; +} + +/** + * Validate RTE_FLOW_ITEM_TYPE_IPV4 item if VXLAN_DECAP action + * is present in actions list. + * + * @param[in] ipv4 + * Outer IPv4 address item (if any, NULL otherwise). + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + **/ +static int +flow_tcf_validate_vxlan_decap_ipv4(const struct rte_flow_item *ipv4, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv4 *spec = ipv4->spec; + const struct rte_flow_item_ipv4 *mask = ipv4->mask; + + if (!spec) { + /* + * Specification for IP addresses cannot be empty + * because it is required as decap parameter. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, ipv4, + "NULL outer ipv4 address" + " specification for vxlan" + " for vxlan decapsulation"); + } + if (!mask) + mask = &rte_flow_item_ipv4_mask; + if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) { + if (mask->hdr.dst_addr != RTE_BE32(0xffffffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv4.hdr.dst_addr\" field"); + /* More IP address validations can be put here. */ + } else { + /* + * Kernel uses the destination IP address + * to determine the ingress network interface + * for traffic being decapsulated. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, ipv4, + "outer ipv4 destination address" + " must be specified for" + " vxlan decapsulation"); + } + /* Source IP address is optional for decap. */ + if (mask->hdr.src_addr != RTE_BE32(0x00000000) && + mask->hdr.src_addr != RTE_BE32(0xffffffff)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv4.hdr.src_addr\" field"); + return 0; +} + +/** + * Validate RTE_FLOW_ITEM_TYPE_IPV6 item if VXLAN_DECAP action + * is present in actions list. + * + * @param[in] ipv6 + * Outer IPv6 address item (if any, NULL otherwise). + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + **/ +static int +flow_tcf_validate_vxlan_decap_ipv6(const struct rte_flow_item *ipv6, + struct rte_flow_error *error) +{ + const struct rte_flow_item_ipv6 *spec = ipv6->spec; + const struct rte_flow_item_ipv6 *mask = ipv6->mask; + + if (!spec) { + /* + * Specification for IP addresses cannot be empty + * because it is required as decap parameter. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, ipv6, + "NULL outer ipv6 address" + " specification for vxlan" + " decapsulation"); + } + if (!mask) + mask = &rte_flow_item_ipv6_mask; + if (memcmp(&mask->hdr.dst_addr, + &flow_tcf_mask_empty.ipv6.hdr.dst_addr, + IPV6_ADDR_LEN)) { + if (memcmp(&mask->hdr.dst_addr, + &rte_flow_item_ipv6_mask.hdr.dst_addr, + IPV6_ADDR_LEN)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv6.hdr.dst_addr\" field"); + /* More IP address validations can be put here. */ + } else { + /* + * Kernel uses the destination IP address + * to determine the ingress network interface + * for traffic being decapsulated. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, ipv6, + "outer ipv6 destination address must be " + "specified for vxlan decapsulation"); + } + /* Source IP address is optional for decap. */ + if (memcmp(&mask->hdr.src_addr, + &flow_tcf_mask_empty.ipv6.hdr.src_addr, + IPV6_ADDR_LEN)) { + if (memcmp(&mask->hdr.src_addr, + &rte_flow_item_ipv6_mask.hdr.src_addr, + IPV6_ADDR_LEN)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv6.hdr.src_addr\" field"); + } + return 0; +} + +/** + * Validate RTE_FLOW_ITEM_TYPE_UDP item if VXLAN_DECAP action + * is present in actions list. + * + * @param[in] udp + * Outer UDP layer item (if any, NULL otherwise). + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + **/ +static int +flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp, + struct rte_flow_error *error) +{ + const struct rte_flow_item_udp *spec = udp->spec; + const struct rte_flow_item_udp *mask = udp->mask; + + if (!spec) + /* + * Specification for UDP ports cannot be empty + * because it is required as decap parameter. + */ + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, udp, + "NULL UDP port specification" + " for VXLAN decapsulation"); + if (!mask) + mask = &rte_flow_item_udp_mask; + if (mask->hdr.dst_port != RTE_BE16(0x0000)) { + if (mask->hdr.dst_port != RTE_BE16(0xffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"udp.hdr.dst_port\" field"); + if (!spec->hdr.dst_port) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, udp, + "zero decap local UDP port"); + } else { + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, udp, + "outer UDP destination port must be " + "specified for vxlan decapsulation"); + } + if (mask->hdr.src_port != RTE_BE16(0x0000)) { + if (mask->hdr.src_port != RTE_BE16(0xffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"udp.hdr.src_port\" field"); + DRV_LOG(WARNING, + "outer UDP local port cannot be " + "forced for VXLAN encapsulation, " + "parameter ignored"); + } + return 0; +} + +/** * Validate flow for E-Switch. * * @param[in] priv @@ -951,6 +1808,7 @@ flow_tcf_validate(struct rte_eth_dev *dev, const struct rte_flow_item_ipv6 *ipv6; const struct rte_flow_item_tcp *tcp; const struct rte_flow_item_udp *udp; + const struct rte_flow_item_vxlan *vxlan; } spec, mask; union { const struct rte_flow_action_port_id *port_id; @@ -960,6 +1818,7 @@ flow_tcf_validate(struct rte_eth_dev *dev, of_set_vlan_vid; const struct rte_flow_action_of_set_vlan_pcp * of_set_vlan_pcp; + const struct rte_flow_action_vxlan_encap *vxlan_encap; const struct rte_flow_action_set_ipv4 *set_ipv4; const struct rte_flow_action_set_ipv6 *set_ipv6; } conf; @@ -978,9 +1837,170 @@ flow_tcf_validate(struct rte_eth_dev *dev, ret = flow_tcf_validate_attributes(attr, error); if (ret < 0) return ret; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + unsigned int i; + uint64_t current_action_flag = 0; + + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_PORT_ID: + current_action_flag = MLX5_FLOW_ACTION_PORT_ID; + if (!actions->conf) + break; + conf.port_id = actions->conf; + if (conf.port_id->original) + i = 0; + else + for (i = 0; ptoi[i].ifindex; ++i) + if (ptoi[i].port_id == conf.port_id->id) + break; + if (!ptoi[i].ifindex) + return rte_flow_error_set + (error, ENODEV, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + conf.port_id, + "missing data to convert port ID to" + " ifindex"); + port_id_dev = &rte_eth_devices[conf.port_id->id]; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + current_action_flag = MLX5_FLOW_ACTION_JUMP; + if (!actions->conf) + break; + conf.jump = actions->conf; + if (attr->group >= conf.jump->group) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "can jump only to a group forward"); + break; + case RTE_FLOW_ACTION_TYPE_DROP: + current_action_flag = MLX5_FLOW_ACTION_DROP; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + break; + case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: + current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "vlan modify is not supported," + " set action must follow push action"); + current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: + if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "vlan modify is not supported," + " set action must follow push action"); + current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + ret = flow_tcf_validate_vxlan_encap(actions, error); + if (ret < 0) + return ret; + current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_TTL: + current_action_flag = MLX5_FLOW_ACTION_SET_TTL; + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + current_action_flag = MLX5_FLOW_ACTION_DEC_TTL; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) { + if (!actions->conf) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + actions, + "action configuration not set"); + } + if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) && + pedit_validated) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "set actions should be " + "listed successively"); + if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) && + (action_flags & MLX5_TCF_PEDIT_ACTIONS)) + pedit_validated = 1; + if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) && + (action_flags & MLX5_TCF_FATE_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "can't have multiple fate" + " actions"); + if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) && + (action_flags & MLX5_TCF_VXLAN_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "can't have multiple vxlan" + " actions"); + if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) && + (action_flags & MLX5_TCF_VLAN_ACTIONS)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "can't have vxlan and vlan" + " actions in the same rule"); + action_flags |= current_action_flag; + } for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { unsigned int i; + if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) && + items->type != RTE_FLOW_ITEM_TYPE_ETH) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "only L2 inner item" + " is supported"); switch (items->type) { case RTE_FLOW_ITEM_TYPE_VOID: break; @@ -1034,7 +2054,9 @@ flow_tcf_validate(struct rte_eth_dev *dev, error); if (ret < 0) return ret; - item_flags |= MLX5_FLOW_LAYER_OUTER_L2; + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? + MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; /* TODO: * Redundant check due to different supported mask. * Same for the rest of items. @@ -1112,6 +2134,12 @@ flow_tcf_validate(struct rte_eth_dev *dev, next_protocol = ((const struct rte_flow_item_ipv4 *) (items->spec))->hdr.next_proto_id; + if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { + ret = flow_tcf_validate_vxlan_decap_ipv4 + (items, error); + if (ret < 0) + return ret; + } break; case RTE_FLOW_ITEM_TYPE_IPV6: ret = mlx5_flow_validate_item_ipv6(items, item_flags, @@ -1139,6 +2167,12 @@ flow_tcf_validate(struct rte_eth_dev *dev, next_protocol = ((const struct rte_flow_item_ipv6 *) (items->spec))->hdr.proto; + if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { + ret = flow_tcf_validate_vxlan_decap_ipv6 + (items, error); + if (ret < 0) + return ret; + } break; case RTE_FLOW_ITEM_TYPE_UDP: ret = mlx5_flow_validate_item_udp(items, item_flags, @@ -1154,6 +2188,12 @@ flow_tcf_validate(struct rte_eth_dev *dev, error); if (!mask.udp) return -rte_errno; + if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { + ret = flow_tcf_validate_vxlan_decap_udp + (items, error); + if (ret < 0) + return ret; + } break; case RTE_FLOW_ITEM_TYPE_TCP: ret = mlx5_flow_validate_item_tcp @@ -1173,141 +2213,41 @@ flow_tcf_validate(struct rte_eth_dev *dev, if (!mask.tcp) return -rte_errno; break; - default: - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - NULL, "item not supported"); - } - } - for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { - unsigned int i; - uint64_t current_action_flag = 0; - - switch (actions->type) { - case RTE_FLOW_ACTION_TYPE_VOID: - break; - case RTE_FLOW_ACTION_TYPE_PORT_ID: - current_action_flag = MLX5_FLOW_ACTION_PORT_ID; - if (!actions->conf) - break; - conf.port_id = actions->conf; - if (conf.port_id->original) - i = 0; - else - for (i = 0; ptoi[i].ifindex; ++i) - if (ptoi[i].port_id == conf.port_id->id) - break; - if (!ptoi[i].ifindex) - return rte_flow_error_set - (error, ENODEV, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - conf.port_id, - "missing data to convert port ID to" - " ifindex"); - port_id_dev = &rte_eth_devices[conf.port_id->id]; - break; - case RTE_FLOW_ACTION_TYPE_JUMP: - current_action_flag = MLX5_FLOW_ACTION_JUMP; - if (!actions->conf) - break; - conf.jump = actions->conf; - if (attr->group >= conf.jump->group) + case RTE_FLOW_ITEM_TYPE_VXLAN: + if (!(action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)) return rte_flow_error_set (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "can jump only to a group forward"); - break; - case RTE_FLOW_ACTION_TYPE_DROP: - current_action_flag = MLX5_FLOW_ACTION_DROP; - break; - case RTE_FLOW_ACTION_TYPE_COUNT: - break; - case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: - current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN; - break; - case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: - current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN; - break; - case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: - if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) - return rte_flow_error_set - (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, actions, - "vlan modify is not supported," - " set action must follow push action"); - current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID; - break; - case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: - if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "vni pattern should be followed by" + " vxlan decapsulation action"); + ret = mlx5_flow_validate_item_vxlan(items, + item_flags, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_VXLAN; + mask.vxlan = flow_tcf_item_mask + (items, &rte_flow_item_vxlan_mask, + &flow_tcf_mask_supported.vxlan, + &flow_tcf_mask_empty.vxlan, + sizeof(flow_tcf_mask_supported.vxlan), error); + if (!mask.vxlan) + return -rte_errno; + if (mask.vxlan->vni[0] != 0xff || + mask.vxlan->vni[1] != 0xff || + mask.vxlan->vni[2] != 0xff) return rte_flow_error_set (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, actions, - "vlan modify is not supported," - " set action must follow push action"); - current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP; - break; - case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: - current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC; - break; - case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: - current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST; - break; - case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: - current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC; - break; - case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: - current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST; - break; - case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: - current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC; - break; - case RTE_FLOW_ACTION_TYPE_SET_TP_DST: - current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST; - break; - case RTE_FLOW_ACTION_TYPE_SET_TTL: - current_action_flag = MLX5_FLOW_ACTION_SET_TTL; - break; - case RTE_FLOW_ACTION_TYPE_DEC_TTL: - current_action_flag = MLX5_FLOW_ACTION_DEC_TTL; - break; - case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: - current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC; - break; - case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: - current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST; + RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.vxlan, + "no support for partial or " + "empty mask on \"vxlan.vni\" field"); break; default: return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "action not supported"); - } - if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) { - if (!actions->conf) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - actions, - "action configuration not set"); + RTE_FLOW_ERROR_TYPE_ITEM, + items, "item not supported"); } - if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) && - pedit_validated) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "set actions should be " - "listed successively"); - if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) && - (action_flags & MLX5_TCF_PEDIT_ACTIONS)) - pedit_validated = 1; - if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) && - (action_flags & MLX5_TCF_FATE_ACTIONS)) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "can't have multiple fate" - " actions"); - action_flags |= current_action_flag; } if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) && (action_flags & MLX5_FLOW_ACTION_DROP)) @@ -1375,6 +2315,12 @@ flow_tcf_validate(struct rte_eth_dev *dev, RTE_FLOW_ERROR_TYPE_ACTION, actions, "vlan actions are supported" " only with port_id action"); + if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) && + !(action_flags & MLX5_FLOW_ACTION_PORT_ID)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "vxlan actions are supported" + " only with port_id action"); if (!(action_flags & MLX5_TCF_FATE_ACTIONS)) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, actions, @@ -1398,28 +2344,47 @@ flow_tcf_validate(struct rte_eth_dev *dev, "no ethernet found in" " pattern"); } + if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { + if (!(item_flags & + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | + MLX5_FLOW_LAYER_OUTER_L3_IPV6))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no outer IP pattern found" + " for vxlan decap action"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no outer UDP pattern found" + " for vxlan decap action"); + if (!(item_flags & MLX5_FLOW_LAYER_VXLAN)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no VNI pattern found" + " for vxlan decap action"); + } return 0; } /** - * Calculate maximum size of memory for flow items of Linux TC flower and - * extract specified items. + * Calculate maximum size of memory for flow items of Linux TC flower. * + * @param[in] attr + * Pointer to the flow attributes. * @param[in] items * Pointer to the list of items. - * @param[out] item_flags - * Pointer to the detected items. * * @return * Maximum size of memory for items. */ static int -flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - uint64_t *item_flags) +flow_tcf_get_items_size(const struct rte_flow_attr *attr, + const struct rte_flow_item items[]) { int size = 0; - uint64_t flags = 0; size += SZ_NLATTR_STRZ_OF("flower") + SZ_NLATTR_NEST + /* TCA_OPTIONS. */ @@ -1436,7 +2401,6 @@ flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4; /* dst/src MAC addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L2; break; case RTE_FLOW_ITEM_TYPE_VLAN: size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ @@ -1444,33 +2408,31 @@ flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, /* VLAN Ether type. */ SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */ SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */ - flags |= MLX5_FLOW_LAYER_OUTER_VLAN; break; case RTE_FLOW_ITEM_TYPE_IPV4: size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint32_t) * 4; /* dst/src IP addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; break; case RTE_FLOW_ITEM_TYPE_IPV6: size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ - SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4; + SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4; /* dst/src IP addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; break; case RTE_FLOW_ITEM_TYPE_UDP: size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint16_t) * 4; /* dst/src port and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; break; case RTE_FLOW_ITEM_TYPE_TCP: size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint16_t) * 4; /* dst/src port and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + size += SZ_NLATTR_TYPE_OF(uint32_t); break; default: DRV_LOG(WARNING, @@ -1480,7 +2442,69 @@ flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, break; } } - *item_flags = flags; + return size; +} + +/** + * Calculate size of memory to store the VXLAN encapsultion + * related items in the Netlink message buffer. Items list + * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action. + * The item list should be validated. + * + * @param[in] action + * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object. + * List of pattern items to scan data from. + * + * @return + * The size the part of Netlink message buffer to store the + * VXLAN encapsulation item attributes. + */ +static int +flow_tcf_vxlan_encap_size(const struct rte_flow_action *action) +{ + const struct rte_flow_item *items; + int size = 0; + + assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP); + assert(action->conf); + + items = ((const struct rte_flow_action_vxlan_encap *) + action->conf)->definition; + assert(items); + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + /* This item does not require message buffer. */ + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2; + break; + case RTE_FLOW_ITEM_TYPE_UDP: { + const struct rte_flow_item_udp *udp = items->mask; + + size += SZ_NLATTR_TYPE_OF(uint16_t); + if (!udp || udp->hdr.src_port != RTE_BE16(0x0000)) + size += SZ_NLATTR_TYPE_OF(uint16_t); + break; + } + case RTE_FLOW_ITEM_TYPE_VXLAN: + size += SZ_NLATTR_TYPE_OF(uint32_t); + break; + default: + assert(false); + DRV_LOG(WARNING, + "unsupported item %p type %d," + " items must be validated" + " before flow creation", + (const void *)items, items->type); + return 0; + } + } return size; } @@ -1553,6 +2577,29 @@ action_of_vlan: SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */ SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */ break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + size += SZ_NLATTR_NEST + /* na_act_index. */ + SZ_NLATTR_STRZ_OF("tunnel_key") + + SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */ + SZ_NLATTR_TYPE_OF(uint8_t); + size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key); + size += flow_tcf_vxlan_encap_size(actions) + + RTE_ALIGN_CEIL /* preceding encap params. */ + (sizeof(struct flow_tcf_vxlan_encap), + MNL_ALIGNTO); + flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + size += SZ_NLATTR_NEST + /* na_act_index. */ + SZ_NLATTR_STRZ_OF("tunnel_key") + + SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */ + SZ_NLATTR_TYPE_OF(uint8_t); + size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key); + size += RTE_ALIGN_CEIL /* preceding decap params. */ + (sizeof(struct flow_tcf_vxlan_decap), + MNL_ALIGNTO); + flags |= MLX5_FLOW_ACTION_VXLAN_DECAP; + break; case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: @@ -1610,10 +2657,6 @@ flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle) * Pointer to the list of items. * @param[in] actions * Pointer to the list of actions. - * @param[out] item_flags - * Pointer to bit mask of all items detected. - * @param[out] action_flags - * Pointer to bit mask of all actions detected. * @param[out] error * Pointer to the error structure. * @@ -1625,18 +2668,21 @@ static struct mlx5_flow * flow_tcf_prepare(const struct rte_flow_attr *attr, const struct rte_flow_item items[], const struct rte_flow_action actions[], - uint64_t *item_flags, uint64_t *action_flags, struct rte_flow_error *error) { - size_t size = sizeof(struct mlx5_flow) + + size_t size = RTE_ALIGN_CEIL + (sizeof(struct mlx5_flow), + alignof(struct flow_tcf_tunnel_hdr)) + MNL_ALIGN(sizeof(struct nlmsghdr)) + MNL_ALIGN(sizeof(struct tcmsg)); struct mlx5_flow *dev_flow; + uint64_t action_flags = 0; struct nlmsghdr *nlh; struct tcmsg *tcm; + uint8_t *sp, *tun = NULL; - size += flow_tcf_get_items_and_size(attr, items, item_flags); - size += flow_tcf_get_actions_and_size(actions, action_flags); + size += flow_tcf_get_items_size(attr, items); + size += flow_tcf_get_actions_and_size(actions, &action_flags); dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO); if (!dev_flow) { rte_flow_error_set(error, ENOMEM, @@ -1644,14 +2690,52 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, "not enough memory to create E-Switch flow"); return NULL; } - nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1)); + sp = (uint8_t *)(dev_flow + 1); + if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) { + sp = RTE_PTR_ALIGN + (sp, alignof(struct flow_tcf_tunnel_hdr)); + tun = sp; + sp += RTE_ALIGN_CEIL + (sizeof(struct flow_tcf_vxlan_encap), + MNL_ALIGNTO); +#ifndef NDEBUG + size -= RTE_ALIGN_CEIL + (sizeof(struct flow_tcf_vxlan_encap), + MNL_ALIGNTO); +#endif + } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { + sp = RTE_PTR_ALIGN + (sp, alignof(struct flow_tcf_tunnel_hdr)); + tun = sp; + sp += RTE_ALIGN_CEIL + (sizeof(struct flow_tcf_vxlan_decap), + MNL_ALIGNTO); +#ifndef NDEBUG + size -= RTE_ALIGN_CEIL + (sizeof(struct flow_tcf_vxlan_decap), + MNL_ALIGNTO); +#endif + } else { + sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO); + } + nlh = mnl_nlmsg_put_header(sp); tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); *dev_flow = (struct mlx5_flow){ .tcf = (struct mlx5_flow_tcf){ +#ifndef NDEBUG + .nlsize = size - RTE_ALIGN_CEIL + (sizeof(struct mlx5_flow), + alignof(struct flow_tcf_tunnel_hdr)), +#endif + .tunnel = (struct flow_tcf_tunnel_hdr *)tun, .nlh = nlh, .tcm = tcm, }, }; + if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) + dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP; + else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) + dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP; /* * Generate a reasonably unique handle based on the address of the * target buffer. @@ -1702,6 +2786,241 @@ flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused, } /** + * Convert VXLAN VNI to 32-bit integer. + * + * @param[in] vni + * VXLAN VNI in 24-bit wire format. + * + * @return + * VXLAN VNI as a 32-bit integer value in network endian. + */ +static inline rte_be32_t +vxlan_vni_as_be32(const uint8_t vni[3]) +{ + union { + uint8_t vni[4]; + rte_be32_t dword; + } ret = { + .vni = { 0, vni[0], vni[1], vni[2] }, + }; + return ret.dword; +} + +/** + * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields + * in the encapsulation parameters structure. The item must be prevalidated, + * no any validation checks performed by function. + * + * @param[in] spec + * RTE_FLOW_ITEM_TYPE_ETH entry specification. + * @param[in] mask + * RTE_FLOW_ITEM_TYPE_ETH entry mask. + * @param[out] encap + * Structure to fill the gathered MAC address data. + */ +static void +flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec, + const struct rte_flow_item_eth *mask, + struct flow_tcf_vxlan_encap *encap) +{ + /* Item must be validated before. No redundant checks. */ + assert(spec); + if (!mask || !memcmp(&mask->dst, + &rte_flow_item_eth_mask.dst, + sizeof(rte_flow_item_eth_mask.dst))) { + /* + * Ethernet addresses are not supported by + * tc as tunnel_key parameters. Destination + * address is needed to form encap packet + * header and retrieved by kernel from + * implicit sources (ARP table, etc), + * address masks are not supported at all. + */ + encap->eth.dst = spec->dst; + encap->mask |= FLOW_TCF_ENCAP_ETH_DST; + } + if (!mask || !memcmp(&mask->src, + &rte_flow_item_eth_mask.src, + sizeof(rte_flow_item_eth_mask.src))) { + /* + * Ethernet addresses are not supported by + * tc as tunnel_key parameters. Source ethernet + * address is ignored anyway. + */ + encap->eth.src = spec->src; + encap->mask |= FLOW_TCF_ENCAP_ETH_SRC; + } +} + +/** + * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields + * in the encapsulation parameters structure. The item must be prevalidated, + * no any validation checks performed by function. + * + * @param[in] spec + * RTE_FLOW_ITEM_TYPE_IPV4 entry specification. + * @param[out] encap + * Structure to fill the gathered IPV4 address data. + */ +static void +flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec, + struct flow_tcf_vxlan_encap *encap) +{ + /* Item must be validated before. No redundant checks. */ + assert(spec); + encap->ipv4.dst = spec->hdr.dst_addr; + encap->ipv4.src = spec->hdr.src_addr; + encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC | + FLOW_TCF_ENCAP_IPV4_DST; +} + +/** + * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields + * in the encapsulation parameters structure. The item must be prevalidated, + * no any validation checks performed by function. + * + * @param[in] spec + * RTE_FLOW_ITEM_TYPE_IPV6 entry specification. + * @param[out] encap + * Structure to fill the gathered IPV6 address data. + */ +static void +flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec, + struct flow_tcf_vxlan_encap *encap) +{ + /* Item must be validated before. No redundant checks. */ + assert(spec); + memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN); + memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN); + encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC | + FLOW_TCF_ENCAP_IPV6_DST; +} + +/** + * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields + * in the encapsulation parameters structure. The item must be prevalidated, + * no any validation checks performed by function. + * + * @param[in] spec + * RTE_FLOW_ITEM_TYPE_UDP entry specification. + * @param[in] mask + * RTE_FLOW_ITEM_TYPE_UDP entry mask. + * @param[out] encap + * Structure to fill the gathered UDP port data. + */ +static void +flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec, + const struct rte_flow_item_udp *mask, + struct flow_tcf_vxlan_encap *encap) +{ + assert(spec); + encap->udp.dst = spec->hdr.dst_port; + encap->mask |= FLOW_TCF_ENCAP_UDP_DST; + if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) { + encap->udp.src = spec->hdr.src_port; + encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC; + } +} + +/** + * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields + * in the encapsulation parameters structure. The item must be prevalidated, + * no any validation checks performed by function. + * + * @param[in] spec + * RTE_FLOW_ITEM_TYPE_VXLAN entry specification. + * @param[out] encap + * Structure to fill the gathered VNI address data. + */ +static void +flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec, + struct flow_tcf_vxlan_encap *encap) +{ + /* Item must be validated before. Do not redundant checks. */ + assert(spec); + memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni)); + encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI; +} + +/** + * Populate consolidated encapsulation object from list of pattern items. + * + * Helper function to process configuration of action such as + * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be + * validated, there is no way to return an meaningful error. + * + * @param[in] action + * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object. + * List of pattern items to gather data from. + * @param[out] src + * Structure to fill gathered data. + */ +static void +flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action, + struct flow_tcf_vxlan_encap *encap) +{ + union { + const struct rte_flow_item_eth *eth; + const struct rte_flow_item_ipv4 *ipv4; + const struct rte_flow_item_ipv6 *ipv6; + const struct rte_flow_item_udp *udp; + const struct rte_flow_item_vxlan *vxlan; + } spec, mask; + const struct rte_flow_item *items; + + assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP); + assert(action->conf); + + items = ((const struct rte_flow_action_vxlan_encap *) + action->conf)->definition; + assert(items); + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + mask.eth = items->mask; + spec.eth = items->spec; + flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth, + encap); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + spec.ipv4 = items->spec; + flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + spec.ipv6 = items->spec; + flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + mask.udp = items->mask; + spec.udp = items->spec; + flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp, + encap); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + spec.vxlan = items->spec; + flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap); + break; + default: + assert(false); + DRV_LOG(WARNING, + "unsupported item %p type %d," + " items must be validated" + " before flow creation", + (const void *)items, items->type); + encap->mask = 0; + return; + } + } +} + +/** * Translate flow for Linux TC flower and construct Netlink message. * * @param[in] priv @@ -1735,6 +3054,7 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, const struct rte_flow_item_ipv6 *ipv6; const struct rte_flow_item_tcp *tcp; const struct rte_flow_item_udp *udp; + const struct rte_flow_item_vxlan *vxlan; } spec, mask; union { const struct rte_flow_action_port_id *port_id; @@ -1745,6 +3065,18 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, const struct rte_flow_action_of_set_vlan_pcp * of_set_vlan_pcp; } conf; + union { + struct flow_tcf_tunnel_hdr *hdr; + struct flow_tcf_vxlan_decap *vxlan; + } decap = { + .hdr = NULL, + }; + union { + struct flow_tcf_tunnel_hdr *hdr; + struct flow_tcf_vxlan_encap *vxlan; + } encap = { + .hdr = NULL, + }; struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)]; struct nlmsghdr *nlh = dev_flow->tcf.nlh; struct tcmsg *tcm = dev_flow->tcf.tcm; @@ -1762,6 +3094,20 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi, PTOI_TABLE_SZ_MAX(dev))); + if (dev_flow->tcf.tunnel) { + switch (dev_flow->tcf.tunnel->type) { + case FLOW_TCF_TUNACT_VXLAN_DECAP: + decap.vxlan = dev_flow->tcf.vxlan_decap; + break; + case FLOW_TCF_TUNACT_VXLAN_ENCAP: + encap.vxlan = dev_flow->tcf.vxlan_encap; + break; + /* New tunnel actions can be added here. */ + default: + assert(false); + break; + } + } nlh = dev_flow->tcf.nlh; tcm = dev_flow->tcf.tcm; /* Prepare API must have been called beforehand. */ @@ -1779,7 +3125,6 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group); mnl_attr_put_strz(nlh, TCA_KIND, "flower"); na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS); - mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW); for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { unsigned int i; @@ -1807,7 +3152,9 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, tcm->tcm_ifindex = ptoi[i].ifindex; break; case RTE_FLOW_ITEM_TYPE_ETH: - item_flags |= MLX5_FLOW_LAYER_OUTER_L2; + item_flags |= (item_flags & MLX5_FLOW_LAYER_VXLAN) ? + MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; mask.eth = flow_tcf_item_mask (items, &rte_flow_item_eth_mask, &flow_tcf_mask_supported.eth, @@ -1818,6 +3165,14 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, if (mask.eth == &flow_tcf_mask_empty.eth) break; spec.eth = items->spec; + if (decap.vxlan && + !(item_flags & MLX5_FLOW_LAYER_VXLAN)) { + DRV_LOG(WARNING, + "outer L2 addresses cannot be forced" + " for vxlan decapsulation, parameter" + " ignored"); + break; + } if (mask.eth->type) { mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE, spec.eth->type); @@ -1839,8 +3194,11 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, ETHER_ADDR_LEN, mask.eth->src.addr_bytes); } + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; case RTE_FLOW_ITEM_TYPE_VLAN: + assert(!encap.hdr); + assert(!decap.hdr); item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN; mask.vlan = flow_tcf_item_mask (items, &rte_flow_item_vlan_mask, @@ -1872,6 +3230,7 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, rte_be_to_cpu_16 (spec.vlan->tci & RTE_BE16(0x0fff))); + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; case RTE_FLOW_ITEM_TYPE_IPV4: item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; @@ -1882,36 +3241,53 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, sizeof(flow_tcf_mask_supported.ipv4), error); assert(mask.ipv4); - if (!eth_type_set || !vlan_eth_type_set) - mnl_attr_put_u16(nlh, + spec.ipv4 = items->spec; + if (!decap.vxlan) { + if (!eth_type_set && !vlan_eth_type_set) + mnl_attr_put_u16 + (nlh, vlan_present ? TCA_FLOWER_KEY_VLAN_ETH_TYPE : TCA_FLOWER_KEY_ETH_TYPE, RTE_BE16(ETH_P_IP)); - eth_type_set = 1; - vlan_eth_type_set = 1; - if (mask.ipv4 == &flow_tcf_mask_empty.ipv4) - break; - spec.ipv4 = items->spec; - if (mask.ipv4->hdr.next_proto_id) { - mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, - spec.ipv4->hdr.next_proto_id); - ip_proto_set = 1; + eth_type_set = 1; + vlan_eth_type_set = 1; + if (mask.ipv4 == &flow_tcf_mask_empty.ipv4) + break; + if (mask.ipv4->hdr.next_proto_id) { + mnl_attr_put_u8 + (nlh, TCA_FLOWER_KEY_IP_PROTO, + spec.ipv4->hdr.next_proto_id); + ip_proto_set = 1; + } + } else { + assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4); } if (mask.ipv4->hdr.src_addr) { - mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC, - spec.ipv4->hdr.src_addr); - mnl_attr_put_u32(nlh, - TCA_FLOWER_KEY_IPV4_SRC_MASK, - mask.ipv4->hdr.src_addr); + mnl_attr_put_u32 + (nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_IPV4_SRC : + TCA_FLOWER_KEY_IPV4_SRC, + spec.ipv4->hdr.src_addr); + mnl_attr_put_u32 + (nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK : + TCA_FLOWER_KEY_IPV4_SRC_MASK, + mask.ipv4->hdr.src_addr); } if (mask.ipv4->hdr.dst_addr) { - mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST, - spec.ipv4->hdr.dst_addr); - mnl_attr_put_u32(nlh, - TCA_FLOWER_KEY_IPV4_DST_MASK, - mask.ipv4->hdr.dst_addr); + mnl_attr_put_u32 + (nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_IPV4_DST : + TCA_FLOWER_KEY_IPV4_DST, + spec.ipv4->hdr.dst_addr); + mnl_attr_put_u32 + (nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK : + TCA_FLOWER_KEY_IPV4_DST_MASK, + mask.ipv4->hdr.dst_addr); } + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; case RTE_FLOW_ITEM_TYPE_IPV6: item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; @@ -1922,38 +3298,54 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, sizeof(flow_tcf_mask_supported.ipv6), error); assert(mask.ipv6); - if (!eth_type_set || !vlan_eth_type_set) - mnl_attr_put_u16(nlh, + spec.ipv6 = items->spec; + if (!decap.vxlan) { + if (!eth_type_set || !vlan_eth_type_set) { + mnl_attr_put_u16 + (nlh, vlan_present ? TCA_FLOWER_KEY_VLAN_ETH_TYPE : TCA_FLOWER_KEY_ETH_TYPE, RTE_BE16(ETH_P_IPV6)); - eth_type_set = 1; - vlan_eth_type_set = 1; - if (mask.ipv6 == &flow_tcf_mask_empty.ipv6) - break; - spec.ipv6 = items->spec; - if (mask.ipv6->hdr.proto) { - mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, - spec.ipv6->hdr.proto); - ip_proto_set = 1; + } + eth_type_set = 1; + vlan_eth_type_set = 1; + if (mask.ipv6 == &flow_tcf_mask_empty.ipv6) + break; + if (mask.ipv6->hdr.proto) { + mnl_attr_put_u8 + (nlh, TCA_FLOWER_KEY_IP_PROTO, + spec.ipv6->hdr.proto); + ip_proto_set = 1; + } + } else { + assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6); } if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) { - mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC, - sizeof(spec.ipv6->hdr.src_addr), + mnl_attr_put(nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_IPV6_SRC : + TCA_FLOWER_KEY_IPV6_SRC, + IPV6_ADDR_LEN, spec.ipv6->hdr.src_addr); - mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK, - sizeof(mask.ipv6->hdr.src_addr), + mnl_attr_put(nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK : + TCA_FLOWER_KEY_IPV6_SRC_MASK, + IPV6_ADDR_LEN, mask.ipv6->hdr.src_addr); } if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) { - mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST, - sizeof(spec.ipv6->hdr.dst_addr), + mnl_attr_put(nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_IPV6_DST : + TCA_FLOWER_KEY_IPV6_DST, + IPV6_ADDR_LEN, spec.ipv6->hdr.dst_addr); - mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK, - sizeof(mask.ipv6->hdr.dst_addr), + mnl_attr_put(nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK : + TCA_FLOWER_KEY_IPV6_DST_MASK, + IPV6_ADDR_LEN, mask.ipv6->hdr.dst_addr); } + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; case RTE_FLOW_ITEM_TYPE_UDP: item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; @@ -1964,26 +3356,45 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, sizeof(flow_tcf_mask_supported.udp), error); assert(mask.udp); - if (!ip_proto_set) - mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, - IPPROTO_UDP); - if (mask.udp == &flow_tcf_mask_empty.udp) - break; spec.udp = items->spec; + if (!decap.vxlan) { + if (!ip_proto_set) + mnl_attr_put_u8 + (nlh, TCA_FLOWER_KEY_IP_PROTO, + IPPROTO_UDP); + if (mask.udp == &flow_tcf_mask_empty.udp) + break; + } else { + assert(mask.udp != &flow_tcf_mask_empty.udp); + decap.vxlan->udp_port = + rte_be_to_cpu_16 + (spec.udp->hdr.dst_port); + } if (mask.udp->hdr.src_port) { - mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC, - spec.udp->hdr.src_port); - mnl_attr_put_u16(nlh, - TCA_FLOWER_KEY_UDP_SRC_MASK, - mask.udp->hdr.src_port); + mnl_attr_put_u16 + (nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT : + TCA_FLOWER_KEY_UDP_SRC, + spec.udp->hdr.src_port); + mnl_attr_put_u16 + (nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK : + TCA_FLOWER_KEY_UDP_SRC_MASK, + mask.udp->hdr.src_port); } if (mask.udp->hdr.dst_port) { - mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST, - spec.udp->hdr.dst_port); - mnl_attr_put_u16(nlh, - TCA_FLOWER_KEY_UDP_DST_MASK, - mask.udp->hdr.dst_port); + mnl_attr_put_u16 + (nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_UDP_DST_PORT : + TCA_FLOWER_KEY_UDP_DST, + spec.udp->hdr.dst_port); + mnl_attr_put_u16 + (nlh, decap.vxlan ? + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK : + TCA_FLOWER_KEY_UDP_DST_MASK, + mask.udp->hdr.dst_port); } + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; case RTE_FLOW_ITEM_TYPE_TCP: item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; @@ -2026,6 +3437,16 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, rte_cpu_to_be_16 (mask.tcp->hdr.tcp_flags)); } + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + assert(decap.vxlan); + item_flags |= MLX5_FLOW_LAYER_VXLAN; + spec.vxlan = items->spec; + mnl_attr_put_u32(nlh, + TCA_FLOWER_KEY_ENC_KEY_ID, + vxlan_vni_as_be32(spec.vxlan->vni)); + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; default: return rte_flow_error_set(error, ENOTSUP, @@ -2059,6 +3480,14 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred"); na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS); assert(na_act); + if (encap.hdr) { + assert(dev_flow->tcf.tunnel); + dev_flow->tcf.tunnel->ifindex_ptr = + &((struct tc_mirred *) + mnl_attr_get_payload + (mnl_nlmsg_get_payload_tail + (nlh)))->ifindex; + } mnl_attr_put(nlh, TCA_MIRRED_PARMS, sizeof(struct tc_mirred), &(struct tc_mirred){ @@ -2176,6 +3605,74 @@ override_na_vlan_priority: conf.of_set_vlan_pcp->vlan_pcp; } break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + assert(decap.vxlan); + assert(dev_flow->tcf.tunnel); + dev_flow->tcf.tunnel->ifindex_ptr = + (unsigned int *)&tcm->tcm_ifindex; + na_act_index = + mnl_attr_nest_start(nlh, na_act_index_cur++); + assert(na_act_index); + mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key"); + na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS); + assert(na_act); + mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS, + sizeof(struct tc_tunnel_key), + &(struct tc_tunnel_key){ + .action = TC_ACT_PIPE, + .t_action = TCA_TUNNEL_KEY_ACT_RELEASE, + }); + mnl_attr_nest_end(nlh, na_act); + mnl_attr_nest_end(nlh, na_act_index); + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + assert(encap.vxlan); + flow_tcf_vxlan_encap_parse(actions, encap.vxlan); + na_act_index = + mnl_attr_nest_start(nlh, na_act_index_cur++); + assert(na_act_index); + mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key"); + na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS); + assert(na_act); + mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS, + sizeof(struct tc_tunnel_key), + &(struct tc_tunnel_key){ + .action = TC_ACT_PIPE, + .t_action = TCA_TUNNEL_KEY_ACT_SET, + }); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST) + mnl_attr_put_u16(nlh, + TCA_TUNNEL_KEY_ENC_DST_PORT, + encap.vxlan->udp.dst); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC) + mnl_attr_put_u32(nlh, + TCA_TUNNEL_KEY_ENC_IPV4_SRC, + encap.vxlan->ipv4.src); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST) + mnl_attr_put_u32(nlh, + TCA_TUNNEL_KEY_ENC_IPV4_DST, + encap.vxlan->ipv4.dst); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC) + mnl_attr_put(nlh, + TCA_TUNNEL_KEY_ENC_IPV6_SRC, + sizeof(encap.vxlan->ipv6.src), + &encap.vxlan->ipv6.src); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST) + mnl_attr_put(nlh, + TCA_TUNNEL_KEY_ENC_IPV6_DST, + sizeof(encap.vxlan->ipv6.dst), + &encap.vxlan->ipv6.dst); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI) + mnl_attr_put_u32(nlh, + TCA_TUNNEL_KEY_ENC_KEY_ID, + vxlan_vni_as_be32 + (encap.vxlan->vxlan.vni)); + mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0); + mnl_attr_nest_end(nlh, na_act); + mnl_attr_nest_end(nlh, na_act_index); + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); + break; case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: @@ -2202,47 +3699,1381 @@ override_na_vlan_priority: assert(na_flower); assert(na_flower_act); mnl_attr_nest_end(nlh, na_flower_act); + mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ? + 0 : TCA_CLS_FLAGS_SKIP_SW); mnl_attr_nest_end(nlh, na_flower); + if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr) + dev_flow->tcf.tunnel->ifindex_org = + *dev_flow->tcf.tunnel->ifindex_ptr; + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); return 0; } /** * Send Netlink message with acknowledgment. * - * @param ctx + * @param tcf * Flow context to use. * @param nlh * Message to send. This function always raises the NLM_F_ACK flag before * sending. + * @param[in] msglen + * Message length. Message buffer may contain multiple commands and + * nlmsg_len field not always corresponds to actual message length. + * If 0 specified the nlmsg_len field in header is used as message length. + * @param[in] cb + * Callback handler for received message. + * @param[in] arg + * Context pointer for callback handler. * * @return * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -flow_tcf_nl_ack(struct mlx5_flow_tcf_context *ctx, struct nlmsghdr *nlh) +flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf, + struct nlmsghdr *nlh, + uint32_t msglen, + mnl_cb_t cb, void *arg) { - alignas(struct nlmsghdr) - uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) + - nlh->nlmsg_len - sizeof(*nlh)]; - uint32_t seq = ctx->seq++; - struct mnl_socket *nl = ctx->nl; - int ret; + unsigned int portid = mnl_socket_get_portid(tcf->nl); + uint32_t seq = tcf->seq++; + int err, ret; - nlh->nlmsg_flags |= NLM_F_ACK; + assert(tcf->nl); + assert(tcf->buf); + if (!seq) + /* seq 0 is reserved for kernel event-driven notifications. */ + seq = tcf->seq++; nlh->nlmsg_seq = seq; - ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len); - if (ret != -1) - ret = mnl_socket_recvfrom(nl, ans, sizeof(ans)); - if (ret != -1) - ret = mnl_cb_run - (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL); + if (!msglen) { + msglen = nlh->nlmsg_len; + nlh->nlmsg_flags |= NLM_F_ACK; + } + ret = mnl_socket_sendto(tcf->nl, nlh, msglen); + err = (ret <= 0) ? errno : 0; + nlh = (struct nlmsghdr *)(tcf->buf); + /* + * The following loop postpones non-fatal errors until multipart + * messages are complete. + */ if (ret > 0) + while (true) { + ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, + tcf->buf_size); + if (ret < 0) { + err = errno; + if (err != ENOSPC) + break; + } + if (!err) { + ret = mnl_cb_run(nlh, ret, seq, portid, + cb, arg); + if (ret < 0) { + err = errno; + break; + } + } + /* Will receive till end of multipart message */ + if (!(nlh->nlmsg_flags & NLM_F_MULTI) || + nlh->nlmsg_type == NLMSG_DONE) + break; + } + if (!err) return 0; - rte_errno = errno; - return -rte_errno; + rte_errno = err; + return -err; +} + +#define MNL_BUF_EXTRA_SPACE 16 +#define MNL_REQUEST_SIZE_MIN 256 +#define MNL_REQUEST_SIZE_MAX 2048 +#define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \ + MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX) + +/* Data structures used by flow_tcf_xxx_cb() routines. */ +struct tcf_nlcb_buf { + LIST_ENTRY(tcf_nlcb_buf) next; + uint32_t size; + alignas(struct nlmsghdr) + uint8_t msg[]; /**< Netlink message data. */ +}; + +struct tcf_nlcb_context { + unsigned int ifindex; /**< Base interface index. */ + uint32_t bufsize; + LIST_HEAD(, tcf_nlcb_buf) nlbuf; +}; + +/** + * Allocate space for netlink command in buffer list + * + * @param[in, out] ctx + * Pointer to callback context with command buffers list. + * @param[in] size + * Required size of data buffer to be allocated. + * + * @return + * Pointer to allocated memory, aligned as message header. + * NULL if some error occurred. + */ +static struct nlmsghdr * +flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size) +{ + struct tcf_nlcb_buf *buf; + struct nlmsghdr *nlh; + + size = NLMSG_ALIGN(size); + buf = LIST_FIRST(&ctx->nlbuf); + if (buf && (buf->size + size) <= ctx->bufsize) { + nlh = (struct nlmsghdr *)&buf->msg[buf->size]; + buf->size += size; + return nlh; + } + if (size > ctx->bufsize) { + DRV_LOG(WARNING, "netlink: too long command buffer requested"); + return NULL; + } + buf = rte_malloc(__func__, + ctx->bufsize + sizeof(struct tcf_nlcb_buf), + alignof(struct tcf_nlcb_buf)); + if (!buf) { + DRV_LOG(WARNING, "netlink: no memory for command buffer"); + return NULL; + } + LIST_INSERT_HEAD(&ctx->nlbuf, buf, next); + buf->size = size; + nlh = (struct nlmsghdr *)&buf->msg[0]; + return nlh; +} + +/** + * Set NLM_F_ACK flags in the last netlink command in buffer. + * Only last command in the buffer will be acked by system. + * + * @param[in, out] buf + * Pointer to buffer with netlink commands. + */ +static void +flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf) +{ + struct nlmsghdr *nlh; + uint32_t size = 0; + + assert(buf->size); + do { + nlh = (struct nlmsghdr *)&buf->msg[size]; + size += NLMSG_ALIGN(nlh->nlmsg_len); + if (size >= buf->size) { + nlh->nlmsg_flags |= NLM_F_ACK; + break; + } + } while (true); +} + +/** + * Send the buffers with prepared netlink commands. Scans the list and + * sends all found buffers. Buffers are sent and freed anyway in order + * to prevent memory leakage if some every message in received packet. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in, out] ctx + * Pointer to callback context with command buffers list. + * + * @return + * Zero value on success, negative errno value otherwise + * and rte_errno is set. + */ +static int +flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf, + struct tcf_nlcb_context *ctx) +{ + struct tcf_nlcb_buf *bc, *bn; + struct nlmsghdr *nlh; + int ret = 0; + + bc = LIST_FIRST(&ctx->nlbuf); + while (bc) { + int rc; + + bn = LIST_NEXT(bc, next); + if (bc->size) { + flow_tcf_setack_nlcmd(bc); + nlh = (struct nlmsghdr *)&bc->msg; + rc = flow_tcf_nl_ack(tcf, nlh, bc->size, NULL, NULL); + if (rc && !ret) + ret = rc; + } + rte_free(bc); + bc = bn; + } + LIST_INIT(&ctx->nlbuf); + return ret; +} + +/** + * Collect local IP address rules with scope link attribute on specified + * network device. This is callback routine called by libmnl mnl_cb_run() + * in loop for every message in received packet. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Opaque data pointer for this callback. + * + * @return + * A positive, nonzero value on success, negative errno value otherwise + * and rte_errno is set. + */ +static int +flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct tcf_nlcb_context *ctx = arg; + struct nlmsghdr *cmd; + struct ifaddrmsg *ifa; + struct nlattr *na; + struct nlattr *na_local = NULL; + struct nlattr *na_peer = NULL; + unsigned char family; + + if (nlh->nlmsg_type != RTM_NEWADDR) { + rte_errno = EINVAL; + return -rte_errno; + } + ifa = mnl_nlmsg_get_payload(nlh); + family = ifa->ifa_family; + if (ifa->ifa_index != ctx->ifindex || + ifa->ifa_scope != RT_SCOPE_LINK || + !(ifa->ifa_flags & IFA_F_PERMANENT) || + (family != AF_INET && family != AF_INET6)) + return 1; + mnl_attr_for_each(na, nlh, sizeof(*ifa)) { + switch (mnl_attr_get_type(na)) { + case IFA_LOCAL: + na_local = na; + break; + case IFA_ADDRESS: + na_peer = na; + break; + } + if (na_local && na_peer) + break; + } + if (!na_local || !na_peer) + return 1; + /* Local rule found with scope link, permanent and assigned peer. */ + cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) + + MNL_ALIGN(sizeof(struct ifaddrmsg)) + + (family == AF_INET6 + ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) + : 2 * SZ_NLATTR_TYPE_OF(uint32_t))); + if (!cmd) { + rte_errno = ENOMEM; + return -rte_errno; + } + cmd = mnl_nlmsg_put_header(cmd); + cmd->nlmsg_type = RTM_DELADDR; + cmd->nlmsg_flags = NLM_F_REQUEST; + ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa)); + ifa->ifa_flags = IFA_F_PERMANENT; + ifa->ifa_scope = RT_SCOPE_LINK; + ifa->ifa_index = ctx->ifindex; + if (family == AF_INET) { + ifa->ifa_family = AF_INET; + ifa->ifa_prefixlen = 32; + mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local)); + mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer)); + } else { + ifa->ifa_family = AF_INET6; + ifa->ifa_prefixlen = 128; + mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN, + mnl_attr_get_payload(na_local)); + mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN, + mnl_attr_get_payload(na_peer)); + } + return 1; +} + +/** + * Cleanup the local IP addresses on outer interface. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifindex + * Network inferface index to perform cleanup. + */ +static void +flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf, + unsigned int ifindex) +{ + struct nlmsghdr *nlh; + struct ifaddrmsg *ifa; + struct tcf_nlcb_context ctx = { + .ifindex = ifindex, + .bufsize = MNL_REQUEST_SIZE, + .nlbuf = LIST_HEAD_INITIALIZER(), + }; + int ret; + + assert(ifindex); + /* + * Seek and destroy leftovers of local IP addresses with + * matching properties "scope link". + */ + nlh = mnl_nlmsg_put_header(tcf->buf); + nlh->nlmsg_type = RTM_GETADDR; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa)); + ifa->ifa_family = AF_UNSPEC; + ifa->ifa_index = ifindex; + ifa->ifa_scope = RT_SCOPE_LINK; + ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: query device list error %d", ret); + ret = flow_tcf_send_nlcmd(tcf, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: device delete error %d", ret); +} + +/** + * Collect neigh permament rules on specified network device. + * This is callback routine called by libmnl mnl_cb_run() in loop for + * every message in received packet. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Opaque data pointer for this callback. + * + * @return + * A positive, nonzero value on success, negative errno value otherwise + * and rte_errno is set. + */ +static int +flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct tcf_nlcb_context *ctx = arg; + struct nlmsghdr *cmd; + struct ndmsg *ndm; + struct nlattr *na; + struct nlattr *na_ip = NULL; + struct nlattr *na_mac = NULL; + unsigned char family; + + if (nlh->nlmsg_type != RTM_NEWNEIGH) { + rte_errno = EINVAL; + return -rte_errno; + } + ndm = mnl_nlmsg_get_payload(nlh); + family = ndm->ndm_family; + if (ndm->ndm_ifindex != (int)ctx->ifindex || + !(ndm->ndm_state & NUD_PERMANENT) || + (family != AF_INET && family != AF_INET6)) + return 1; + mnl_attr_for_each(na, nlh, sizeof(*ndm)) { + switch (mnl_attr_get_type(na)) { + case NDA_DST: + na_ip = na; + break; + case NDA_LLADDR: + na_mac = na; + break; + } + if (na_mac && na_ip) + break; + } + if (!na_mac || !na_ip) + return 1; + /* Neigh rule with permenent attribute found. */ + cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) + + MNL_ALIGN(sizeof(struct ndmsg)) + + SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) + + (family == AF_INET6 + ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) + : SZ_NLATTR_TYPE_OF(uint32_t))); + if (!cmd) { + rte_errno = ENOMEM; + return -rte_errno; + } + cmd = mnl_nlmsg_put_header(cmd); + cmd->nlmsg_type = RTM_DELNEIGH; + cmd->nlmsg_flags = NLM_F_REQUEST; + ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm)); + ndm->ndm_ifindex = ctx->ifindex; + ndm->ndm_state = NUD_PERMANENT; + ndm->ndm_flags = 0; + ndm->ndm_type = 0; + if (family == AF_INET) { + ndm->ndm_family = AF_INET; + mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip)); + } else { + ndm->ndm_family = AF_INET6; + mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN, + mnl_attr_get_payload(na_ip)); + } + mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN, + mnl_attr_get_payload(na_mac)); + return 1; +} + +/** + * Cleanup the neigh rules on outer interface. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifindex + * Network inferface index to perform cleanup. + */ +static void +flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf, + unsigned int ifindex) +{ + struct nlmsghdr *nlh; + struct ndmsg *ndm; + struct tcf_nlcb_context ctx = { + .ifindex = ifindex, + .bufsize = MNL_REQUEST_SIZE, + .nlbuf = LIST_HEAD_INITIALIZER(), + }; + int ret; + + assert(ifindex); + /* Seek and destroy leftovers of neigh rules. */ + nlh = mnl_nlmsg_put_header(tcf->buf); + nlh->nlmsg_type = RTM_GETNEIGH; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm)); + ndm->ndm_family = AF_UNSPEC; + ndm->ndm_ifindex = ifindex; + ndm->ndm_state = NUD_PERMANENT; + ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: query device list error %d", ret); + ret = flow_tcf_send_nlcmd(tcf, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: device delete error %d", ret); +} + +/** + * Collect indices of VXLAN encap/decap interfaces associated with device. + * This is callback routine called by libmnl mnl_cb_run() in loop for + * every message in received packet. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Opaque data pointer for this callback. + * + * @return + * A positive, nonzero value on success, negative errno value otherwise + * and rte_errno is set. + */ +static int +flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct tcf_nlcb_context *ctx = arg; + struct nlmsghdr *cmd; + struct ifinfomsg *ifm; + struct nlattr *na; + struct nlattr *na_info = NULL; + struct nlattr *na_vxlan = NULL; + bool found = false; + unsigned int vxindex; + + if (nlh->nlmsg_type != RTM_NEWLINK) { + rte_errno = EINVAL; + return -rte_errno; + } + ifm = mnl_nlmsg_get_payload(nlh); + if (!ifm->ifi_index) { + rte_errno = EINVAL; + return -rte_errno; + } + mnl_attr_for_each(na, nlh, sizeof(*ifm)) + if (mnl_attr_get_type(na) == IFLA_LINKINFO) { + na_info = na; + break; + } + if (!na_info) + return 1; + mnl_attr_for_each_nested(na, na_info) { + switch (mnl_attr_get_type(na)) { + case IFLA_INFO_KIND: + if (!strncmp("vxlan", mnl_attr_get_str(na), + mnl_attr_get_len(na))) + found = true; + break; + case IFLA_INFO_DATA: + na_vxlan = na; + break; + } + if (found && na_vxlan) + break; + } + if (!found || !na_vxlan) + return 1; + found = false; + mnl_attr_for_each_nested(na, na_vxlan) { + if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK && + mnl_attr_get_u32(na) == ctx->ifindex) { + found = true; + break; + } + } + if (!found) + return 1; + /* Attached VXLAN device found, store the command to delete. */ + vxindex = ifm->ifi_index; + cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) + + MNL_ALIGN(sizeof(struct ifinfomsg))); + if (!nlh) { + rte_errno = ENOMEM; + return -rte_errno; + } + cmd = mnl_nlmsg_put_header(cmd); + cmd->nlmsg_type = RTM_DELLINK; + cmd->nlmsg_flags = NLM_F_REQUEST; + ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_index = vxindex; + return 1; +} + +/** + * Cleanup the outer interface. Removes all found vxlan devices + * attached to specified index, flushes the meigh and local IP + * datavase. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifindex + * Network inferface index to perform cleanup. + */ +static void +flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf, + unsigned int ifindex) +{ + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + struct tcf_nlcb_context ctx = { + .ifindex = ifindex, + .bufsize = MNL_REQUEST_SIZE, + .nlbuf = LIST_HEAD_INITIALIZER(), + }; + int ret; + + assert(ifindex); + /* + * Seek and destroy leftover VXLAN encap/decap interfaces with + * matching properties. + */ + nlh = mnl_nlmsg_put_header(tcf->buf); + nlh->nlmsg_type = RTM_GETLINK; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_vxlan_cb, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: query device list error %d", ret); + ret = flow_tcf_send_nlcmd(tcf, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: device delete error %d", ret); +} + +/** + * Emit Netlink message to add/remove local address to the outer device. + * The address being added is visible within the link only (scope link). + * + * Note that an implicit route is maintained by the kernel due to the + * presence of a peer address (IFA_ADDRESS). + * + * These rules are used for encapsultion only and allow to assign + * the outer tunnel source IP address. + * + * @param[in] tcf + * Libmnl socket context object. + * @param[in] encap + * Encapsulation properties (source address and its peer). + * @param[in] ifindex + * Network interface to apply rule. + * @param[in] enable + * Toggle between add and remove. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf, + const struct flow_tcf_vxlan_encap *encap, + unsigned int ifindex, + bool enable, + struct rte_flow_error *error) +{ + struct nlmsghdr *nlh; + struct ifaddrmsg *ifa; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)]; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR; + nlh->nlmsg_flags = + NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0); + nlh->nlmsg_seq = 0; + ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa)); + ifa->ifa_flags = IFA_F_PERMANENT; + ifa->ifa_scope = RT_SCOPE_LINK; + ifa->ifa_index = ifindex; + if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) { + ifa->ifa_family = AF_INET; + ifa->ifa_prefixlen = 32; + mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src); + if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) + mnl_attr_put_u32(nlh, IFA_ADDRESS, + encap->ipv4.dst); + } else { + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC); + ifa->ifa_family = AF_INET6; + ifa->ifa_prefixlen = 128; + mnl_attr_put(nlh, IFA_LOCAL, + sizeof(encap->ipv6.src), + &encap->ipv6.src); + if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST) + mnl_attr_put(nlh, IFA_ADDRESS, + sizeof(encap->ipv6.dst), + &encap->ipv6.dst); + } + if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL)) + return 0; + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: cannot complete IFA request" + " (ip addr add)"); +} + +/** + * Emit Netlink message to add/remove neighbor. + * + * @param[in] tcf + * Libmnl socket context object. + * @param[in] encap + * Encapsulation properties (destination address). + * @param[in] ifindex + * Network interface. + * @param[in] enable + * Toggle between add and remove. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf, + const struct flow_tcf_vxlan_encap *encap, + unsigned int ifindex, + bool enable, + struct rte_flow_error *error) +{ + struct nlmsghdr *nlh; + struct ndmsg *ndm; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)]; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH; + nlh->nlmsg_flags = + NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0); + nlh->nlmsg_seq = 0; + ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm)); + ndm->ndm_ifindex = ifindex; + ndm->ndm_state = NUD_PERMANENT; + ndm->ndm_flags = 0; + ndm->ndm_type = 0; + if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) { + ndm->ndm_family = AF_INET; + mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst); + } else { + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST); + ndm->ndm_family = AF_INET6; + mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst), + &encap->ipv6.dst); + } + if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable) + DRV_LOG(WARNING, + "outer ethernet source address cannot be " + "forced for VXLAN encapsulation"); + if (encap->mask & FLOW_TCF_ENCAP_ETH_DST) + mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst), + &encap->eth.dst); + if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL)) + return 0; + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: cannot complete ND request" + " (ip neigh)"); +} + +/** + * Manage the local IP addresses and their peers IP addresses on the + * outer interface for encapsulation purposes. The kernel searches the + * appropriate device for tunnel egress traffic using the outer source + * IP, this IP should be assigned to the outer network device, otherwise + * kernel rejects the rule. + * + * Adds or removes the addresses using the Netlink command like this: + * ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter> + * + * The addresses are local to the netdev ("scope link"), this reduces + * the risk of conflicts. Note that an implicit route is maintained by + * the kernel due to the presence of a peer address (IFA_ADDRESS). + * + * @param[in] tcf + * Libmnl socket context object. + * @param[in] vtep + * VTEP object, contains rule database and ifouter index. + * @param[in] dev_flow + * Flow object, contains the tunnel parameters (for encap only). + * @param[in] enable + * Toggle between add and remove. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf, + struct tcf_vtep *vtep, + struct mlx5_flow *dev_flow, + bool enable, + struct rte_flow_error *error) +{ + const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap; + struct tcf_local_rule *rule; + bool found = false; + int ret; + + assert(encap); + assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP); + if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) { + assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST); + LIST_FOREACH(rule, &vtep->local, next) { + if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC && + encap->ipv4.src == rule->ipv4.src && + encap->ipv4.dst == rule->ipv4.dst) { + found = true; + break; + } + } + } else { + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC); + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST); + LIST_FOREACH(rule, &vtep->local, next) { + if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC && + !memcmp(&encap->ipv6.src, &rule->ipv6.src, + sizeof(encap->ipv6.src)) && + !memcmp(&encap->ipv6.dst, &rule->ipv6.dst, + sizeof(encap->ipv6.dst))) { + found = true; + break; + } + } + } + if (found) { + if (enable) { + rule->refcnt++; + return 0; + } + if (!rule->refcnt || !--rule->refcnt) { + LIST_REMOVE(rule, next); + return flow_tcf_rule_local(tcf, encap, + vtep->ifouter, false, error); + } + return 0; + } + if (!enable) { + DRV_LOG(WARNING, "disabling not existing local rule"); + rte_flow_error_set(error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "disabling not existing local rule"); + return -ENOENT; + } + rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule), + alignof(struct tcf_local_rule)); + if (!rule) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for local rule"); + return -rte_errno; + } + *rule = (struct tcf_local_rule){.refcnt = 0, + .mask = 0, + }; + if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) { + rule->mask = FLOW_TCF_ENCAP_IPV4_SRC + | FLOW_TCF_ENCAP_IPV4_DST; + rule->ipv4.src = encap->ipv4.src; + rule->ipv4.dst = encap->ipv4.dst; + } else { + rule->mask = FLOW_TCF_ENCAP_IPV6_SRC + | FLOW_TCF_ENCAP_IPV6_DST; + memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN); + memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN); + } + ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error); + if (ret) { + rte_free(rule); + return ret; + } + rule->refcnt++; + LIST_INSERT_HEAD(&vtep->local, rule, next); + return 0; +} + +/** + * Manage the destination MAC/IP addresses neigh database, kernel uses + * this one to determine the destination MAC address within encapsulation + * header. Adds or removes the entries using the Netlink command like this: + * ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent + * + * @param[in] tcf + * Libmnl socket context object. + * @param[in] vtep + * VTEP object, contains rule database and ifouter index. + * @param[in] dev_flow + * Flow object, contains the tunnel parameters (for encap only). + * @param[in] enable + * Toggle between add and remove. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf, + struct tcf_vtep *vtep, + struct mlx5_flow *dev_flow, + bool enable, + struct rte_flow_error *error) +{ + const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap; + struct tcf_neigh_rule *rule; + bool found = false; + int ret; + + assert(encap); + assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP); + if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) { + assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC); + LIST_FOREACH(rule, &vtep->neigh, next) { + if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST && + encap->ipv4.dst == rule->ipv4.dst) { + found = true; + break; + } + } + } else { + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC); + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST); + LIST_FOREACH(rule, &vtep->neigh, next) { + if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST && + !memcmp(&encap->ipv6.dst, &rule->ipv6.dst, + sizeof(encap->ipv6.dst))) { + found = true; + break; + } + } + } + if (found) { + if (memcmp(&encap->eth.dst, &rule->eth, + sizeof(encap->eth.dst))) { + DRV_LOG(WARNING, "Destination MAC differs" + " in neigh rule"); + rte_flow_error_set(error, EEXIST, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "Different MAC address" + " neigh rule for the same" + " destination IP"); + return -EEXIST; + } + if (enable) { + rule->refcnt++; + return 0; + } + if (!rule->refcnt || !--rule->refcnt) { + LIST_REMOVE(rule, next); + return flow_tcf_rule_neigh(tcf, encap, + vtep->ifouter, + false, error); + } + return 0; + } + if (!enable) { + DRV_LOG(WARNING, "Disabling not existing neigh rule"); + rte_flow_error_set(error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for neigh rule"); + return -ENOENT; + } + rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule), + alignof(struct tcf_neigh_rule)); + if (!rule) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for neigh rule"); + return -rte_errno; + } + *rule = (struct tcf_neigh_rule){.refcnt = 0, + .mask = 0, + }; + if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) { + rule->mask = FLOW_TCF_ENCAP_IPV4_DST; + rule->ipv4.dst = encap->ipv4.dst; + } else { + rule->mask = FLOW_TCF_ENCAP_IPV6_DST; + memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN); + } + memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth)); + ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error); + if (ret) { + rte_free(rule); + return ret; + } + rule->refcnt++; + LIST_INSERT_HEAD(&vtep->neigh, rule, next); + return 0; +} + +/* VTEP device list is shared between PMD port instances. */ +static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER(); +static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER; + +/** + * Deletes VTEP network device. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] vtep + * Object represinting the network device to delete. Memory + * allocated for this object is freed by routine. + */ +static void +flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf, + struct tcf_vtep *vtep) +{ + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) + + MNL_BUF_EXTRA_SPACE]; + int ret; + + assert(!vtep->refcnt); + /* Delete only ifaces those we actually created. */ + if (vtep->created && vtep->ifindex) { + DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex); + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_DELLINK; + nlh->nlmsg_flags = NLM_F_REQUEST; + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_index = vtep->ifindex; + assert(sizeof(buf) >= nlh->nlmsg_len); + ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL); + if (ret) + DRV_LOG(WARNING, "netlink: error deleting vxlan" + " encap/decap ifindex %u", + ifm->ifi_index); + } + rte_free(vtep); +} + +/** + * Creates VTEP network device. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifouter + * Outer interface to attach new-created VXLAN device + * If zero the VXLAN device will not be attached to any device. + * These VTEPs are used for decapsulation and can be precreated + * and shared between processes. + * @param[in] port + * UDP port of created VTEP device. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * Pointer to created device structure on success, + * NULL otherwise and rte_errno is set. + */ +#ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA +static struct tcf_vtep* +flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf, + unsigned int ifouter, + uint16_t port, struct rte_flow_error *error) +{ + struct tcf_vtep *vtep; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24]; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) + + SZ_NLATTR_DATA_OF(sizeof(name)) + + SZ_NLATTR_NEST * 2 + + SZ_NLATTR_STRZ_OF("vxlan") + + SZ_NLATTR_DATA_OF(sizeof(uint32_t)) + + SZ_NLATTR_DATA_OF(sizeof(uint16_t)) + + SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 + + MNL_BUF_EXTRA_SPACE]; + struct nlattr *na_info; + struct nlattr *na_vxlan; + rte_be16_t vxlan_port = rte_cpu_to_be_16(port); + int ret; + + vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep)); + if (!vtep) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for VTEP"); + return NULL; + } + *vtep = (struct tcf_vtep){ + .port = port, + .local = LIST_HEAD_INITIALIZER(), + .neigh = LIST_HEAD_INITIALIZER(), + }; + memset(buf, 0, sizeof(buf)); + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_type = 0; + ifm->ifi_index = 0; + ifm->ifi_flags = IFF_UP; + ifm->ifi_change = 0xffffffff; + snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port); + mnl_attr_put_strz(nlh, IFLA_IFNAME, name); + na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO); + assert(na_info); + mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan"); + na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA); + if (ifouter) + mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter); + assert(na_vxlan); + mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1); + mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1); + mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0); + mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port); + mnl_attr_nest_end(nlh, na_vxlan); + mnl_attr_nest_end(nlh, na_info); + assert(sizeof(buf) >= nlh->nlmsg_len); + ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL); + if (ret) { + DRV_LOG(WARNING, + "netlink: VTEP %s create failure (%d)", + name, rte_errno); + if (rte_errno != EEXIST || ifouter) + /* + * Some unhandled error occurred or device is + * for encapsulation and cannot be shared. + */ + goto error; + } else { + /* + * Mark device we actually created. + * We should explicitly delete + * when we do not need it anymore. + */ + vtep->created = 1; + } + /* Try to get ifindex of created of pre-existing device. */ + ret = if_nametoindex(name); + if (!ret) { + DRV_LOG(WARNING, + "VTEP %s failed to get index (%d)", name, errno); + rte_flow_error_set + (error, -errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to retrieve VTEP ifindex"); + goto error; + } + vtep->ifindex = ret; + vtep->ifouter = ifouter; + memset(buf, 0, sizeof(buf)); + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags = NLM_F_REQUEST; + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_type = 0; + ifm->ifi_index = vtep->ifindex; + ifm->ifi_flags = IFF_UP; + ifm->ifi_change = IFF_UP; + ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL); + if (ret) { + rte_flow_error_set(error, -errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to set VTEP link up"); + DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)", + name, rte_errno); + goto clean; + } + ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error); + if (ret) { + DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno); + goto clean; + } + DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex); + vtep->refcnt = 1; + return vtep; +clean: + flow_tcf_vtep_delete(tcf, vtep); + return NULL; +error: + rte_free(vtep); + return NULL; +} +#else +static struct tcf_vtep* +flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused, + unsigned int ifouter __rte_unused, + uint16_t port __rte_unused, + struct rte_flow_error *error) +{ + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to create VTEP, " + "vxlan metadata are not supported by kernel"); + return NULL; +} +#endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */ + +/** + * Acquire target interface index for VXLAN tunneling decapsulation. + * In order to share the UDP port within the other interfaces the + * VXLAN device created as not attached to any interface (if created). + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] dev_flow + * Flow tcf object with tunnel structure pointer set. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * Interface descriptor pointer on success, + * NULL otherwise and rte_errno is set. + */ +static struct tcf_vtep* +flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct tcf_vtep *vtep; + uint16_t port = dev_flow->tcf.vxlan_decap->udp_port; + + LIST_FOREACH(vtep, &vtep_list_vxlan, next) { + if (vtep->port == port) + break; + } + if (vtep && vtep->ifouter) { + rte_flow_error_set(error, -errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "Failed to create decap VTEP with specified" + " UDP port, atatched device exists"); + return NULL; + } + if (vtep) { + /* Device exists, just increment the reference counter. */ + vtep->refcnt++; + assert(vtep->ifindex); + return vtep; + } + /* No decapsulation device exists, try to create the new one. */ + vtep = flow_tcf_vtep_create(tcf, 0, port, error); + if (vtep) + LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next); + return vtep; +} + +/** + * Aqcuire target interface index for VXLAN tunneling encapsulation. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifouter + * Network interface index to attach VXLAN encap device to. + * @param[in] dev_flow + * Flow tcf object with tunnel structure pointer set. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * Interface descriptor pointer on success, + * NULL otherwise and rte_errno is set. + */ +static struct tcf_vtep* +flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf, + unsigned int ifouter, + struct mlx5_flow *dev_flow __rte_unused, + struct rte_flow_error *error) +{ + static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1; + struct tcf_vtep *vtep; + int ret; + + assert(ifouter); + /* Look whether the attached VTEP for encap is created. */ + LIST_FOREACH(vtep, &vtep_list_vxlan, next) { + if (vtep->ifouter == ifouter) + break; + } + if (vtep) { + /* VTEP already exists, just increment the reference. */ + vtep->refcnt++; + } else { + uint16_t pcnt; + + /* Not found, we should create the new attached VTEP. */ + flow_tcf_encap_iface_cleanup(tcf, ifouter); + flow_tcf_encap_local_cleanup(tcf, ifouter); + flow_tcf_encap_neigh_cleanup(tcf, ifouter); + for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX + - MLX5_VXLAN_PORT_MIN); pcnt++) { + encap_port++; + /* Wraparound the UDP port index. */ + if (encap_port < MLX5_VXLAN_PORT_MIN || + encap_port > MLX5_VXLAN_PORT_MAX) + encap_port = MLX5_VXLAN_PORT_MIN; + /* Check whether UDP port is in already in use. */ + LIST_FOREACH(vtep, &vtep_list_vxlan, next) { + if (vtep->port == encap_port) + break; + } + if (vtep) { + /* Port is in use, try the next one. */ + vtep = NULL; + continue; + } + vtep = flow_tcf_vtep_create(tcf, ifouter, + encap_port, error); + if (vtep) { + LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next); + break; + } + if (rte_errno != EEXIST) + break; + } + if (!vtep) + return NULL; + } + assert(vtep->ifouter == ifouter); + assert(vtep->ifindex); + /* Create local ipaddr with peer to specify the outer IPs. */ + ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error); + if (!ret) { + /* Create neigh rule to specify outer destination MAC. */ + ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error); + if (ret) + flow_tcf_encap_local(tcf, vtep, + dev_flow, false, error); + } + if (ret) { + if (--vtep->refcnt == 0) + flow_tcf_vtep_delete(tcf, vtep); + return NULL; + } + return vtep; } /** + * Acquires target interface index for tunneling of any type. + * Creates the new VTEP if needed. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifouter + * Network interface index to attach VXLAN encap device to. + * @param[in] dev_flow + * Flow tcf object with tunnel structure pointer set. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * Interface descriptor pointer on success, + * NULL otherwise and rte_errno is set. + */ +static struct tcf_vtep* +flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf, + unsigned int ifouter, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct tcf_vtep *vtep = NULL; + + assert(dev_flow->tcf.tunnel); + pthread_mutex_lock(&vtep_list_mutex); + switch (dev_flow->tcf.tunnel->type) { + case FLOW_TCF_TUNACT_VXLAN_ENCAP: + vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter, + dev_flow, error); + break; + case FLOW_TCF_TUNACT_VXLAN_DECAP: + vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error); + break; + default: + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unsupported tunnel type"); + break; + } + pthread_mutex_unlock(&vtep_list_mutex); + return vtep; +} + +/** + * Release tunneling interface by ifindex. Decrements reference + * counter and actually removes the device if counter is zero. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] vtep + * VTEP device descriptor structure. + * @param[in] dev_flow + * Flow tcf object with tunnel structure pointer set. + */ +static void +flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf, + struct tcf_vtep *vtep, + struct mlx5_flow *dev_flow) +{ + assert(dev_flow->tcf.tunnel); + pthread_mutex_lock(&vtep_list_mutex); + switch (dev_flow->tcf.tunnel->type) { + case FLOW_TCF_TUNACT_VXLAN_DECAP: + break; + case FLOW_TCF_TUNACT_VXLAN_ENCAP: + /* Remove the encap ancillary rules first. */ + flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL); + flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL); + break; + default: + assert(false); + DRV_LOG(WARNING, "Unsupported tunnel type"); + break; + } + assert(vtep->refcnt); + if (--vtep->refcnt == 0) { + LIST_REMOVE(vtep, next); + flow_tcf_vtep_delete(tcf, vtep); + } + pthread_mutex_unlock(&vtep_list_mutex); +} + + +/** * Apply flow to E-Switch by sending Netlink message. * * @param[in] dev @@ -2267,11 +5098,35 @@ flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow, dev_flow = LIST_FIRST(&flow->dev_flows); /* E-Switch flow can't be expanded. */ assert(!LIST_NEXT(dev_flow, next)); + if (dev_flow->tcf.applied) + return 0; nlh = dev_flow->tcf.nlh; nlh->nlmsg_type = RTM_NEWTFILTER; nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; - if (!flow_tcf_nl_ack(ctx, nlh)) + if (dev_flow->tcf.tunnel) { + /* + * Replace the interface index, target for + * encapsulation, source for decapsulation. + */ + assert(!dev_flow->tcf.tunnel->vtep); + assert(dev_flow->tcf.tunnel->ifindex_ptr); + /* Acquire actual VTEP device when rule is being applied. */ + dev_flow->tcf.tunnel->vtep = + flow_tcf_vtep_acquire(ctx, + dev_flow->tcf.tunnel->ifindex_org, + dev_flow, error); + if (!dev_flow->tcf.tunnel->vtep) + return -rte_errno; + DRV_LOG(INFO, "Replace ifindex: %d->%d", + dev_flow->tcf.tunnel->vtep->ifindex, + dev_flow->tcf.tunnel->ifindex_org); + *dev_flow->tcf.tunnel->ifindex_ptr = + dev_flow->tcf.tunnel->vtep->ifindex; + } + if (!flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) { + dev_flow->tcf.applied = 1; return 0; + } return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, "netlink: failed to create TC flow rule"); @@ -2295,21 +5150,25 @@ flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow) if (!flow) return; - if (flow->counter) { - if (--flow->counter->ref_cnt == 0) { - rte_free(flow->counter); - flow->counter = NULL; - } - } dev_flow = LIST_FIRST(&flow->dev_flows); if (!dev_flow) return; /* E-Switch flow can't be expanded. */ assert(!LIST_NEXT(dev_flow, next)); - nlh = dev_flow->tcf.nlh; - nlh->nlmsg_type = RTM_DELTFILTER; - nlh->nlmsg_flags = NLM_F_REQUEST; - flow_tcf_nl_ack(ctx, nlh); + if (dev_flow->tcf.applied) { + nlh = dev_flow->tcf.nlh; + nlh->nlmsg_type = RTM_DELTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST; + flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL); + if (dev_flow->tcf.tunnel) { + assert(dev_flow->tcf.tunnel->vtep); + flow_tcf_vtep_release(ctx, + dev_flow->tcf.tunnel->vtep, + dev_flow); + dev_flow->tcf.tunnel->vtep = NULL; + } + dev_flow->tcf.applied = 0; + } } /** @@ -2328,6 +5187,12 @@ flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) if (!flow) return; flow_tcf_remove(dev, flow); + if (flow->counter) { + if (--flow->counter->ref_cnt == 0) { + rte_free(flow->counter); + flow->counter = NULL; + } + } dev_flow = LIST_FIRST(&flow->dev_flows); if (!dev_flow) return; @@ -2830,7 +5695,9 @@ mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, struct nlmsghdr *nlh; struct tcmsg *tcm; alignas(struct nlmsghdr) - uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)]; + uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) + + SZ_NLATTR_STRZ_OF("ingress") + + MNL_BUF_EXTRA_SPACE]; /* Destroy existing ingress qdisc and everything attached to it. */ nlh = mnl_nlmsg_put_header(buf); @@ -2841,8 +5708,9 @@ mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, tcm->tcm_ifindex = ifindex; tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); tcm->tcm_parent = TC_H_INGRESS; + assert(sizeof(buf) >= nlh->nlmsg_len); /* Ignore errors when qdisc is already absent. */ - if (flow_tcf_nl_ack(ctx, nlh) && + if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL) && rte_errno != EINVAL && rte_errno != ENOENT) return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, @@ -2858,7 +5726,8 @@ mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); tcm->tcm_parent = TC_H_INGRESS; mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress"); - if (flow_tcf_nl_ack(ctx, nlh)) + assert(sizeof(buf) >= nlh->nlmsg_len); + if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, "netlink: failed to create ingress" diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c index 81bc39f9..699cc88c 100644 --- a/drivers/net/mlx5/mlx5_flow_verbs.c +++ b/drivers/net/mlx5/mlx5_flow_verbs.c @@ -33,6 +33,9 @@ #include "mlx5_glue.h" #include "mlx5_flow.h" +#define VERBS_SPEC_INNER(item_flags) \ + (!!((item_flags) & MLX5_FLOW_LAYER_TUNNEL) ? IBV_FLOW_SPEC_INNER : 0) + /** * Create Verbs flow counter with Verbs library. * @@ -231,27 +234,26 @@ flow_verbs_counter_query(struct rte_eth_dev *dev __rte_unused, } /** - * Add a verbs item specification into @p flow. + * Add a verbs item specification into @p verbs. * - * @param[in, out] flow - * Pointer to flow structure. + * @param[out] verbs + * Pointer to verbs structure. * @param[in] src * Create specification. * @param[in] size * Size in bytes of the specification to copy. */ static void -flow_verbs_spec_add(struct mlx5_flow *flow, void *src, unsigned int size) +flow_verbs_spec_add(struct mlx5_flow_verbs *verbs, void *src, unsigned int size) { - struct mlx5_flow_verbs *verbs = &flow->verbs; + void *dst; - if (verbs->specs) { - void *dst; - - dst = (void *)(verbs->specs + verbs->size); - memcpy(dst, src, size); - ++verbs->attr->num_of_specs; - } + if (!verbs) + return; + assert(verbs->specs); + dst = (void *)(verbs->specs + verbs->size); + memcpy(dst, src, size); + ++verbs->attr->num_of_specs; verbs->size += size; } @@ -260,24 +262,23 @@ flow_verbs_spec_add(struct mlx5_flow *flow, void *src, unsigned int size) * the input is valid and that there is space to insert the requested item * into the flow. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. * @param[in] item_flags - * Bit field with all detected items. - * @param[in, out] dev_flow - * Pointer to dev_flow structure. + * Parsed item flags. */ static void -flow_verbs_translate_item_eth(const struct rte_flow_item *item, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_eth(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) { const struct rte_flow_item_eth *spec = item->spec; const struct rte_flow_item_eth *mask = item->mask; - const int tunnel = !!(*item_flags & MLX5_FLOW_LAYER_TUNNEL); const unsigned int size = sizeof(struct ibv_flow_spec_eth); struct ibv_flow_spec_eth eth = { - .type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .type = IBV_FLOW_SPEC_ETH | VERBS_SPEC_INNER(item_flags), .size = size, }; @@ -298,11 +299,8 @@ flow_verbs_translate_item_eth(const struct rte_flow_item *item, eth.val.src_mac[i] &= eth.mask.src_mac[i]; } eth.val.ether_type &= eth.mask.ether_type; - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; } - flow_verbs_spec_add(dev_flow, ð, size); - *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : - MLX5_FLOW_LAYER_OUTER_L2; + flow_verbs_spec_add(&dev_flow->verbs, ð, size); } /** @@ -344,24 +342,24 @@ flow_verbs_item_vlan_update(struct ibv_flow_attr *attr, * the input is valid and that there is space to insert the requested item * into the flow. * - * @param[in] item - * Item specification. - * @param[in, out] item_flags - * Bit mask that holds all detected items. * @param[in, out] dev_flow * Pointer to dev_flow structure. + * @param[in] item + * Item specification. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_vlan(const struct rte_flow_item *item, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_vlan(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) { const struct rte_flow_item_vlan *spec = item->spec; const struct rte_flow_item_vlan *mask = item->mask; unsigned int size = sizeof(struct ibv_flow_spec_eth); - const int tunnel = !!(*item_flags & MLX5_FLOW_LAYER_TUNNEL); + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); struct ibv_flow_spec_eth eth = { - .type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .type = IBV_FLOW_SPEC_ETH | VERBS_SPEC_INNER(item_flags), .size = size, }; const uint32_t l2m = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : @@ -377,16 +375,10 @@ flow_verbs_translate_item_vlan(const struct rte_flow_item *item, eth.mask.ether_type = mask->inner_type; eth.val.ether_type &= eth.mask.ether_type; } - if (!(*item_flags & l2m)) { - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; - flow_verbs_spec_add(dev_flow, ð, size); - } else { + if (!(item_flags & l2m)) + flow_verbs_spec_add(&dev_flow->verbs, ð, size); + else flow_verbs_item_vlan_update(dev_flow->verbs.attr, ð); - size = 0; /* Only an update is done in eth specification. */ - } - *item_flags |= tunnel ? - (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_VLAN) : - (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_VLAN); } /** @@ -394,32 +386,28 @@ flow_verbs_translate_item_vlan(const struct rte_flow_item *item, * the input is valid and that there is space to insert the requested item * into the flow. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. - * @param[in, out] item_flags - * Bit mask that marks all detected items. - * @param[in, out] dev_flow - * Pointer to sepacific flow structure. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_ipv4(const struct rte_flow_item *item, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_ipv4(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) { const struct rte_flow_item_ipv4 *spec = item->spec; const struct rte_flow_item_ipv4 *mask = item->mask; - const int tunnel = !!(*item_flags & MLX5_FLOW_LAYER_TUNNEL); unsigned int size = sizeof(struct ibv_flow_spec_ipv4_ext); struct ibv_flow_spec_ipv4_ext ipv4 = { - .type = IBV_FLOW_SPEC_IPV4_EXT | - (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .type = IBV_FLOW_SPEC_IPV4_EXT | VERBS_SPEC_INNER(item_flags), .size = size, }; if (!mask) mask = &rte_flow_item_ipv4_mask; - *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : - MLX5_FLOW_LAYER_OUTER_L3_IPV4; if (spec) { ipv4.val = (struct ibv_flow_ipv4_ext_filter){ .src_ip = spec->hdr.src_addr, @@ -439,12 +427,7 @@ flow_verbs_translate_item_ipv4(const struct rte_flow_item *item, ipv4.val.proto &= ipv4.mask.proto; ipv4.val.tos &= ipv4.mask.tos; } - dev_flow->verbs.hash_fields |= - mlx5_flow_hashfields_adjust(dev_flow, tunnel, - MLX5_IPV4_LAYER_TYPES, - MLX5_IPV4_IBV_RX_HASH); - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L3; - flow_verbs_spec_add(dev_flow, &ipv4, size); + flow_verbs_spec_add(&dev_flow->verbs, &ipv4, size); } /** @@ -452,31 +435,28 @@ flow_verbs_translate_item_ipv4(const struct rte_flow_item *item, * the input is valid and that there is space to insert the requested item * into the flow. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. - * @param[in, out] item_flags - * Bit mask that marks all detected items. - * @param[in, out] dev_flow - * Pointer to sepacific flow structure. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_ipv6(const struct rte_flow_item *item, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_ipv6(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags) { const struct rte_flow_item_ipv6 *spec = item->spec; const struct rte_flow_item_ipv6 *mask = item->mask; - const int tunnel = !!(dev_flow->layers & MLX5_FLOW_LAYER_TUNNEL); unsigned int size = sizeof(struct ibv_flow_spec_ipv6); struct ibv_flow_spec_ipv6 ipv6 = { - .type = IBV_FLOW_SPEC_IPV6 | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .type = IBV_FLOW_SPEC_IPV6 | VERBS_SPEC_INNER(item_flags), .size = size, }; if (!mask) mask = &rte_flow_item_ipv6_mask; - *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : - MLX5_FLOW_LAYER_OUTER_L3_IPV6; if (spec) { unsigned int i; uint32_t vtc_flow_val; @@ -516,12 +496,7 @@ flow_verbs_translate_item_ipv6(const struct rte_flow_item *item, ipv6.val.next_hdr &= ipv6.mask.next_hdr; ipv6.val.hop_limit &= ipv6.mask.hop_limit; } - dev_flow->verbs.hash_fields |= - mlx5_flow_hashfields_adjust(dev_flow, tunnel, - MLX5_IPV6_LAYER_TYPES, - MLX5_IPV6_IBV_RX_HASH); - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L3; - flow_verbs_spec_add(dev_flow, &ipv6, size); + flow_verbs_spec_add(&dev_flow->verbs, &ipv6, size); } /** @@ -529,46 +504,38 @@ flow_verbs_translate_item_ipv6(const struct rte_flow_item *item, * the input is valid and that there is space to insert the requested item * into the flow. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. - * @param[in, out] item_flags - * Bit mask that marks all detected items. - * @param[in, out] dev_flow - * Pointer to sepacific flow structure. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_udp(const struct rte_flow_item *item, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_tcp(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) { - const struct rte_flow_item_udp *spec = item->spec; - const struct rte_flow_item_udp *mask = item->mask; - const int tunnel = !!(*item_flags & MLX5_FLOW_LAYER_TUNNEL); + const struct rte_flow_item_tcp *spec = item->spec; + const struct rte_flow_item_tcp *mask = item->mask; unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); - struct ibv_flow_spec_tcp_udp udp = { - .type = IBV_FLOW_SPEC_UDP | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + struct ibv_flow_spec_tcp_udp tcp = { + .type = IBV_FLOW_SPEC_TCP | VERBS_SPEC_INNER(item_flags), .size = size, }; if (!mask) - mask = &rte_flow_item_udp_mask; - *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : - MLX5_FLOW_LAYER_OUTER_L4_UDP; + mask = &rte_flow_item_tcp_mask; if (spec) { - udp.val.dst_port = spec->hdr.dst_port; - udp.val.src_port = spec->hdr.src_port; - udp.mask.dst_port = mask->hdr.dst_port; - udp.mask.src_port = mask->hdr.src_port; + tcp.val.dst_port = spec->hdr.dst_port; + tcp.val.src_port = spec->hdr.src_port; + tcp.mask.dst_port = mask->hdr.dst_port; + tcp.mask.src_port = mask->hdr.src_port; /* Remove unwanted bits from values. */ - udp.val.src_port &= udp.mask.src_port; - udp.val.dst_port &= udp.mask.dst_port; + tcp.val.src_port &= tcp.mask.src_port; + tcp.val.dst_port &= tcp.mask.dst_port; } - dev_flow->verbs.hash_fields |= - mlx5_flow_hashfields_adjust(dev_flow, tunnel, ETH_RSS_UDP, - (IBV_RX_HASH_SRC_PORT_UDP | - IBV_RX_HASH_DST_PORT_UDP)); - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L4; - flow_verbs_spec_add(dev_flow, &udp, size); + flow_verbs_spec_add(&dev_flow->verbs, &tcp, size); } /** @@ -576,46 +543,38 @@ flow_verbs_translate_item_udp(const struct rte_flow_item *item, * the input is valid and that there is space to insert the requested item * into the flow. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. - * @param[in, out] item_flags - * Bit mask that marks all detected items. - * @param[in, out] dev_flow - * Pointer to sepacific flow structure. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_tcp(const struct rte_flow_item *item, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_udp(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) { - const struct rte_flow_item_tcp *spec = item->spec; - const struct rte_flow_item_tcp *mask = item->mask; - const int tunnel = !!(dev_flow->layers & MLX5_FLOW_LAYER_TUNNEL); + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = item->mask; unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); - struct ibv_flow_spec_tcp_udp tcp = { - .type = IBV_FLOW_SPEC_TCP | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + struct ibv_flow_spec_tcp_udp udp = { + .type = IBV_FLOW_SPEC_UDP | VERBS_SPEC_INNER(item_flags), .size = size, }; if (!mask) - mask = &rte_flow_item_tcp_mask; - *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : - MLX5_FLOW_LAYER_OUTER_L4_TCP; + mask = &rte_flow_item_udp_mask; if (spec) { - tcp.val.dst_port = spec->hdr.dst_port; - tcp.val.src_port = spec->hdr.src_port; - tcp.mask.dst_port = mask->hdr.dst_port; - tcp.mask.src_port = mask->hdr.src_port; + udp.val.dst_port = spec->hdr.dst_port; + udp.val.src_port = spec->hdr.src_port; + udp.mask.dst_port = mask->hdr.dst_port; + udp.mask.src_port = mask->hdr.src_port; /* Remove unwanted bits from values. */ - tcp.val.src_port &= tcp.mask.src_port; - tcp.val.dst_port &= tcp.mask.dst_port; + udp.val.src_port &= udp.mask.src_port; + udp.val.dst_port &= udp.mask.dst_port; } - dev_flow->verbs.hash_fields |= - mlx5_flow_hashfields_adjust(dev_flow, tunnel, ETH_RSS_TCP, - (IBV_RX_HASH_SRC_PORT_TCP | - IBV_RX_HASH_DST_PORT_TCP)); - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L4; - flow_verbs_spec_add(dev_flow, &tcp, size); + flow_verbs_spec_add(&dev_flow->verbs, &udp, size); } /** @@ -623,17 +582,17 @@ flow_verbs_translate_item_tcp(const struct rte_flow_item *item, * the input is valid and that there is space to insert the requested item * into the flow. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. - * @param[in, out] item_flags - * Bit mask that marks all detected items. - * @param[in, out] dev_flow - * Pointer to sepacific flow structure. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_vxlan(const struct rte_flow_item *item, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_vxlan(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) { const struct rte_flow_item_vxlan *spec = item->spec; const struct rte_flow_item_vxlan *mask = item->mask; @@ -657,9 +616,7 @@ flow_verbs_translate_item_vxlan(const struct rte_flow_item *item, /* Remove unwanted bits from values. */ vxlan.val.tunnel_id &= vxlan.mask.tunnel_id; } - flow_verbs_spec_add(dev_flow, &vxlan, size); - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; - *item_flags |= MLX5_FLOW_LAYER_VXLAN; + flow_verbs_spec_add(&dev_flow->verbs, &vxlan, size); } /** @@ -667,17 +624,17 @@ flow_verbs_translate_item_vxlan(const struct rte_flow_item *item, * the input is valid and that there is space to insert the requested item * into the flow. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. - * @param[in, out] item_flags - * Bit mask that marks all detected items. - * @param[in, out] dev_flow - * Pointer to sepacific flow structure. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_vxlan_gpe(const struct rte_flow_item *item, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_vxlan_gpe(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item, + uint64_t item_flags __rte_unused) { const struct rte_flow_item_vxlan_gpe *spec = item->spec; const struct rte_flow_item_vxlan_gpe *mask = item->mask; @@ -701,9 +658,7 @@ flow_verbs_translate_item_vxlan_gpe(const struct rte_flow_item *item, /* Remove unwanted bits from values. */ vxlan_gpe.val.tunnel_id &= vxlan_gpe.mask.tunnel_id; } - flow_verbs_spec_add(dev_flow, &vxlan_gpe, size); - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; - *item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE; + flow_verbs_spec_add(&dev_flow->verbs, &vxlan_gpe, size); } /** @@ -763,17 +718,17 @@ flow_verbs_item_gre_ip_protocol_update(struct ibv_flow_attr *attr, * the input is valid and that there is space to insert the requested item * into the flow. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. - * @param[in, out] item_flags - * Bit mask that marks all detected items. - * @param[in, out] dev_flow - * Pointer to sepacific flow structure. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_gre(const struct rte_flow_item *item __rte_unused, - uint64_t *item_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_item_gre(struct mlx5_flow *dev_flow, + const struct rte_flow_item *item __rte_unused, + uint64_t item_flags) { struct mlx5_flow_verbs *verbs = &dev_flow->verbs; #ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT @@ -804,7 +759,7 @@ flow_verbs_translate_item_gre(const struct rte_flow_item *item __rte_unused, tunnel.val.key &= tunnel.mask.key; } #endif - if (*item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) + if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) flow_verbs_item_gre_ip_protocol_update(verbs->attr, IBV_FLOW_SPEC_IPV4_EXT, IPPROTO_GRE); @@ -812,9 +767,7 @@ flow_verbs_translate_item_gre(const struct rte_flow_item *item __rte_unused, flow_verbs_item_gre_ip_protocol_update(verbs->attr, IBV_FLOW_SPEC_IPV6, IPPROTO_GRE); - flow_verbs_spec_add(dev_flow, &tunnel, size); - verbs->attr->priority = MLX5_PRIORITY_MAP_L2; - *item_flags |= MLX5_FLOW_LAYER_GRE; + flow_verbs_spec_add(verbs, &tunnel, size); } /** @@ -822,17 +775,17 @@ flow_verbs_translate_item_gre(const struct rte_flow_item *item __rte_unused, * the input is valid and that there is space to insert the requested action * into the flow. This function also return the action that was added. * + * @param[in, out] dev_flow + * Pointer to dev_flow structure. * @param[in] item * Item specification. - * @param[in, out] item_flags - * Bit mask that marks all detected items. - * @param[in, out] dev_flow - * Pointer to sepacific flow structure. + * @param[in] item_flags + * Parsed item flags. */ static void -flow_verbs_translate_item_mpls(const struct rte_flow_item *item __rte_unused, - uint64_t *action_flags __rte_unused, - struct mlx5_flow *dev_flow __rte_unused) +flow_verbs_translate_item_mpls(struct mlx5_flow *dev_flow __rte_unused, + const struct rte_flow_item *item __rte_unused, + uint64_t item_flags __rte_unused) { #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT const struct rte_flow_item_mpls *spec = item->spec; @@ -851,25 +804,24 @@ flow_verbs_translate_item_mpls(const struct rte_flow_item *item __rte_unused, /* Remove unwanted bits from values. */ mpls.val.label &= mpls.mask.label; } - flow_verbs_spec_add(dev_flow, &mpls, size); - dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; - *action_flags |= MLX5_FLOW_LAYER_MPLS; + flow_verbs_spec_add(&dev_flow->verbs, &mpls, size); #endif } /** * Convert the @p action into a Verbs specification. This function assumes that * the input is valid and that there is space to insert the requested action - * into the flow. This function also return the action that was added. + * into the flow. * - * @param[in, out] action_flags - * Pointer to the detected actions. * @param[in] dev_flow * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. */ static void -flow_verbs_translate_action_drop(uint64_t *action_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_action_drop + (struct mlx5_flow *dev_flow, + const struct rte_flow_action *action __rte_unused) { unsigned int size = sizeof(struct ibv_flow_spec_action_drop); struct ibv_flow_spec_action_drop drop = { @@ -877,26 +829,22 @@ flow_verbs_translate_action_drop(uint64_t *action_flags, .size = size, }; - flow_verbs_spec_add(dev_flow, &drop, size); - *action_flags |= MLX5_FLOW_ACTION_DROP; + flow_verbs_spec_add(&dev_flow->verbs, &drop, size); } /** * Convert the @p action into a Verbs specification. This function assumes that * the input is valid and that there is space to insert the requested action - * into the flow. This function also return the action that was added. + * into the flow. * - * @param[in] action - * Action configuration. - * @param[in, out] action_flags - * Pointer to the detected actions. * @param[in] dev_flow * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. */ static void -flow_verbs_translate_action_queue(const struct rte_flow_action *action, - uint64_t *action_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_action_queue(struct mlx5_flow *dev_flow, + const struct rte_flow_action *action) { const struct rte_flow_action_queue *queue = action->conf; struct rte_flow *flow = dev_flow->flow; @@ -904,13 +852,12 @@ flow_verbs_translate_action_queue(const struct rte_flow_action *action, if (flow->queue) (*flow->queue)[0] = queue->index; flow->rss.queue_num = 1; - *action_flags |= MLX5_FLOW_ACTION_QUEUE; } /** * Convert the @p action into a Verbs specification. This function assumes that * the input is valid and that there is space to insert the requested action - * into the flow. This function also return the action that was added. + * into the flow. * * @param[in] action * Action configuration. @@ -920,40 +867,39 @@ flow_verbs_translate_action_queue(const struct rte_flow_action *action, * Pointer to mlx5_flow. */ static void -flow_verbs_translate_action_rss(const struct rte_flow_action *action, - uint64_t *action_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_action_rss(struct mlx5_flow *dev_flow, + const struct rte_flow_action *action) { const struct rte_flow_action_rss *rss = action->conf; + const uint8_t *rss_key; struct rte_flow *flow = dev_flow->flow; if (flow->queue) memcpy((*flow->queue), rss->queue, rss->queue_num * sizeof(uint16_t)); flow->rss.queue_num = rss->queue_num; - memcpy(flow->key, rss->key, MLX5_RSS_HASH_KEY_LEN); - flow->rss.types = rss->types; + /* NULL RSS key indicates default RSS key. */ + rss_key = !rss->key ? rss_hash_default_key : rss->key; + memcpy(flow->key, rss_key, MLX5_RSS_HASH_KEY_LEN); + /* RSS type 0 indicates default RSS type (ETH_RSS_IP). */ + flow->rss.types = !rss->types ? ETH_RSS_IP : rss->types; flow->rss.level = rss->level; - *action_flags |= MLX5_FLOW_ACTION_RSS; } /** * Convert the @p action into a Verbs specification. This function assumes that * the input is valid and that there is space to insert the requested action - * into the flow. This function also return the action that was added. + * into the flow. * - * @param[in] action - * Action configuration. - * @param[in, out] action_flags - * Pointer to the detected actions. * @param[in] dev_flow * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. */ static void flow_verbs_translate_action_flag - (const struct rte_flow_action *action __rte_unused, - uint64_t *action_flags, - struct mlx5_flow *dev_flow) + (struct mlx5_flow *dev_flow, + const struct rte_flow_action *action __rte_unused) { unsigned int size = sizeof(struct ibv_flow_spec_action_tag); struct ibv_flow_spec_action_tag tag = { @@ -961,87 +907,44 @@ flow_verbs_translate_action_flag .size = size, .tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT), }; - *action_flags |= MLX5_FLOW_ACTION_MARK; - flow_verbs_spec_add(dev_flow, &tag, size); -} -/** - * Update verbs specification to modify the flag to mark. - * - * @param[in, out] verbs - * Pointer to the mlx5_flow_verbs structure. - * @param[in] mark_id - * Mark identifier to replace the flag. - */ -static void -flow_verbs_mark_update(struct mlx5_flow_verbs *verbs, uint32_t mark_id) -{ - struct ibv_spec_header *hdr; - int i; - - if (!verbs) - return; - /* Update Verbs specification. */ - hdr = (struct ibv_spec_header *)verbs->specs; - if (!hdr) - return; - for (i = 0; i != verbs->attr->num_of_specs; ++i) { - if (hdr->type == IBV_FLOW_SPEC_ACTION_TAG) { - struct ibv_flow_spec_action_tag *t = - (struct ibv_flow_spec_action_tag *)hdr; - - t->tag_id = mlx5_flow_mark_set(mark_id); - } - hdr = (struct ibv_spec_header *)((uintptr_t)hdr + hdr->size); - } + flow_verbs_spec_add(&dev_flow->verbs, &tag, size); } /** * Convert the @p action into a Verbs specification. This function assumes that * the input is valid and that there is space to insert the requested action - * into the flow. This function also return the action that was added. + * into the flow. * - * @param[in] action - * Action configuration. - * @param[in, out] action_flags - * Pointer to the detected actions. * @param[in] dev_flow * Pointer to mlx5_flow. + * @param[in] action + * Action configuration. */ static void -flow_verbs_translate_action_mark(const struct rte_flow_action *action, - uint64_t *action_flags, - struct mlx5_flow *dev_flow) +flow_verbs_translate_action_mark(struct mlx5_flow *dev_flow, + const struct rte_flow_action *action) { const struct rte_flow_action_mark *mark = action->conf; unsigned int size = sizeof(struct ibv_flow_spec_action_tag); struct ibv_flow_spec_action_tag tag = { .type = IBV_FLOW_SPEC_ACTION_TAG, .size = size, + .tag_id = mlx5_flow_mark_set(mark->id), }; - struct mlx5_flow_verbs *verbs = &dev_flow->verbs; - if (*action_flags & MLX5_FLOW_ACTION_FLAG) { - flow_verbs_mark_update(verbs, mark->id); - size = 0; - } else { - tag.tag_id = mlx5_flow_mark_set(mark->id); - flow_verbs_spec_add(dev_flow, &tag, size); - } - *action_flags |= MLX5_FLOW_ACTION_MARK; + flow_verbs_spec_add(&dev_flow->verbs, &tag, size); } /** * Convert the @p action into a Verbs specification. This function assumes that * the input is valid and that there is space to insert the requested action - * into the flow. This function also return the action that was added. + * into the flow. * * @param[in] dev * Pointer to the Ethernet device structure. * @param[in] action * Action configuration. - * @param[in, out] action_flags - * Pointer to the detected actions. * @param[in] dev_flow * Pointer to mlx5_flow. * @param[out] error @@ -1051,10 +954,9 @@ flow_verbs_translate_action_mark(const struct rte_flow_action *action, * 0 On success else a negative errno value is returned and rte_errno is set. */ static int -flow_verbs_translate_action_count(struct rte_eth_dev *dev, +flow_verbs_translate_action_count(struct mlx5_flow *dev_flow, const struct rte_flow_action *action, - uint64_t *action_flags, - struct mlx5_flow *dev_flow, + struct rte_eth_dev *dev, struct rte_flow_error *error) { const struct rte_flow_action_count *count = action->conf; @@ -1078,13 +980,12 @@ flow_verbs_translate_action_count(struct rte_eth_dev *dev, "cannot get counter" " context."); } - *action_flags |= MLX5_FLOW_ACTION_COUNT; #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) counter.counter_set_handle = flow->counter->cs->handle; - flow_verbs_spec_add(dev_flow, &counter, size); + flow_verbs_spec_add(&dev_flow->verbs, &counter, size); #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) counter.counters = flow->counter->cs; - flow_verbs_spec_add(dev_flow, &counter, size); + flow_verbs_spec_add(&dev_flow->verbs, &counter, size); #endif return 0; } @@ -1116,7 +1017,6 @@ flow_verbs_validate(struct rte_eth_dev *dev, int ret; uint64_t action_flags = 0; uint64_t item_flags = 0; - int tunnel = 0; uint8_t next_protocol = 0xff; if (items == NULL) @@ -1125,9 +1025,9 @@ flow_verbs_validate(struct rte_eth_dev *dev, if (ret < 0) return ret; for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); int ret = 0; - tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); switch (items->type) { case RTE_FLOW_ITEM_TYPE_VOID: break; @@ -1144,8 +1044,10 @@ flow_verbs_validate(struct rte_eth_dev *dev, error); if (ret < 0) return ret; - item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : - MLX5_FLOW_LAYER_OUTER_VLAN; + item_flags |= tunnel ? (MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); break; case RTE_FLOW_ITEM_TYPE_IPV4: ret = mlx5_flow_validate_item_ipv4(items, item_flags, @@ -1307,23 +1209,18 @@ flow_verbs_validate(struct rte_eth_dev *dev, /** * Calculate the required bytes that are needed for the action part of the verbs - * flow, in addtion returns bit-fields with all the detected action, in order to - * avoid another interation over the actions. + * flow. * * @param[in] actions * Pointer to the list of actions. - * @param[out] action_flags - * Pointer to the detected actions. * * @return * The size of the memory needed for all actions. */ static int -flow_verbs_get_actions_and_size(const struct rte_flow_action actions[], - uint64_t *action_flags) +flow_verbs_get_actions_size(const struct rte_flow_action actions[]) { int size = 0; - uint64_t detected_actions = 0; for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { switch (actions->type) { @@ -1331,125 +1228,89 @@ flow_verbs_get_actions_and_size(const struct rte_flow_action actions[], break; case RTE_FLOW_ACTION_TYPE_FLAG: size += sizeof(struct ibv_flow_spec_action_tag); - detected_actions |= MLX5_FLOW_ACTION_FLAG; break; case RTE_FLOW_ACTION_TYPE_MARK: size += sizeof(struct ibv_flow_spec_action_tag); - detected_actions |= MLX5_FLOW_ACTION_MARK; break; case RTE_FLOW_ACTION_TYPE_DROP: size += sizeof(struct ibv_flow_spec_action_drop); - detected_actions |= MLX5_FLOW_ACTION_DROP; break; case RTE_FLOW_ACTION_TYPE_QUEUE: - detected_actions |= MLX5_FLOW_ACTION_QUEUE; break; case RTE_FLOW_ACTION_TYPE_RSS: - detected_actions |= MLX5_FLOW_ACTION_RSS; break; case RTE_FLOW_ACTION_TYPE_COUNT: #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) size += sizeof(struct ibv_flow_spec_counter_action); #endif - detected_actions |= MLX5_FLOW_ACTION_COUNT; break; default: break; } } - *action_flags = detected_actions; return size; } /** * Calculate the required bytes that are needed for the item part of the verbs - * flow, in addtion returns bit-fields with all the detected action, in order to - * avoid another interation over the actions. + * flow. * - * @param[in] actions + * @param[in] items * Pointer to the list of items. - * @param[in, out] item_flags - * Pointer to the detected items. * * @return * The size of the memory needed for all items. */ static int -flow_verbs_get_items_and_size(const struct rte_flow_item items[], - uint64_t *item_flags) +flow_verbs_get_items_size(const struct rte_flow_item items[]) { int size = 0; - uint64_t detected_items = 0; for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { - int tunnel = !!(detected_items & MLX5_FLOW_LAYER_TUNNEL); - switch (items->type) { case RTE_FLOW_ITEM_TYPE_VOID: break; case RTE_FLOW_ITEM_TYPE_ETH: size += sizeof(struct ibv_flow_spec_eth); - detected_items |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : - MLX5_FLOW_LAYER_OUTER_L2; break; case RTE_FLOW_ITEM_TYPE_VLAN: size += sizeof(struct ibv_flow_spec_eth); - detected_items |= tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : - MLX5_FLOW_LAYER_OUTER_VLAN; break; case RTE_FLOW_ITEM_TYPE_IPV4: size += sizeof(struct ibv_flow_spec_ipv4_ext); - detected_items |= tunnel ? - MLX5_FLOW_LAYER_INNER_L3_IPV4 : - MLX5_FLOW_LAYER_OUTER_L3_IPV4; break; case RTE_FLOW_ITEM_TYPE_IPV6: size += sizeof(struct ibv_flow_spec_ipv6); - detected_items |= tunnel ? - MLX5_FLOW_LAYER_INNER_L3_IPV6 : - MLX5_FLOW_LAYER_OUTER_L3_IPV6; break; case RTE_FLOW_ITEM_TYPE_UDP: size += sizeof(struct ibv_flow_spec_tcp_udp); - detected_items |= tunnel ? - MLX5_FLOW_LAYER_INNER_L4_UDP : - MLX5_FLOW_LAYER_OUTER_L4_UDP; break; case RTE_FLOW_ITEM_TYPE_TCP: size += sizeof(struct ibv_flow_spec_tcp_udp); - detected_items |= tunnel ? - MLX5_FLOW_LAYER_INNER_L4_TCP : - MLX5_FLOW_LAYER_OUTER_L4_TCP; break; case RTE_FLOW_ITEM_TYPE_VXLAN: size += sizeof(struct ibv_flow_spec_tunnel); - detected_items |= MLX5_FLOW_LAYER_VXLAN; break; case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: size += sizeof(struct ibv_flow_spec_tunnel); - detected_items |= MLX5_FLOW_LAYER_VXLAN_GPE; break; #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT case RTE_FLOW_ITEM_TYPE_GRE: size += sizeof(struct ibv_flow_spec_gre); - detected_items |= MLX5_FLOW_LAYER_GRE; break; case RTE_FLOW_ITEM_TYPE_MPLS: size += sizeof(struct ibv_flow_spec_mpls); - detected_items |= MLX5_FLOW_LAYER_MPLS; break; #else case RTE_FLOW_ITEM_TYPE_GRE: size += sizeof(struct ibv_flow_spec_tunnel); - detected_items |= MLX5_FLOW_LAYER_TUNNEL; break; #endif default: break; } } - *item_flags = detected_items; return size; } @@ -1464,10 +1325,6 @@ flow_verbs_get_items_and_size(const struct rte_flow_item items[], * Pointer to the list of items. * @param[in] actions * Pointer to the list of actions. - * @param[out] item_flags - * Pointer to bit mask of all items detected. - * @param[out] action_flags - * Pointer to bit mask of all actions detected. * @param[out] error * Pointer to the error structure. * @@ -1479,15 +1336,13 @@ static struct mlx5_flow * flow_verbs_prepare(const struct rte_flow_attr *attr __rte_unused, const struct rte_flow_item items[], const struct rte_flow_action actions[], - uint64_t *item_flags, - uint64_t *action_flags, struct rte_flow_error *error) { uint32_t size = sizeof(struct mlx5_flow) + sizeof(struct ibv_flow_attr); struct mlx5_flow *flow; - size += flow_verbs_get_actions_and_size(actions, action_flags); - size += flow_verbs_get_items_and_size(items, item_flags); + size += flow_verbs_get_actions_size(actions); + size += flow_verbs_get_items_size(items); flow = rte_calloc(__func__, 1, size, 0); if (!flow) { rte_flow_error_set(error, ENOMEM, @@ -1528,50 +1383,48 @@ flow_verbs_translate(struct rte_eth_dev *dev, const struct rte_flow_action actions[], struct rte_flow_error *error) { - uint64_t action_flags = 0; + struct rte_flow *flow = dev_flow->flow; uint64_t item_flags = 0; + uint64_t action_flags = 0; uint64_t priority = attr->priority; + uint32_t subpriority = 0; struct priv *priv = dev->data->dev_private; if (priority == MLX5_FLOW_PRIO_RSVD) priority = priv->config.flow_prio - 1; for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { int ret; + switch (actions->type) { case RTE_FLOW_ACTION_TYPE_VOID: break; case RTE_FLOW_ACTION_TYPE_FLAG: - flow_verbs_translate_action_flag(actions, - &action_flags, - dev_flow); + flow_verbs_translate_action_flag(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_FLAG; break; case RTE_FLOW_ACTION_TYPE_MARK: - flow_verbs_translate_action_mark(actions, - &action_flags, - dev_flow); + flow_verbs_translate_action_mark(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_MARK; break; case RTE_FLOW_ACTION_TYPE_DROP: - flow_verbs_translate_action_drop(&action_flags, - dev_flow); + flow_verbs_translate_action_drop(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_DROP; break; case RTE_FLOW_ACTION_TYPE_QUEUE: - flow_verbs_translate_action_queue(actions, - &action_flags, - dev_flow); + flow_verbs_translate_action_queue(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_QUEUE; break; case RTE_FLOW_ACTION_TYPE_RSS: - flow_verbs_translate_action_rss(actions, - &action_flags, - dev_flow); + flow_verbs_translate_action_rss(dev_flow, actions); + action_flags |= MLX5_FLOW_ACTION_RSS; break; case RTE_FLOW_ACTION_TYPE_COUNT: - ret = flow_verbs_translate_action_count(dev, + ret = flow_verbs_translate_action_count(dev_flow, actions, - &action_flags, - dev_flow, - error); + dev, error); if (ret < 0) return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; break; default: return rte_flow_error_set(error, ENOTSUP, @@ -1580,51 +1433,100 @@ flow_verbs_translate(struct rte_eth_dev *dev, "action not supported"); } } - /* Device flow should have action flags by flow_drv_prepare(). */ - assert(dev_flow->flow->actions == action_flags); + flow->actions = action_flags; for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + switch (items->type) { case RTE_FLOW_ITEM_TYPE_VOID: break; case RTE_FLOW_ITEM_TYPE_ETH: - flow_verbs_translate_item_eth(items, &item_flags, - dev_flow); + flow_verbs_translate_item_eth(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; break; case RTE_FLOW_ITEM_TYPE_VLAN: - flow_verbs_translate_item_vlan(items, &item_flags, - dev_flow); + flow_verbs_translate_item_vlan(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= tunnel ? (MLX5_FLOW_LAYER_INNER_L2 | + MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | + MLX5_FLOW_LAYER_OUTER_VLAN); break; case RTE_FLOW_ITEM_TYPE_IPV4: - flow_verbs_translate_item_ipv4(items, &item_flags, - dev_flow); + flow_verbs_translate_item_ipv4(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L3; + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust + (dev_flow, tunnel, + MLX5_IPV4_LAYER_TYPES, + MLX5_IPV4_IBV_RX_HASH); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; break; case RTE_FLOW_ITEM_TYPE_IPV6: - flow_verbs_translate_item_ipv6(items, &item_flags, - dev_flow); - break; - case RTE_FLOW_ITEM_TYPE_UDP: - flow_verbs_translate_item_udp(items, &item_flags, - dev_flow); + flow_verbs_translate_item_ipv6(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L3; + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust + (dev_flow, tunnel, + MLX5_IPV6_LAYER_TYPES, + MLX5_IPV6_IBV_RX_HASH); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; break; case RTE_FLOW_ITEM_TYPE_TCP: - flow_verbs_translate_item_tcp(items, &item_flags, - dev_flow); + flow_verbs_translate_item_tcp(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L4; + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust + (dev_flow, tunnel, ETH_RSS_TCP, + (IBV_RX_HASH_SRC_PORT_TCP | + IBV_RX_HASH_DST_PORT_TCP)); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + flow_verbs_translate_item_udp(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L4; + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust + (dev_flow, tunnel, ETH_RSS_UDP, + (IBV_RX_HASH_SRC_PORT_UDP | + IBV_RX_HASH_DST_PORT_UDP)); + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; break; case RTE_FLOW_ITEM_TYPE_VXLAN: - flow_verbs_translate_item_vxlan(items, &item_flags, - dev_flow); + flow_verbs_translate_item_vxlan(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_VXLAN; break; case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: - flow_verbs_translate_item_vxlan_gpe(items, &item_flags, - dev_flow); + flow_verbs_translate_item_vxlan_gpe(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE; break; case RTE_FLOW_ITEM_TYPE_GRE: - flow_verbs_translate_item_gre(items, &item_flags, - dev_flow); + flow_verbs_translate_item_gre(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_GRE; break; case RTE_FLOW_ITEM_TYPE_MPLS: - flow_verbs_translate_item_mpls(items, &item_flags, - dev_flow); + flow_verbs_translate_item_mpls(dev_flow, items, + item_flags); + subpriority = MLX5_PRIORITY_MAP_L2; + item_flags |= MLX5_FLOW_LAYER_MPLS; break; default: return rte_flow_error_set(error, ENOTSUP, @@ -1633,9 +1535,9 @@ flow_verbs_translate(struct rte_eth_dev *dev, "item not supported"); } } + dev_flow->layers = item_flags; dev_flow->verbs.attr->priority = - mlx5_flow_adjust_priority(dev, priority, - dev_flow->verbs.attr->priority); + mlx5_flow_adjust_priority(dev, priority, subpriority); return 0; } @@ -1669,10 +1571,6 @@ flow_verbs_remove(struct rte_eth_dev *dev, struct rte_flow *flow) verbs->hrxq = NULL; } } - if (flow->counter) { - flow_verbs_counter_release(flow->counter); - flow->counter = NULL; - } } /** @@ -1696,6 +1594,10 @@ flow_verbs_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) LIST_REMOVE(dev_flow, next); rte_free(dev_flow); } + if (flow->counter) { + flow_verbs_counter_release(flow->counter); + flow->counter = NULL; + } } /** diff --git a/drivers/net/mlx5/mlx5_glue.c b/drivers/net/mlx5/mlx5_glue.c index 1afb114f..dd10ad6d 100644 --- a/drivers/net/mlx5/mlx5_glue.c +++ b/drivers/net/mlx5/mlx5_glue.c @@ -174,6 +174,17 @@ mlx5_glue_destroy_flow(struct ibv_flow *flow_id) return ibv_destroy_flow(flow_id); } +static int +mlx5_glue_destroy_flow_action(struct ibv_flow_action *action) +{ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + return ibv_destroy_flow_action(action); +#else + (void)action; + return ENOTSUP; +#endif +} + static struct ibv_qp * mlx5_glue_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { @@ -444,6 +455,30 @@ mlx5_glue_dv_destroy_flow_matcher(struct mlx5dv_flow_matcher *matcher) #endif } +static struct ibv_flow_action * +mlx5_glue_dv_create_flow_action_packet_reformat + (struct ibv_context *ctx, + size_t data_sz, + void *data, + enum mlx5dv_flow_action_packet_reformat_type reformat_type, + enum mlx5dv_flow_table_type ft_type) +{ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + return mlx5dv_create_flow_action_packet_reformat(ctx, + data_sz, + data, + reformat_type, + ft_type); +#else + (void)ctx; + (void)data_sz; + (void)data; + (void)reformat_type; + (void)ft_type; + return NULL; +#endif +} + alignas(RTE_CACHE_LINE_SIZE) const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .version = MLX5_GLUE_VERSION, @@ -470,6 +505,7 @@ const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .modify_wq = mlx5_glue_modify_wq, .create_flow = mlx5_glue_create_flow, .destroy_flow = mlx5_glue_destroy_flow, + .destroy_flow_action = mlx5_glue_destroy_flow_action, .create_qp = mlx5_glue_create_qp, .create_qp_ex = mlx5_glue_create_qp_ex, .destroy_qp = mlx5_glue_destroy_qp, @@ -497,4 +533,6 @@ const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .dv_create_flow_matcher = mlx5_glue_dv_create_flow_matcher, .dv_destroy_flow_matcher = mlx5_glue_dv_destroy_flow_matcher, .dv_create_flow = mlx5_glue_dv_create_flow, + .dv_create_flow_action_packet_reformat = + mlx5_glue_dv_create_flow_action_packet_reformat, }; diff --git a/drivers/net/mlx5/mlx5_glue.h b/drivers/net/mlx5/mlx5_glue.h index 44bfefed..2d92ba8b 100644 --- a/drivers/net/mlx5/mlx5_glue.h +++ b/drivers/net/mlx5/mlx5_glue.h @@ -50,6 +50,9 @@ struct mlx5dv_flow_matcher; struct mlx5dv_flow_matcher_attr; struct mlx5dv_flow_action_attr; struct mlx5dv_flow_match_parameters; +struct ibv_flow_action; +enum mlx5dv_flow_action_packet_reformat_type { packet_reformat_type = 0, }; +enum mlx5dv_flow_table_type { flow_table_type = 0, }; #endif /* LIB_GLUE_VERSION must be updated every time this structure is modified. */ @@ -91,6 +94,7 @@ struct mlx5_glue { struct ibv_flow *(*create_flow)(struct ibv_qp *qp, struct ibv_flow_attr *flow); int (*destroy_flow)(struct ibv_flow *flow_id); + int (*destroy_flow_action)(struct ibv_flow_action *action); struct ibv_qp *(*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); struct ibv_qp *(*create_qp_ex) @@ -154,6 +158,12 @@ struct mlx5_glue { struct mlx5dv_flow_match_parameters *match_value, size_t num_actions, struct mlx5dv_flow_action_attr *actions_attr); + struct ibv_flow_action *(*dv_create_flow_action_packet_reformat) + (struct ibv_context *ctx, + size_t data_sz, + void *data, + enum mlx5dv_flow_action_packet_reformat_type reformat_type, + enum mlx5dv_flow_table_type ft_type); }; const struct mlx5_glue *mlx5_glue; diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index ed993ea6..eef48502 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -841,6 +841,12 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx) " timestamp", dev->data->port_id); } +#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD + if (config->cqe_pad) { + attr.cq.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_FLAGS; + attr.cq.mlx5.flags |= MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD; + } +#endif tmpl->cq = mlx5_glue->cq_ex_to_cq (mlx5_glue->dv_create_cq(priv->ctx, &attr.cq.ibv, &attr.cq.mlx5)); @@ -1758,6 +1764,8 @@ mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev) * first queue index will be taken for the indirection table. * @param queues_n * Number of queues. + * @param tunnel + * Tunnel type. * * @return * The Verbs object initialised, NULL otherwise and rte_errno is set. @@ -1773,6 +1781,9 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq; struct mlx5_ind_table_ibv *ind_tbl; struct ibv_qp *qp; +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + struct mlx5dv_qp_init_attr qp_init_attr = {0}; +#endif int err; queues_n = hash_fields ? queues_n : 1; @@ -1783,11 +1794,21 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, rte_errno = ENOMEM; return NULL; } - if (!rss_key_len) { - rss_key_len = MLX5_RSS_HASH_KEY_LEN; - rss_key = rss_hash_default_key; - } #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + if (tunnel) { + qp_init_attr.comp_mask = + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; + qp_init_attr.create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS; + } +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (dev->data->dev_conf.lpbk_mode) { + /* Allow packet sent from NIC loop back w/o source MAC check. */ + qp_init_attr.comp_mask |= + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; + qp_init_attr.create_flags |= + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC; + } +#endif qp = mlx5_glue->dv_create_qp (priv->ctx, &(struct ibv_qp_init_attr_ex){ @@ -1798,21 +1819,14 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, IBV_QP_INIT_ATTR_RX_HASH, .rx_hash_conf = (struct ibv_rx_hash_conf){ .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, - .rx_hash_key_len = rss_key_len ? rss_key_len : - MLX5_RSS_HASH_KEY_LEN, - .rx_hash_key = rss_key ? - (void *)(uintptr_t)rss_key : - rss_hash_default_key, + .rx_hash_key_len = rss_key_len, + .rx_hash_key = (void *)(uintptr_t)rss_key, .rx_hash_fields_mask = hash_fields, }, .rwq_ind_tbl = ind_tbl->ind_table, .pd = priv->pd, }, - &(struct mlx5dv_qp_init_attr){ - .comp_mask = tunnel ? - MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS : 0, - .create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS, - }); + &qp_init_attr); #else qp = mlx5_glue->create_qp_ex (priv->ctx, @@ -1824,11 +1838,8 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, IBV_QP_INIT_ATTR_RX_HASH, .rx_hash_conf = (struct ibv_rx_hash_conf){ .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ, - .rx_hash_key_len = rss_key_len ? rss_key_len : - MLX5_RSS_HASH_KEY_LEN, - .rx_hash_key = rss_key ? - (void *)(uintptr_t)rss_key : - rss_hash_default_key, + .rx_hash_key_len = rss_key_len, + .rx_hash_key = (void *)(uintptr_t)rss_key, .rx_hash_fields_mask = hash_fields, }, .rwq_ind_tbl = ind_tbl->ind_table, diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 24a054d5..6eceea5f 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -417,20 +417,17 @@ mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) } /** - * DPDK callback to check the status of a rx descriptor. + * Internal function to compute the number of used descriptors in an RX queue * - * @param rx_queue - * The rx queue. - * @param[in] offset - * The index of the descriptor in the ring. + * @param rxq + * The Rx queue. * * @return - * The status of the tx descriptor. + * The number of used rx descriptor. */ -int -mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) +static uint32_t +rx_queue_count(struct mlx5_rxq_data *rxq) { - struct mlx5_rxq_data *rxq = rx_queue; struct rxq_zip *zip = &rxq->zip; volatile struct mlx5_cqe *cqe; const unsigned int cqe_n = (1 << rxq->cqe_n); @@ -461,12 +458,73 @@ mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; } used = RTE_MIN(used, (1U << rxq->elts_n) - 1); - if (offset < used) + return used; +} + +/** + * DPDK callback to check the status of a rx descriptor. + * + * @param rx_queue + * The Rx queue. + * @param[in] offset + * The index of the descriptor in the ring. + * + * @return + * The status of the tx descriptor. + */ +int +mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) +{ + struct mlx5_rxq_data *rxq = rx_queue; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv); + + if (dev->rx_pkt_burst != mlx5_rx_burst) { + rte_errno = ENOTSUP; + return -rte_errno; + } + if (offset >= (1 << rxq->elts_n)) { + rte_errno = EINVAL; + return -rte_errno; + } + if (offset < rx_queue_count(rxq)) return RTE_ETH_RX_DESC_DONE; return RTE_ETH_RX_DESC_AVAIL; } /** + * DPDK callback to get the number of used descriptors in a RX queue + * + * @param dev + * Pointer to the device structure. + * + * @param rx_queue_id + * The Rx queue. + * + * @return + * The number of used rx descriptor. + * -EINVAL if the queue is invalid + */ +uint32_t +mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq; + + if (dev->rx_pkt_burst != mlx5_rx_burst) { + rte_errno = ENOTSUP; + return -rte_errno; + } + rxq = (*priv->rxqs)[rx_queue_id]; + if (!rxq) { + rte_errno = EINVAL; + return -rte_errno; + } + return rx_queue_count(rxq); +} + +/** * DPDK callback for TX. * * @param dpdk_txq diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index 1db468c3..1b6200f6 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -345,6 +345,7 @@ uint16_t removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); int mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset); int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset); +uint32_t mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id); /* Vectorized version of mlx5_rxtx.c */ int mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev); diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c index 1453f4ff..340292ad 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.c +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -277,7 +277,7 @@ mlx5_check_vec_tx_support(struct rte_eth_dev *dev) uint64_t offloads = dev->data->dev_conf.txmode.offloads; if (!priv->config.tx_vec_en || - priv->txqs_n > MLX5_VPMD_MIN_TXQS || + priv->txqs_n > (unsigned int)priv->config.txqs_vec || priv->config.mps != MLX5_MPW_ENHANCED || offloads & ~MLX5_VEC_TX_OFFLOAD_CAP) return -ENOTSUP; diff --git a/drivers/net/netvsc/hn_ethdev.c b/drivers/net/netvsc/hn_ethdev.c index aa38ee7a..b330bf3d 100644 --- a/drivers/net/netvsc/hn_ethdev.c +++ b/drivers/net/netvsc/hn_ethdev.c @@ -879,9 +879,7 @@ static struct rte_vmbus_driver rte_netvsc_pmd = { RTE_PMD_REGISTER_VMBUS(net_netvsc, rte_netvsc_pmd); RTE_PMD_REGISTER_KMOD_DEP(net_netvsc, "* uio_hv_generic"); -RTE_INIT(hn_init_log); -static void -hn_init_log(void) +RTE_INIT(hn_init_log) { hn_logtype_init = rte_log_register("pmd.net.netvsc.init"); if (hn_logtype_init >= 0) diff --git a/drivers/net/netvsc/hn_vf.c b/drivers/net/netvsc/hn_vf.c index 7a84ad8c..3f714ec9 100644 --- a/drivers/net/netvsc/hn_vf.c +++ b/drivers/net/netvsc/hn_vf.c @@ -223,7 +223,7 @@ int hn_vf_link_update(struct rte_eth_dev *dev, rte_spinlock_lock(&hv->vf_lock); vf_dev = hv->vf_dev; if (vf_dev && vf_dev->dev_ops->link_update) - ret = (*vf_dev->dev_ops->link_update)(dev, wait_to_complete); + ret = (*vf_dev->dev_ops->link_update)(vf_dev, wait_to_complete); rte_spinlock_unlock(&hv->vf_lock); return ret; diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c index bab1f68e..54c6da92 100644 --- a/drivers/net/nfp/nfp_net.c +++ b/drivers/net/nfp/nfp_net.c @@ -2703,7 +2703,7 @@ nfp_net_init(struct rte_eth_dev *eth_dev) pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev); /* NFP can not handle DMA addresses requiring more than 40 bits */ - if (rte_eal_check_dma_mask(40)) { + if (rte_mem_check_dma_mask(40)) { RTE_LOG(ERR, PMD, "device %s can not be used:", pci_dev->device.name); RTE_LOG(ERR, PMD, "\trestricted dma mask to 40 bits!\n"); diff --git a/drivers/net/qede/base/ecore_dev.c b/drivers/net/qede/base/ecore_dev.c index cf454b19..d7e1d7b3 100644 --- a/drivers/net/qede/base/ecore_dev.c +++ b/drivers/net/qede/base/ecore_dev.c @@ -3429,6 +3429,14 @@ ecore_hw_init_pf(struct ecore_hwfn *p_hwfn, struct ecore_ptt *p_ptt, if (rc != ECORE_SUCCESS) return rc; + /* Use the leading hwfn since in CMT only NIG #0 is operational */ + if (IS_LEAD_HWFN(p_hwfn)) { + rc = ecore_llh_hw_init_pf(p_hwfn, p_ptt, + p_params->avoid_eng_affin); + if (rc) + return rc; + } + if (p_params->b_hw_start) { /* enable interrupts */ rc = ecore_int_igu_enable(p_hwfn, p_ptt, p_params->int_mode); diff --git a/drivers/net/qede/qede_main.c b/drivers/net/qede/qede_main.c index df83666f..ec6190b1 100644 --- a/drivers/net/qede/qede_main.c +++ b/drivers/net/qede/qede_main.c @@ -288,6 +288,7 @@ static int qed_slowpath_start(struct ecore_dev *edev, drv_load_params.mfw_timeout_val = ECORE_LOAD_REQ_LOCK_TO_DEFAULT; drv_load_params.avoid_eng_reset = false; drv_load_params.override_force_load = ECORE_OVERRIDE_FORCE_LOAD_ALWAYS; + hw_init_params.avoid_eng_affin = false; hw_init_params.p_drv_load_params = &drv_load_params; rc = ecore_hw_init(edev, &hw_init_params); diff --git a/drivers/net/softnic/rte_eth_softnic_cli.c b/drivers/net/softnic/rte_eth_softnic_cli.c index c6640d65..57b62337 100644 --- a/drivers/net/softnic/rte_eth_softnic_cli.c +++ b/drivers/net/softnic/rte_eth_softnic_cli.c @@ -1867,7 +1867,7 @@ cmd_pipeline_port_in(struct pmd_internals *softnic, p.type = PORT_IN_RXQ; - strcpy(p.dev_name, tokens[t0 + 1]); + strlcpy(p.dev_name, tokens[t0 + 1], sizeof(p.dev_name)); if (strcmp(tokens[t0 + 2], "rxq") != 0) { snprintf(out, out_size, MSG_ARG_NOT_FOUND, "rxq"); @@ -1890,7 +1890,7 @@ cmd_pipeline_port_in(struct pmd_internals *softnic, p.type = PORT_IN_SWQ; - strcpy(p.dev_name, tokens[t0 + 1]); + strlcpy(p.dev_name, tokens[t0 + 1], sizeof(p.dev_name)); t0 += 2; } else if (strcmp(tokens[t0], "tmgr") == 0) { @@ -1902,7 +1902,7 @@ cmd_pipeline_port_in(struct pmd_internals *softnic, p.type = PORT_IN_TMGR; - strcpy(p.dev_name, tokens[t0 + 1]); + strlcpy(p.dev_name, tokens[t0 + 1], sizeof(p.dev_name)); t0 += 2; } else if (strcmp(tokens[t0], "tap") == 0) { @@ -1914,7 +1914,7 @@ cmd_pipeline_port_in(struct pmd_internals *softnic, p.type = PORT_IN_TAP; - strcpy(p.dev_name, tokens[t0 + 1]); + strlcpy(p.dev_name, tokens[t0 + 1], sizeof(p.dev_name)); if (strcmp(tokens[t0 + 2], "mempool") != 0) { snprintf(out, out_size, MSG_ARG_NOT_FOUND, @@ -2009,7 +2009,8 @@ cmd_pipeline_port_in(struct pmd_internals *softnic, return; } - strcpy(p.action_profile_name, tokens[t0 + 1]); + strlcpy(p.action_profile_name, tokens[t0 + 1], + sizeof(p.action_profile_name)); t0 += 2; } @@ -2096,7 +2097,7 @@ cmd_pipeline_port_out(struct pmd_internals *softnic, p.type = PORT_OUT_TXQ; - strcpy(p.dev_name, tokens[7]); + strlcpy(p.dev_name, tokens[7], sizeof(p.dev_name)); if (strcmp(tokens[8], "txq") != 0) { snprintf(out, out_size, MSG_ARG_NOT_FOUND, "txq"); @@ -2117,7 +2118,7 @@ cmd_pipeline_port_out(struct pmd_internals *softnic, p.type = PORT_OUT_SWQ; - strcpy(p.dev_name, tokens[7]); + strlcpy(p.dev_name, tokens[7], sizeof(p.dev_name)); } else if (strcmp(tokens[6], "tmgr") == 0) { if (n_tokens != 8) { snprintf(out, out_size, MSG_ARG_MISMATCH, @@ -2127,7 +2128,7 @@ cmd_pipeline_port_out(struct pmd_internals *softnic, p.type = PORT_OUT_TMGR; - strcpy(p.dev_name, tokens[7]); + strlcpy(p.dev_name, tokens[7], sizeof(p.dev_name)); } else if (strcmp(tokens[6], "tap") == 0) { if (n_tokens != 8) { snprintf(out, out_size, MSG_ARG_MISMATCH, @@ -2137,7 +2138,7 @@ cmd_pipeline_port_out(struct pmd_internals *softnic, p.type = PORT_OUT_TAP; - strcpy(p.dev_name, tokens[7]); + strlcpy(p.dev_name, tokens[7], sizeof(p.dev_name)); } else if (strcmp(tokens[6], "sink") == 0) { if ((n_tokens != 7) && (n_tokens != 11)) { snprintf(out, out_size, MSG_ARG_MISMATCH, @@ -2485,7 +2486,8 @@ cmd_pipeline_table(struct pmd_internals *softnic, return; } - strcpy(p.action_profile_name, tokens[t0 + 1]); + strlcpy(p.action_profile_name, tokens[t0 + 1], + sizeof(p.action_profile_name)); t0 += 2; } diff --git a/drivers/net/softnic/rte_eth_softnic_flow.c b/drivers/net/softnic/rte_eth_softnic_flow.c index 285af462..21e75300 100644 --- a/drivers/net/softnic/rte_eth_softnic_flow.c +++ b/drivers/net/softnic/rte_eth_softnic_flow.c @@ -56,7 +56,7 @@ flow_attr_map_set(struct pmd_internals *softnic, map = (ingress) ? &softnic->flow.ingress_map[group_id] : &softnic->flow.egress_map[group_id]; - strcpy(map->pipeline_name, pipeline_name); + strlcpy(map->pipeline_name, pipeline_name, sizeof(map->pipeline_name)); map->table_id = table_id; map->valid = 1; @@ -1624,11 +1624,11 @@ flow_rule_action_get(struct pmd_internals *softnic, /* RTE_TABLE_ACTION_METER */ rule_action->mtr.mtr[0].meter_profile_id = meter_profile_id; rule_action->mtr.mtr[0].policer[e_RTE_METER_GREEN] = - (enum rte_table_action_policer)m->params.action[RTE_MTR_GREEN]; + softnic_table_action_policer(m->params.action[RTE_MTR_GREEN]); rule_action->mtr.mtr[0].policer[e_RTE_METER_YELLOW] = - (enum rte_table_action_policer)m->params.action[RTE_MTR_YELLOW]; + softnic_table_action_policer(m->params.action[RTE_MTR_YELLOW]); rule_action->mtr.mtr[0].policer[e_RTE_METER_RED] = - (enum rte_table_action_policer)m->params.action[RTE_MTR_RED]; + softnic_table_action_policer(m->params.action[RTE_MTR_RED]); rule_action->mtr.tc_mask = 1; rule_action->action_mask |= 1 << RTE_TABLE_ACTION_MTR; break; diff --git a/drivers/net/softnic/rte_eth_softnic_internals.h b/drivers/net/softnic/rte_eth_softnic_internals.h index e12b8ae4..31698b9f 100644 --- a/drivers/net/softnic/rte_eth_softnic_internals.h +++ b/drivers/net/softnic/rte_eth_softnic_internals.h @@ -828,6 +828,9 @@ softnic_table_action_profile_create(struct pmd_internals *p, const char *name, struct softnic_table_action_profile_params *params); +enum rte_table_action_policer +softnic_table_action_policer(enum rte_mtr_policer_action action); + /** * Pipeline */ diff --git a/drivers/net/softnic/rte_eth_softnic_meter.c b/drivers/net/softnic/rte_eth_softnic_meter.c index 73ecf3b1..7b747ba5 100644 --- a/drivers/net/softnic/rte_eth_softnic_meter.c +++ b/drivers/net/softnic/rte_eth_softnic_meter.c @@ -65,6 +65,27 @@ softnic_mtr_meter_profile_find(struct pmd_internals *p, return NULL; } +enum rte_table_action_policer +softnic_table_action_policer(enum rte_mtr_policer_action action) +{ + switch (action) { + case MTR_POLICER_ACTION_COLOR_GREEN: + return RTE_TABLE_ACTION_POLICER_COLOR_GREEN; + + /* FALLTHROUGH */ + case MTR_POLICER_ACTION_COLOR_YELLOW: + return RTE_TABLE_ACTION_POLICER_COLOR_YELLOW; + + /* FALLTHROUGH */ + case MTR_POLICER_ACTION_COLOR_RED: + return RTE_TABLE_ACTION_POLICER_COLOR_RED; + + /* FALLTHROUGH */ + default: + return RTE_TABLE_ACTION_POLICER_DROP; + } +} + static int meter_profile_check(struct rte_eth_dev *dev, uint32_t meter_profile_id, @@ -542,7 +563,7 @@ pmd_mtr_policer_actions_update(struct rte_eth_dev *dev, for (i = 0; i < RTE_MTR_COLORS; i++) if (action_mask & (1 << i)) action.mtr.mtr[0].policer[i] = - (enum rte_table_action_policer)actions[i]; + softnic_table_action_policer(actions[i]); /* Re-add the rule */ status = softnic_pipeline_table_rule_add(p, diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c index b38a4b6b..42bdfcbd 100644 --- a/drivers/net/vhost/rte_eth_vhost.c +++ b/drivers/net/vhost/rte_eth_vhost.c @@ -1467,7 +1467,11 @@ RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv); RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost); RTE_PMD_REGISTER_PARAM_STRING(net_vhost, "iface=<ifc> " - "queues=<int>"); + "queues=<int> " + "client=<0|1> " + "dequeue-zero-copy=<0|1> " + "iommu-support=<0|1> " + "postcopy-support=<0|1>"); RTE_INIT(vhost_init_log) { diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index 10a7e3fc..e1fe36a2 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -588,6 +588,10 @@ virtio_dev_close(struct rte_eth_dev *dev) PMD_INIT_LOG(DEBUG, "virtio_dev_close"); + if (!hw->opened) + return; + hw->opened = false; + /* reset the NIC */ if (dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) VTPCI_OPS(hw)->set_config_irq(hw, VIRTIO_MSI_NO_VECTOR); @@ -1288,6 +1292,7 @@ virtio_interrupt_handler(void *param) struct rte_eth_dev *dev = param; struct virtio_hw *hw = dev->data->dev_private; uint8_t isr; + uint16_t status; /* Read interrupt status which clears interrupt */ isr = vtpci_isr(hw); @@ -1301,12 +1306,17 @@ virtio_interrupt_handler(void *param) _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL); - } - if (isr & VIRTIO_NET_S_ANNOUNCE) { - virtio_notify_peers(dev); - if (hw->cvq) - virtio_ack_link_announce(dev); + if (vtpci_with_feature(hw, VIRTIO_NET_F_STATUS)) { + vtpci_read_dev_config(hw, + offsetof(struct virtio_net_config, status), + &status, sizeof(status)); + if (status & VIRTIO_NET_S_ANNOUNCE) { + virtio_notify_peers(dev); + if (hw->cvq) + virtio_ack_link_announce(dev); + } + } } } @@ -1679,11 +1689,6 @@ eth_virtio_dev_init(struct rte_eth_dev *eth_dev) if (ret < 0) goto out; - /* Setup interrupt callback */ - if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) - rte_intr_callback_register(eth_dev->intr_handle, - virtio_interrupt_handler, eth_dev); - return 0; out: @@ -1706,11 +1711,6 @@ eth_virtio_dev_uninit(struct rte_eth_dev *eth_dev) eth_dev->tx_pkt_burst = NULL; eth_dev->rx_pkt_burst = NULL; - /* reset interrupt callback */ - if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) - rte_intr_callback_unregister(eth_dev->intr_handle, - virtio_interrupt_handler, - eth_dev); if (eth_dev->device) rte_pci_unmap_device(RTE_ETH_DEV_TO_PCI(eth_dev)); @@ -1928,6 +1928,8 @@ virtio_dev_configure(struct rte_eth_dev *dev) DEV_RX_OFFLOAD_VLAN_STRIP)) hw->use_simple_rx = 0; + hw->opened = true; + return 0; } @@ -1969,6 +1971,12 @@ virtio_dev_start(struct rte_eth_dev *dev) dev->data->dev_conf.intr_conf.rxq) { virtio_intr_disable(dev); + /* Setup interrupt callback */ + if (dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) + rte_intr_callback_register(dev->intr_handle, + virtio_interrupt_handler, + dev); + if (virtio_intr_enable(dev) < 0) { PMD_DRV_LOG(ERR, "interrupt enable failed"); return -EIO; @@ -2012,7 +2020,7 @@ virtio_dev_start(struct rte_eth_dev *dev) } set_rxtx_funcs(dev); - hw->started = 1; + hw->started = true; /* Initialize Link state */ virtio_dev_link_update(dev, 0); @@ -2078,12 +2086,24 @@ virtio_dev_stop(struct rte_eth_dev *dev) PMD_INIT_LOG(DEBUG, "stop"); rte_spinlock_lock(&hw->state_lock); - if (intr_conf->lsc || intr_conf->rxq) + if (!hw->started) + goto out_unlock; + hw->started = false; + + if (intr_conf->lsc || intr_conf->rxq) { virtio_intr_disable(dev); - hw->started = 0; + /* Reset interrupt callback */ + if (dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) { + rte_intr_callback_unregister(dev->intr_handle, + virtio_interrupt_handler, + dev); + } + } + memset(&link, 0, sizeof(link)); rte_eth_linkstatus_set(dev, &link); +out_unlock: rte_spinlock_unlock(&hw->state_lock); } @@ -2099,7 +2119,7 @@ virtio_dev_link_update(struct rte_eth_dev *dev, __rte_unused int wait_to_complet link.link_speed = ETH_SPEED_NUM_10G; link.link_autoneg = ETH_LINK_FIXED; - if (hw->started == 0) { + if (!hw->started) { link.link_status = ETH_LINK_DOWN; } else if (vtpci_with_feature(hw, VIRTIO_NET_F_STATUS)) { PMD_INIT_LOG(DEBUG, "Get link status from hw"); diff --git a/drivers/net/virtio/virtio_pci.c b/drivers/net/virtio/virtio_pci.c index b6a3c80b..21110cd6 100644 --- a/drivers/net/virtio/virtio_pci.c +++ b/drivers/net/virtio/virtio_pci.c @@ -166,12 +166,6 @@ legacy_set_status(struct virtio_hw *hw, uint8_t status) rte_pci_ioport_write(VTPCI_IO(hw), &status, 1, VIRTIO_PCI_STATUS); } -static void -legacy_reset(struct virtio_hw *hw) -{ - legacy_set_status(hw, VIRTIO_CONFIG_STATUS_RESET); -} - static uint8_t legacy_get_isr(struct virtio_hw *hw) { @@ -250,7 +244,6 @@ legacy_notify_queue(struct virtio_hw *hw, struct virtqueue *vq) const struct virtio_pci_ops legacy_ops = { .read_dev_cfg = legacy_read_dev_config, .write_dev_cfg = legacy_write_dev_config, - .reset = legacy_reset, .get_status = legacy_get_status, .set_status = legacy_set_status, .get_features = legacy_get_features, @@ -339,13 +332,6 @@ modern_set_status(struct virtio_hw *hw, uint8_t status) rte_write8(status, &hw->common_cfg->device_status); } -static void -modern_reset(struct virtio_hw *hw) -{ - modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET); - modern_get_status(hw); -} - static uint8_t modern_get_isr(struct virtio_hw *hw) { @@ -438,7 +424,6 @@ modern_notify_queue(struct virtio_hw *hw __rte_unused, struct virtqueue *vq) const struct virtio_pci_ops modern_ops = { .read_dev_cfg = modern_read_dev_config, .write_dev_cfg = modern_write_dev_config, - .reset = modern_reset, .get_status = modern_get_status, .set_status = modern_set_status, .get_features = modern_get_features, diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h index 58fdd3d4..e961a58c 100644 --- a/drivers/net/virtio/virtio_pci.h +++ b/drivers/net/virtio/virtio_pci.h @@ -204,7 +204,6 @@ struct virtio_pci_ops { void *dst, int len); void (*write_dev_cfg)(struct virtio_hw *hw, size_t offset, const void *src, int len); - void (*reset)(struct virtio_hw *hw); uint8_t (*get_status)(struct virtio_hw *hw); void (*set_status)(struct virtio_hw *hw, uint8_t status); @@ -232,7 +231,7 @@ struct virtio_hw { uint64_t req_guest_features; uint64_t guest_features; uint32_t max_queue_pairs; - uint16_t started; + bool started; uint16_t max_mtu; uint16_t vtnet_hdr_size; uint8_t vlan_strip; @@ -258,6 +257,7 @@ struct virtio_hw { */ rte_spinlock_t state_lock; struct rte_mbuf **inject_pkts; + bool opened; struct virtqueue **vqs; }; diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c index b4997ee3..20816c93 100644 --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c @@ -134,9 +134,6 @@ virtio_user_start_device(struct virtio_user_dev *dev) if (is_vhost_user_by_type(dev->path) && dev->vhostfd < 0) goto error; - /* Do not check return as already done in init, or reset in stop */ - dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL); - /* Step 0: tell vhost to create queues */ if (virtio_user_queue_setup(dev, virtio_user_create_queue) < 0) goto error; @@ -181,21 +178,34 @@ error: int virtio_user_stop_device(struct virtio_user_dev *dev) { + struct vhost_vring_state state; uint32_t i; + int error = 0; pthread_mutex_lock(&dev->mutex); + if (!dev->started) + goto out; + for (i = 0; i < dev->max_queue_pairs; ++i) dev->ops->enable_qp(dev, i, 0); - if (dev->ops->send_request(dev, VHOST_USER_RESET_OWNER, NULL) < 0) { - PMD_DRV_LOG(INFO, "Failed to reset the device\n"); - pthread_mutex_unlock(&dev->mutex); - return -1; + /* Stop the backend. */ + for (i = 0; i < dev->max_queue_pairs * 2; ++i) { + state.index = i; + if (dev->ops->send_request(dev, VHOST_USER_GET_VRING_BASE, + &state) < 0) { + PMD_DRV_LOG(ERR, "get_vring_base failed, index=%u\n", + i); + error = -1; + goto out; + } } + dev->started = false; +out: pthread_mutex_unlock(&dev->mutex); - return 0; + return error; } static inline void @@ -411,7 +421,8 @@ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, dev->queue_pairs = 1; /* mq disabled by default */ dev->queue_size = queue_size; dev->mac_specified = 0; - dev->unsupported_features = 0; + dev->frontend_features = 0; + dev->unsupported_features = ~VIRTIO_USER_SUPPORTED_FEATURES; parse_mac(dev, mac); if (*ifname) { @@ -447,37 +458,25 @@ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, dev->device_features = VIRTIO_USER_SUPPORTED_FEATURES; } - if (!mrg_rxbuf) { - dev->device_features &= ~(1ull << VIRTIO_NET_F_MRG_RXBUF); + if (!mrg_rxbuf) dev->unsupported_features |= (1ull << VIRTIO_NET_F_MRG_RXBUF); - } - if (!in_order) { - dev->device_features &= ~(1ull << VIRTIO_F_IN_ORDER); + if (!in_order) dev->unsupported_features |= (1ull << VIRTIO_F_IN_ORDER); - } - if (dev->mac_specified) { - dev->device_features |= (1ull << VIRTIO_NET_F_MAC); - } else { - dev->device_features &= ~(1ull << VIRTIO_NET_F_MAC); + if (dev->mac_specified) + dev->frontend_features |= (1ull << VIRTIO_NET_F_MAC); + else dev->unsupported_features |= (1ull << VIRTIO_NET_F_MAC); - } if (cq) { /* device does not really need to know anything about CQ, * so if necessary, we just claim to support CQ */ - dev->device_features |= (1ull << VIRTIO_NET_F_CTRL_VQ); + dev->frontend_features |= (1ull << VIRTIO_NET_F_CTRL_VQ); } else { - dev->device_features &= ~(1ull << VIRTIO_NET_F_CTRL_VQ); - /* Also disable features depends on VIRTIO_NET_F_CTRL_VQ */ - dev->device_features &= ~(1ull << VIRTIO_NET_F_CTRL_RX); - dev->device_features &= ~(1ull << VIRTIO_NET_F_CTRL_VLAN); - dev->device_features &= ~(1ull << VIRTIO_NET_F_GUEST_ANNOUNCE); - dev->device_features &= ~(1ull << VIRTIO_NET_F_MQ); - dev->device_features &= ~(1ull << VIRTIO_NET_F_CTRL_MAC_ADDR); dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_VQ); + /* Also disable features that depend on VIRTIO_NET_F_CTRL_VQ */ dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_RX); dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_VLAN); dev->unsupported_features |= @@ -489,10 +488,14 @@ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues, /* The backend will not report this feature, we add it explicitly */ if (is_vhost_user_by_type(dev->path)) - dev->device_features |= (1ull << VIRTIO_NET_F_STATUS); + dev->frontend_features |= (1ull << VIRTIO_NET_F_STATUS); - dev->device_features &= VIRTIO_USER_SUPPORTED_FEATURES; - dev->unsupported_features |= ~VIRTIO_USER_SUPPORTED_FEATURES; + /* + * Device features = + * (frontend_features | backend_features) & ~unsupported_features; + */ + dev->device_features |= dev->frontend_features; + dev->device_features &= ~dev->unsupported_features; if (rte_mem_event_callback_register(VIRTIO_USER_MEM_EVENT_CLB_NAME, virtio_user_mem_event_cb, dev)) { diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.h b/drivers/net/virtio/virtio_user/virtio_user_dev.h index d6e0e137..c42ce5d4 100644 --- a/drivers/net/virtio/virtio_user/virtio_user_dev.h +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.h @@ -33,6 +33,7 @@ struct virtio_user_dev { * and will be sync with device */ uint64_t device_features; /* supported features by device */ + uint64_t frontend_features; /* enabled frontend features */ uint64_t unsupported_features; /* unsupported features mask */ uint8_t status; uint16_t port_id; diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c index b51cbc85..61b7c0a3 100644 --- a/drivers/net/virtio/virtio_user_ethdev.c +++ b/drivers/net/virtio/virtio_user_ethdev.c @@ -28,7 +28,6 @@ static int virtio_user_server_reconnect(struct virtio_user_dev *dev) { int ret; - int flag; int connectfd; struct rte_eth_dev *eth_dev = &rte_eth_devices[dev->port_id]; @@ -44,14 +43,13 @@ virtio_user_server_reconnect(struct virtio_user_dev *dev) return -1; } + dev->device_features |= dev->frontend_features; + /* umask vhost-user unsupported features */ dev->device_features &= ~(dev->unsupported_features); dev->features &= dev->device_features; - flag = fcntl(connectfd, F_GETFD); - fcntl(connectfd, F_SETFL, flag | O_NONBLOCK); - ret = virtio_user_start_device(dev); if (ret < 0) return -1; @@ -331,7 +329,6 @@ virtio_user_notify_queue(struct virtio_hw *hw, struct virtqueue *vq) const struct virtio_pci_ops virtio_user_ops = { .read_dev_cfg = virtio_user_read_dev_config, .write_dev_cfg = virtio_user_write_dev_config, - .reset = virtio_user_reset, .get_status = virtio_user_get_status, .set_status = virtio_user_set_status, .get_features = virtio_user_get_features, diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.c b/drivers/net/vmxnet3/vmxnet3_ethdev.c index 41bcd450..84acd9db 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethdev.c +++ b/drivers/net/vmxnet3/vmxnet3_ethdev.c @@ -360,8 +360,10 @@ eth_vmxnet3_dev_uninit(struct rte_eth_dev *eth_dev) if (rte_eal_process_type() != RTE_PROC_PRIMARY) return 0; - if (hw->adapter_stopped == 0) - vmxnet3_dev_close(eth_dev); + if (hw->adapter_stopped == 0) { + PMD_INIT_LOG(DEBUG, "Device has not been closed."); + return -EBUSY; + } eth_dev->dev_ops = NULL; eth_dev->rx_pkt_burst = NULL; @@ -805,7 +807,7 @@ vmxnet3_dev_stop(struct rte_eth_dev *dev) PMD_INIT_FUNC_TRACE(); if (hw->adapter_stopped == 1) { - PMD_INIT_LOG(DEBUG, "Device already closed."); + PMD_INIT_LOG(DEBUG, "Device already stopped."); return; } @@ -829,7 +831,6 @@ vmxnet3_dev_stop(struct rte_eth_dev *dev) /* reset the device */ VMXNET3_WRITE_BAR1_REG(hw, VMXNET3_REG_CMD, VMXNET3_CMD_RESET_DEV); PMD_INIT_LOG(DEBUG, "Device reset."); - hw->adapter_stopped = 0; vmxnet3_dev_clear_queues(dev); @@ -839,6 +840,30 @@ vmxnet3_dev_stop(struct rte_eth_dev *dev) link.link_speed = ETH_SPEED_NUM_10G; link.link_autoneg = ETH_LINK_FIXED; rte_eth_linkstatus_set(dev, &link); + + hw->adapter_stopped = 1; +} + +static void +vmxnet3_free_queues(struct rte_eth_dev *dev) +{ + int i; + + PMD_INIT_FUNC_TRACE(); + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + void *rxq = dev->data->rx_queues[i]; + + vmxnet3_dev_rx_queue_release(rxq); + } + dev->data->nb_rx_queues = 0; + + for (i = 0; i < dev->data->nb_tx_queues; i++) { + void *txq = dev->data->tx_queues[i]; + + vmxnet3_dev_tx_queue_release(txq); + } + dev->data->nb_tx_queues = 0; } /* @@ -847,12 +872,16 @@ vmxnet3_dev_stop(struct rte_eth_dev *dev) static void vmxnet3_dev_close(struct rte_eth_dev *dev) { - struct vmxnet3_hw *hw = dev->data->dev_private; - PMD_INIT_FUNC_TRACE(); vmxnet3_dev_stop(dev); - hw->adapter_stopped = 1; + vmxnet3_free_queues(dev); + + /* + * flag to rte_eth_dev_close() that it should release the port resources + * (calling rte_eth_dev_release_port()) in addition to closing it. + */ + dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; } static void |