Diffstat (limited to 'drivers/net/mlx5')
26 files changed, 8910 insertions, 3638 deletions
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index 2e70dec5..fecb57c1 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -8,7 +8,7 @@ include $(RTE_SDK)/mk/rte.vars.mk LIB = librte_pmd_mlx5.a LIB_GLUE = $(LIB_GLUE_BASE).$(LIB_GLUE_VERSION) LIB_GLUE_BASE = librte_pmd_mlx5_glue.so -LIB_GLUE_VERSION = 18.05.0 +LIB_GLUE_VERSION = 18.11.0 # Sources. SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c @@ -31,9 +31,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_dv.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_tcf.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_verbs.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c -SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl_flow.c ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y) INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE) @@ -135,6 +137,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_IBV_FLOW_DV_SUPPORT \ + infiniband/mlx5dv.h \ + enum MLX5DV_FLOW_ACTION_TAG \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_ETHTOOL_LINK_MODE_25G \ /usr/include/linux/ethtool.h \ enum ETHTOOL_LINK_MODE_25000baseCR_Full_BIT \ @@ -150,11 +157,16 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ - HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT \ + HAVE_IBV_DEVICE_COUNTERS_SET_V42 \ infiniband/verbs.h \ type 'struct ibv_counter_set_init_attr' \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_IBV_DEVICE_COUNTERS_SET_V45 \ + infiniband/verbs.h \ + type 'struct ibv_counters_init_attr' \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_RDMA_NL_NLDEV \ rdma/rdma_netlink.h \ enum RDMA_NL_NLDEV \ @@ -200,6 +212,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum IFLA_PHYS_PORT_NAME \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_TCA_CHAIN \ + linux/rtnetlink.h \ + enum TCA_CHAIN \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_TCA_FLOWER_ACT \ linux/pkt_cls.h \ enum TCA_FLOWER_ACT \ @@ -335,11 +352,31 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh enum TCA_FLOWER_KEY_VLAN_ETH_TYPE \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_TCP_FLAGS \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_TCP_FLAGS \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_TCP_FLAGS_MASK \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_ACT_GOTO_CHAIN \ + linux/pkt_cls.h \ + define TC_ACT_GOTO_CHAIN \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_TC_ACT_VLAN \ linux/tc_act/tc_vlan.h \ enum TCA_VLAN_PUSH_VLAN_PRIORITY \ $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ + HAVE_TC_ACT_PEDIT \ + linux/tc_act/tc_pedit.h \ + enum TCA_PEDIT_KEY_EX_HDR_TYPE_UDP \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ HAVE_SUPPORTED_40000baseKR4_Full \ /usr/include/linux/ethtool.h \ define SUPPORTED_40000baseKR4_Full \ diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build new file mode 100644 index 00000000..e8cbe3ee --- /dev/null +++ b/drivers/net/mlx5/meson.build @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2018 6WIND S.A. 
+# Copyright 2018 Mellanox Technologies, Ltd + +pmd_dlopen = get_option('enable_driver_mlx_glue') +LIB_GLUE_BASE = 'librte_pmd_mlx5_glue.so' +LIB_GLUE_VERSION = '18.11.0' +LIB_GLUE = LIB_GLUE_BASE + '.' + LIB_GLUE_VERSION +if pmd_dlopen + dpdk_conf.set('RTE_LIBRTE_MLX5_DLOPEN_DEPS', 1) + cflags += [ + '-DMLX5_GLUE="@0@"'.format(LIB_GLUE), + '-DMLX5_GLUE_VERSION="@0@"'.format(LIB_GLUE_VERSION), + ] +endif +libs = [ + cc.find_library('mnl', required:false), + cc.find_library('mlx5', required:false), + cc.find_library('ibverbs', required:false), +] +build = true +foreach lib:libs + if not lib.found() + build = false + endif +endforeach +if build + allow_experimental_apis = true + ext_deps += libs + sources = files( + 'mlx5.c', + 'mlx5_ethdev.c', + 'mlx5_flow.c', + 'mlx5_flow_dv.c', + 'mlx5_flow_tcf.c', + 'mlx5_flow_verbs.c', + 'mlx5_mac.c', + 'mlx5_mr.c', + 'mlx5_nl.c', + 'mlx5_rss.c', + 'mlx5_rxmode.c', + 'mlx5_rxq.c', + 'mlx5_rxtx.c', + 'mlx5_socket.c', + 'mlx5_stats.c', + 'mlx5_trigger.c', + 'mlx5_txq.c', + 'mlx5_vlan.c', + ) + if dpdk_conf.has('RTE_ARCH_X86_64') or dpdk_conf.has('RTE_ARCH_ARM64') + sources += files('mlx5_rxtx_vec.c') + endif + if not pmd_dlopen + sources += files('mlx5_glue.c') + endif + cflags_options = [ + '-Wextra', + '-std=c11', + '-Wno-strict-prototypes', + '-D_BSD_SOURCE', + '-D_DEFAULT_SOURCE', + '-D_XOPEN_SOURCE=600' + ] + foreach option:cflags_options + if cc.has_argument(option) + cflags += option + endif + endforeach + if get_option('buildtype').contains('debug') + cflags += [ '-pedantic', '-UNDEBUG', '-DPEDANTIC' ] + else + cflags += [ '-DNDEBUG', '-UPEDANTIC' ] + endif + # To maintain the compatibility with the make build system + # mlx5_autoconf.h file is still generated. + # input array for meson member search: + # [ "MACRO to define if found", "header for the search", + # "symbol to search", "struct member to search" ] + has_member_args = [ + [ 'HAVE_IBV_MLX5_MOD_SWP', 'infiniband/mlx5dv.h', + 'struct mlx5dv_sw_parsing_caps', 'sw_parsing_offloads' ], + [ 'HAVE_IBV_DEVICE_COUNTERS_SET_V42', 'infiniband/verbs.h', + 'struct ibv_counter_set_init_attr', 'counter_set_id' ], + [ 'HAVE_IBV_DEVICE_COUNTERS_SET_V45', 'infiniband/verbs.h', + 'struct ibv_counters_init_attr', 'comp_mask' ], + ] + # input array for meson symbol search: + # [ "MACRO to define if found", "header for the search", + # "symbol to search" ] + has_sym_args = [ + [ 'HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT', 'infiniband/mlx5dv.h', + 'MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX' ], + [ 'HAVE_IBV_DEVICE_TUNNEL_SUPPORT', 'infiniband/mlx5dv.h', + 'MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS' ], + [ 'HAVE_IBV_MLX5_MOD_MPW', 'infiniband/mlx5dv.h', + 'MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED' ], + [ 'HAVE_IBV_MLX5_MOD_CQE_128B_COMP', 'infiniband/mlx5dv.h', + 'MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP' ], + [ 'HAVE_IBV_FLOW_DV_SUPPORT', 'infiniband/mlx5dv.h', + 'MLX5DV_FLOW_ACTION_TAG' ], + [ 'HAVE_IBV_DEVICE_MPLS_SUPPORT', 'infiniband/verbs.h', + 'IBV_FLOW_SPEC_MPLS' ], + [ 'HAVE_IBV_WQ_FLAG_RX_END_PADDING', 'infiniband/verbs.h', + 'IBV_WQ_FLAG_RX_END_PADDING' ], + [ 'HAVE_SUPPORTED_40000baseKR4_Full', 'linux/ethtool.h', + 'SUPPORTED_40000baseKR4_Full' ], + [ 'HAVE_SUPPORTED_40000baseCR4_Full', 'linux/ethtool.h', + 'SUPPORTED_40000baseCR4_Full' ], + [ 'HAVE_SUPPORTED_40000baseSR4_Full', 'linux/ethtool.h', + 'SUPPORTED_40000baseSR4_Full' ], + [ 'HAVE_SUPPORTED_40000baseLR4_Full', 'linux/ethtool.h', + 'SUPPORTED_40000baseLR4_Full' ], + [ 'HAVE_SUPPORTED_56000baseKR4_Full', 'linux/ethtool.h', + 'SUPPORTED_56000baseKR4_Full' ], + [ 
'HAVE_SUPPORTED_56000baseCR4_Full', 'linux/ethtool.h', + 'SUPPORTED_56000baseCR4_Full' ], + [ 'HAVE_SUPPORTED_56000baseSR4_Full', 'linux/ethtool.h', + 'SUPPORTED_56000baseSR4_Full' ], + [ 'HAVE_SUPPORTED_56000baseLR4_Full', 'linux/ethtool.h', + 'SUPPORTED_56000baseLR4_Full' ], + [ 'HAVE_ETHTOOL_LINK_MODE_25G', 'linux/ethtool.h', + 'ETHTOOL_LINK_MODE_25000baseCR_Full_BIT' ], + [ 'HAVE_ETHTOOL_LINK_MODE_50G', 'linux/ethtool.h', + 'ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT' ], + [ 'HAVE_ETHTOOL_LINK_MODE_100G', 'linux/ethtool.h', + 'ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT' ], + [ 'HAVE_IFLA_PHYS_SWITCH_ID', 'linux/if_link.h', + 'IFLA_PHYS_SWITCH_ID' ], + [ 'HAVE_IFLA_PHYS_PORT_NAME', 'linux/if_link.h', + 'IFLA_PHYS_PORT_NAME' ], + [ 'HAVE_TCA_CHAIN', 'linux/rtnetlink.h', + 'TCA_CHAIN' ], + [ 'HAVE_TCA_FLOWER_ACT', 'linux/pkt_cls.h', + 'TCA_FLOWER_ACT' ], + [ 'HAVE_TCA_FLOWER_FLAGS', 'linux/pkt_cls.h', + 'TCA_FLOWER_FLAGS' ], + [ 'HAVE_TCA_FLOWER_KEY_ETH_TYPE', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ETH_TYPE' ], + [ 'HAVE_TCA_FLOWER_KEY_ETH_DST', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ETH_DST' ], + [ 'HAVE_TCA_FLOWER_KEY_ETH_DST_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ETH_DST_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_ETH_SRC', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ETH_SRC' ], + [ 'HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_ETH_SRC_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_IP_PROTO', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IP_PROTO' ], + [ 'HAVE_TCA_FLOWER_KEY_IPV4_SRC', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IPV4_SRC' ], + [ 'HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IPV4_SRC_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_IPV4_DST', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IPV4_DST' ], + [ 'HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IPV4_DST_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_IPV6_SRC', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IPV6_SRC' ], + [ 'HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IPV6_SRC_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_IPV6_DST', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IPV6_DST' ], + [ 'HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_IPV6_DST_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_TCP_SRC', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_TCP_SRC' ], + [ 'HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_TCP_SRC_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_TCP_DST', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_TCP_DST' ], + [ 'HAVE_TCA_FLOWER_KEY_TCP_DST_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_TCP_DST_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_UDP_SRC', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_UDP_SRC' ], + [ 'HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_UDP_SRC_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_UDP_DST', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_UDP_DST' ], + [ 'HAVE_TCA_FLOWER_KEY_UDP_DST_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_UDP_DST_MASK' ], + [ 'HAVE_TCA_FLOWER_KEY_VLAN_ID', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_VLAN_ID' ], + [ 'HAVE_TCA_FLOWER_KEY_VLAN_PRIO', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_VLAN_PRIO' ], + [ 'HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_VLAN_ETH_TYPE' ], + [ 'HAVE_TCA_FLOWER_KEY_TCP_FLAGS', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_TCP_FLAGS' ], + [ 'HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK', 'linux/pkt_cls.h', + 'TCA_FLOWER_KEY_TCP_FLAGS_MASK' ], + [ 'HAVE_TC_ACT_GOTO_CHAIN', 'linux/pkt_cls.h', + 'TC_ACT_GOTO_CHAIN' ], + [ 'HAVE_TC_ACT_VLAN', 'linux/tc_act/tc_vlan.h', + 'TCA_VLAN_PUSH_VLAN_PRIORITY' ], + [ 'HAVE_TC_ACT_PEDIT', 
'linux/tc_act/tc_pedit.h', + 'TCA_PEDIT_KEY_EX_HDR_TYPE_UDP' ], + [ 'HAVE_RDMA_NL_NLDEV', 'rdma/rdma_netlink.h', + 'RDMA_NL_NLDEV' ], + [ 'HAVE_RDMA_NLDEV_CMD_GET', 'rdma/rdma_netlink.h', + 'RDMA_NLDEV_CMD_GET' ], + [ 'HAVE_RDMA_NLDEV_CMD_PORT_GET', 'rdma/rdma_netlink.h', + 'RDMA_NLDEV_CMD_PORT_GET' ], + [ 'HAVE_RDMA_NLDEV_ATTR_DEV_INDEX', 'rdma/rdma_netlink.h', + 'RDMA_NLDEV_ATTR_DEV_INDEX' ], + [ 'HAVE_RDMA_NLDEV_ATTR_DEV_NAME', 'rdma/rdma_netlink.h', + 'RDMA_NLDEV_ATTR_DEV_NAME' ], + [ 'HAVE_RDMA_NLDEV_ATTR_PORT_INDEX', 'rdma/rdma_netlink.h', + 'RDMA_NLDEV_ATTR_PORT_INDEX' ], + [ 'HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX', 'rdma/rdma_netlink.h', + 'RDMA_NLDEV_ATTR_NDEV_INDEX' ], + ] + config = configuration_data() + foreach arg:has_sym_args + config.set(arg[0], cc.has_header_symbol(arg[1], arg[2])) + endforeach + foreach arg:has_member_args + file_prefix = '#include<' + arg[1] + '>' + config.set(arg[0], cc.has_member(arg[2], arg[3], + prefix : file_prefix)) + endforeach + configure_file(output : 'mlx5_autoconf.h', configuration : config) +endif +# Build Glue Library +if pmd_dlopen and build + dlopen_name = 'mlx5_glue' + dlopen_lib_name = driver_name_fmt.format(dlopen_name) + dlopen_so_version = LIB_GLUE_VERSION + dlopen_sources = files('mlx5_glue.c') + dlopen_install_dir = [ eal_pmd_path + '-glue' ] + shared_lib = shared_library( + dlopen_lib_name, + dlopen_sources, + include_directories: global_inc, + c_args: cflags, + dependencies: libs, + link_args: [ + '-Wl,-export-dynamic', + '-Wl,-h,@0@'.format(LIB_GLUE), + ], + soversion: dlopen_so_version, + install: true, + install_dir: dlopen_install_dir, + ) +endif diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index ec63bc6e..a277b573 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -46,6 +46,7 @@ #include "mlx5_defs.h" #include "mlx5_glue.h" #include "mlx5_mr.h" +#include "mlx5_flow.h" /* Device parameter to enable RX completion queue compression. */ #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en" @@ -89,6 +90,9 @@ /* Allow L3 VXLAN flow creation. */ #define MLX5_L3_VXLAN_EN "l3_vxlan_en" +/* Activate DV flow steering. */ +#define MLX5_DV_FLOW_EN "dv_flow_en" + /* Activate Netlink support in VF mode. */ #define MLX5_VF_NL_EN "vf_nl_en" @@ -282,8 +286,8 @@ mlx5_dev_close(struct rte_eth_dev *dev) close(priv->nl_socket_route); if (priv->nl_socket_rdma >= 0) close(priv->nl_socket_rdma); - if (priv->mnl_socket) - mlx5_nl_flow_socket_destroy(priv->mnl_socket); + if (priv->tcf_context) + mlx5_flow_tcf_context_destroy(priv->tcf_context); ret = mlx5_hrxq_ibv_verify(dev); if (ret) DRV_LOG(WARNING, "port %u some hash Rx queue still remain", @@ -333,6 +337,17 @@ mlx5_dev_close(struct rte_eth_dev *dev) } memset(priv, 0, sizeof(*priv)); priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + /* + * flag to rte_eth_dev_close() that it should release the port resources + * (calling rte_eth_dev_release_port()) in addition to closing it. + */ + dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; + /* + * Reset mac_addrs to NULL such that it is not freed as part of + * rte_eth_dev_release_port(). mac_addrs is part of dev_private so + * it is freed when dev_private is freed. + */ + dev->data->mac_addrs = NULL; } const struct eth_dev_ops mlx5_dev_ops = { @@ -477,7 +492,7 @@ mlx5_args_check(const char *key, const char *val, void *opaque) } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) { config->txqs_inline = tmp; } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { - config->mps = !!tmp ? 
config->mps : 0; + config->mps = !!tmp; } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { config->mpw_hdr_dseg = !!tmp; } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { @@ -490,6 +505,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque) config->l3_vxlan_en = !!tmp; } else if (strcmp(MLX5_VF_NL_EN, key) == 0) { config->vf_nl_en = !!tmp; + } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) { + config->dv_flow_en = !!tmp; } else { DRV_LOG(WARNING, "%s: unknown parameter", key); rte_errno = EINVAL; @@ -527,6 +544,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) MLX5_RX_VEC_EN, MLX5_L3_VXLAN_EN, MLX5_VF_NL_EN, + MLX5_DV_FLOW_EN, MLX5_REPRESENTOR, NULL, }; @@ -568,11 +586,13 @@ static struct rte_pci_driver mlx5_driver; static void *uar_base; static int -find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused, +find_lower_va_bound(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { void **addr = arg; + if (msl->external) + return 0; if (*addr == NULL) *addr = ms->addr; else @@ -685,9 +705,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev) * * @return * A valid Ethernet device object on success, NULL otherwise and rte_errno - * is set. The following error is defined: + * is set. The following errors are defined: * * EBUSY: device is not supposed to be spawned. + * EEXIST: device is already spawned */ static struct rte_eth_dev * mlx5_dev_spawn(struct rte_device *dpdk_dev, @@ -702,6 +723,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, struct mlx5dv_context dv_attr = { .comp_mask = 0 }; struct mlx5_dev_config config = { .vf = !!vf, + .mps = MLX5_ARG_UNSET, .tx_vec_en = 1, .rx_vec_en = 1, .mpw_hdr_dseg = 0, @@ -729,12 +751,10 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, unsigned int mprq_max_stride_size_n = 0; unsigned int mprq_min_stride_num_n = 0; unsigned int mprq_max_stride_num_n = 0; -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - struct ibv_counter_set_description cs_desc = { .counter_type = 0 }; -#endif struct ether_addr mac; char name[RTE_ETH_NAME_MAX_LEN]; int own_domain_id = 0; + uint16_t port_id; unsigned int i; /* Determine if this port representor is supposed to be spawned. */ @@ -757,6 +777,17 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, return NULL; } } + /* Build device name. */ + if (!switch_info->representor) + rte_strlcpy(name, dpdk_dev->name, sizeof(name)); + else + snprintf(name, sizeof(name), "%s_representor_%u", + dpdk_dev->name, switch_info->port_name); + /* check if the device is already spawned */ + if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { + rte_errno = EEXIST; + return NULL; + } /* Prepare shared data between primary and secondary process. 
*/ mlx5_prepare_shared_data(); errno = 0; @@ -791,7 +822,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, DRV_LOG(DEBUG, "MPW isn't supported"); mps = MLX5_MPW_DISABLED; } - config.mps = mps; #ifdef HAVE_IBV_MLX5_MOD_SWP if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; @@ -864,11 +894,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, DEBUG("ibv_query_device_ex() failed"); goto error; } - if (!switch_info->representor) - rte_strlcpy(name, dpdk_dev->name, sizeof(name)); - else - snprintf(name, sizeof(name), "%s_representor_%u", - dpdk_dev->name, switch_info->port_name); DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); if (rte_eal_process_type() == RTE_PROC_SECONDARY) { eth_dev = rte_eth_dev_attach_secondary(name); @@ -1000,12 +1025,15 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, config.hw_csum = !!(attr.device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM); DRV_LOG(DEBUG, "checksum offloading is %ssupported", (config.hw_csum ? "" : "not ")); -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - config.flow_counter_en = !!attr.max_counter_sets; - mlx5_glue->describe_counter_set(ctx, 0, &cs_desc); - DRV_LOG(DEBUG, "counter type = %d, num of cs = %ld, attributes = %d", - cs_desc.counter_type, cs_desc.num_of_cs, - cs_desc.attributes); +#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ + !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + DRV_LOG(DEBUG, "counters are not supported"); +#endif +#ifndef HAVE_IBV_FLOW_DV_SUPPORT + if (config.dv_flow_en) { + DRV_LOG(WARNING, "DV flow is not supported"); + config.dv_flow_en = 0; + } #endif config.ind_table_max_size = attr.rss_caps.max_rwq_indirection_table_size; @@ -1035,13 +1063,15 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, (1 << IBV_QPT_RAW_PACKET))); if (config.tso) config.tso_max_payload_sz = attr.tso_caps.max_tso; - if (config.mps && !mps) { - DRV_LOG(ERR, - "multi-packet send not supported on this device" - " (" MLX5_TXQ_MPW_EN ")"); - err = ENOTSUP; - goto error; - } + /* + * MPW is disabled by default, while the Enhanced MPW is enabled + * by default. + */ + if (config.mps == MLX5_ARG_UNSET) + config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : + MLX5_MPW_DISABLED; + else + config.mps = config.mps ? mps : MLX5_MPW_DISABLED; DRV_LOG(INFO, "%sMPS is %s", config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "", config.mps != MLX5_MPW_DISABLED ? 
"enabled" : "disabled"); @@ -1073,13 +1103,14 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, err = ENOMEM; goto error; } - if (priv->representor) + if (priv->representor) { eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; + eth_dev->data->representor_id = priv->representor_id; + } eth_dev->data->dev_private = priv; priv->dev_data = eth_dev->data; eth_dev->data->mac_addrs = priv->mac; eth_dev->device = dpdk_dev; - eth_dev->device->driver = &mlx5_driver.driver; err = mlx5_uar_init_primary(eth_dev); if (err) { err = rte_errno; @@ -1128,8 +1159,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); if (vf && config.vf_nl_en) mlx5_nl_mac_addr_sync(eth_dev); - priv->mnl_socket = mlx5_nl_flow_socket_create(); - if (!priv->mnl_socket) { + priv->tcf_context = mlx5_flow_tcf_context_create(); + if (!priv->tcf_context) { err = -rte_errno; DRV_LOG(WARNING, "flow rules relying on switch offloads will not be" @@ -1144,16 +1175,16 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, error.message = "cannot retrieve network interface index"; } else { - err = mlx5_nl_flow_init(priv->mnl_socket, ifindex, - &error); + err = mlx5_flow_tcf_init(priv->tcf_context, + ifindex, &error); } if (err) { DRV_LOG(WARNING, "flow rules relying on switch offloads will" " not be supported: %s: %s", error.message, strerror(rte_errno)); - mlx5_nl_flow_socket_destroy(priv->mnl_socket); - priv->mnl_socket = NULL; + mlx5_flow_tcf_context_destroy(priv->tcf_context); + priv->tcf_context = NULL; } } TAILQ_INIT(&priv->flows); @@ -1208,16 +1239,21 @@ error: close(priv->nl_socket_route); if (priv->nl_socket_rdma >= 0) close(priv->nl_socket_rdma); - if (priv->mnl_socket) - mlx5_nl_flow_socket_destroy(priv->mnl_socket); + if (priv->tcf_context) + mlx5_flow_tcf_context_destroy(priv->tcf_context); if (own_domain_id) claim_zero(rte_eth_switch_domain_free(priv->domain_id)); rte_free(priv); + if (eth_dev != NULL) + eth_dev->data->dev_private = NULL; } if (pd) claim_zero(mlx5_glue->dealloc_pd(pd)); - if (eth_dev) + if (eth_dev != NULL) { + /* mac_addrs must not be freed alone because part of dev_private */ + eth_dev->data->mac_addrs = NULL; rte_eth_dev_release_port(eth_dev); + } if (ctx) claim_zero(mlx5_glue->close_device(ctx)); assert(err > 0); @@ -1404,9 +1440,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, list[i].eth_dev = mlx5_dev_spawn (&pci_dev->device, list[i].ibv_dev, vf, &list[i].info); if (!list[i].eth_dev) { - if (rte_errno != EBUSY) + if (rte_errno != EBUSY && rte_errno != EEXIST) break; - /* Device is disabled, ignore it. */ + /* Device is disabled or already spawned. Ignore it. */ continue; } restore = list[i].eth_dev->data->dev_flags; @@ -1437,8 +1473,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, if (!list[i].eth_dev) continue; mlx5_dev_close(list[i].eth_dev); - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - rte_free(list[i].eth_dev->data->dev_private); + /* mac_addrs must not be freed because in dev_private */ + list[i].eth_dev->data->mac_addrs = NULL; claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); } /* Restore original error. */ @@ -1449,6 +1485,32 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, return ret; } +/** + * DPDK callback to remove a PCI device. + * + * This function removes all Ethernet devices belong to a given PCI device. + * + * @param[in] pci_dev + * Pointer to the PCI device. + * + * @return + * 0 on success, the function cannot fail. 
+ */ +static int +mlx5_pci_remove(struct rte_pci_device *pci_dev) +{ + uint16_t port_id; + struct rte_eth_dev *port; + + for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) { + port = &rte_eth_devices[port_id]; + if (port->state != RTE_ETH_DEV_UNUSED && + port->device == &pci_dev->device) + rte_eth_dev_close(port_id); + } + return 0; +} + static const struct rte_pci_id mlx5_pci_id_map[] = { { RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, @@ -1487,6 +1549,10 @@ static const struct rte_pci_id mlx5_pci_id_map[] = { PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) }, { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF) + }, + { .vendor_id = 0 } }; @@ -1497,7 +1563,9 @@ static struct rte_pci_driver mlx5_driver = { }, .id_table = mlx5_pci_id_map, .probe = mlx5_pci_probe, - .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV, + .remove = mlx5_pci_remove, + .drv_flags = (RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV | + RTE_PCI_DRV_PROBE_AGAIN), }; #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index a3a34cff..74d87c05 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -51,6 +51,7 @@ enum { PCI_DEVICE_ID_MELLANOX_CONNECTX5EX = 0x1019, PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF = 0x101a, PCI_DEVICE_ID_MELLANOX_CONNECTX5BF = 0xa2d2, + PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF = 0xa2d3, }; /** Switch information returned by mlx5_nl_switch_info(). */ @@ -71,12 +72,23 @@ struct mlx5_shared_data { extern struct mlx5_shared_data *mlx5_shared_data; +struct mlx5_counter_ctrl { + /* Name of the counter. */ + char dpdk_name[RTE_ETH_XSTATS_NAME_SIZE]; + /* Name of the counter on the device table. */ + char ctr_name[RTE_ETH_XSTATS_NAME_SIZE]; + uint32_t ib:1; /**< Nonzero for IB counters. */ +}; + struct mlx5_xstats_ctrl { /* Number of device stats. */ uint16_t stats_n; + /* Number of device stats identified by PMD. */ + uint16_t mlx5_stats_n; /* Index in the device counters table. */ uint16_t dev_table_idx[MLX5_MAX_XSTATS]; uint64_t base[MLX5_MAX_XSTATS]; + struct mlx5_counter_ctrl info[MLX5_MAX_XSTATS]; }; /* Flow list . */ @@ -99,11 +111,9 @@ struct mlx5_dev_config { unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */ unsigned int hw_padding:1; /* End alignment padding is supported. */ unsigned int vf:1; /* This is a VF. */ - unsigned int mps:2; /* Multi-packet send supported mode. */ unsigned int tunnel_en:1; /* Whether tunnel stateless offloads are supported. */ unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */ - unsigned int flow_counter_en:1; /* Whether flow counter is supported. */ unsigned int cqe_comp:1; /* CQE compression is enabled. */ unsigned int tso:1; /* Whether TSO is supported. */ unsigned int tx_vec_en:1; /* Tx vector is enabled. */ @@ -111,6 +121,7 @@ struct mlx5_dev_config { unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */ unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */ unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */ + unsigned int dv_flow_en:1; /* Enable DV flow. */ unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */ struct { unsigned int enabled:1; /* Whether MPRQ is enabled. */ @@ -122,6 +133,7 @@ struct mlx5_dev_config { unsigned int min_rxqs_num; /* Rx queue count threshold to enable MPRQ. */ } mprq; /* Configurations for Multi-Packet RQ. */ + int mps; /* Multi-packet send supported mode. */ unsigned int flow_prio; /* Number of flow priorities. */ unsigned int tso_max_payload_sz; /* Maximum TCP payload for TSO. 
*/ unsigned int ind_table_max_size; /* Maximum indirection table size. */ @@ -156,13 +168,7 @@ struct mlx5_drop { struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */ }; -/** DPDK port to network interface index (ifindex) conversion. */ -struct mlx5_nl_flow_ptoi { - uint16_t port_id; /**< DPDK port ID. */ - unsigned int ifindex; /**< Network interface index. */ -}; - -struct mnl_socket; +struct mlx5_flow_tcf_context; struct priv { LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */ @@ -212,6 +218,7 @@ struct priv { LIST_HEAD(txqibv, mlx5_txq_ibv) txqsibv; /* Verbs Tx queues. */ /* Verbs Indirection tables. */ LIST_HEAD(ind_tables, mlx5_ind_table_ibv) ind_tbls; + LIST_HEAD(matchers, mlx5_flow_dv_matcher) matchers; uint32_t link_speed_capa; /* Link speed capabilities. */ struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */ int primary_socket; /* Unix socket for primary process. */ @@ -228,7 +235,7 @@ struct priv { rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX]; /* UAR same-page access control required in 32bit implementations. */ #endif - struct mnl_socket *mnl_socket; /* Libmnl socket. */ + struct mlx5_flow_tcf_context *tcf_context; /* TC flower context. */ }; #define PORT_ID(priv) ((priv)->dev_data->port_id) @@ -240,12 +247,9 @@ int mlx5_getenv_int(const char *); /* mlx5_ethdev.c */ -int mlx5_get_master_ifname(const struct rte_eth_dev *dev, - char (*ifname)[IF_NAMESIZE]); int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]); unsigned int mlx5_ifindex(const struct rte_eth_dev *dev); -int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr, - int master); +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr); int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu); int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags); @@ -396,23 +400,4 @@ unsigned int mlx5_nl_ifindex(int nl, const char *name); int mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info); -/* mlx5_nl_flow.c */ - -int mlx5_nl_flow_transpose(void *buf, - size_t size, - const struct mlx5_nl_flow_ptoi *ptoi, - const struct rte_flow_attr *attr, - const struct rte_flow_item *pattern, - const struct rte_flow_action *actions, - struct rte_flow_error *error); -void mlx5_nl_flow_brand(void *buf, uint32_t handle); -int mlx5_nl_flow_create(struct mnl_socket *nl, void *buf, - struct rte_flow_error *error); -int mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf, - struct rte_flow_error *error); -int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex, - struct rte_flow_error *error); -struct mnl_socket *mlx5_nl_flow_socket_create(void); -void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl); - #endif /* RTE_PMD_MLX5_H_ */ diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 34c5b95e..d178ed6a 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -3,8 +3,6 @@ * Copyright 2015 Mellanox Technologies, Ltd */ -#define _GNU_SOURCE - #include <stddef.h> #include <assert.h> #include <inttypes.h> @@ -129,7 +127,7 @@ struct ethtool_link_settings { * @return * 0 on success, a negative errno value otherwise and rte_errno is set. */ -int +static int mlx5_get_master_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]) { @@ -270,16 +268,12 @@ mlx5_ifindex(const struct rte_eth_dev *dev) * Request number to pass to ioctl(). * @param[out] ifr * Interface request structure output buffer. 
- * @param master - * When device is a port representor, perform request on master device - * instead. * * @return * 0 on success, a negative errno value otherwise and rte_errno is set. */ int -mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr, - int master) +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr) { int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); int ret = 0; @@ -288,10 +282,7 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr, rte_errno = errno; return -rte_errno; } - if (master) - ret = mlx5_get_master_ifname(dev, &ifr->ifr_name); - else - ret = mlx5_get_ifname(dev, &ifr->ifr_name); + ret = mlx5_get_ifname(dev, &ifr->ifr_name); if (ret) goto error; ret = ioctl(sock, req, ifr); @@ -321,7 +312,7 @@ int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu) { struct ifreq request; - int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0); + int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request); if (ret) return ret; @@ -345,7 +336,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) { struct ifreq request = { .ifr_mtu = mtu, }; - return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0); + return mlx5_ifreq(dev, SIOCSIFMTU, &request); } /** @@ -365,13 +356,13 @@ int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags) { struct ifreq request; - int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0); + int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request); if (ret) return ret; request.ifr_flags &= keep; request.ifr_flags |= flags & ~keep; - return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0); + return mlx5_ifreq(dev, SIOCSIFFLAGS, &request); } /** @@ -627,17 +618,20 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int link_speed = 0; int ret; - ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1); + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); if (ret) { DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", dev->data->port_id, strerror(rte_errno)); return ret; } - memset(&dev_link, 0, sizeof(dev_link)); - dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && - (ifr.ifr_flags & IFF_RUNNING)); - ifr.ifr_data = (void *)&edata; - ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + dev_link = (struct rte_eth_link) { + .link_status = ((ifr.ifr_flags & IFF_UP) && + (ifr.ifr_flags & IFF_RUNNING)), + }; + ifr = (struct ifreq) { + .ifr_data = (void *)&edata, + }; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); if (ret) { DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", @@ -666,8 +660,8 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & ETH_LINK_SPEED_FIXED); - if ((dev_link.link_speed && !dev_link.link_status) || - (!dev_link.link_speed && dev_link.link_status)) { + if (((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status))) { rte_errno = EAGAIN; return -rte_errno; } @@ -698,17 +692,20 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, uint64_t sc; int ret; - ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1); + ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr); if (ret) { DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s", dev->data->port_id, strerror(rte_errno)); return ret; } - memset(&dev_link, 0, sizeof(dev_link)); - dev_link.link_status = ((ifr.ifr_flags & IFF_UP) && - (ifr.ifr_flags & IFF_RUNNING)); - ifr.ifr_data = (void *)&gcmd; - ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + dev_link = (struct rte_eth_link) { + .link_status = ((ifr.ifr_flags & 
IFF_UP) && + (ifr.ifr_flags & IFF_RUNNING)), + }; + ifr = (struct ifreq) { + .ifr_data = (void *)&gcmd, + }; + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); if (ret) { DRV_LOG(DEBUG, "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" @@ -725,7 +722,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, *ecmd = gcmd; ifr.ifr_data = (void *)ecmd; - ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); if (ret) { DRV_LOG(DEBUG, "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)" @@ -775,8 +772,8 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX); dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds & ETH_LINK_SPEED_FIXED); - if ((dev_link.link_speed && !dev_link.link_status) || - (!dev_link.link_speed && dev_link.link_status)) { + if (((dev_link.link_speed && !dev_link.link_status) || + (!dev_link.link_speed && dev_link.link_status))) { rte_errno = EAGAIN; return -rte_errno; } @@ -888,7 +885,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) int ret; ifr.ifr_data = (void *)ðpause; - ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); if (ret) { DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:" @@ -941,7 +938,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) ethpause.tx_pause = 1; else ethpause.tx_pause = 0; - ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0); + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); if (ret) { DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)" @@ -1306,10 +1303,7 @@ mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list, RTE_ETH_FOREACH_DEV(id) { struct rte_eth_dev *ldev = &rte_eth_devices[id]; - if (!ldev->device || - !ldev->device->driver || - strcmp(ldev->device->driver->name, MLX5_DRIVER_NAME) || - ldev->device != dev) + if (ldev->device != dev) continue; if (n < port_list_n) port_list[n] = id; diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index ca4625b6..280af0ab 100644 --- a/drivers/net/mlx5/mlx5_flow.c +++ b/drivers/net/mlx5/mlx5_flow.c @@ -3,6 +3,7 @@ * Copyright 2016 Mellanox Technologies, Ltd */ +#include <netinet/in.h> #include <sys/queue.h> #include <stdalign.h> #include <stdint.h> @@ -31,74 +32,30 @@ #include "mlx5_defs.h" #include "mlx5_prm.h" #include "mlx5_glue.h" +#include "mlx5_flow.h" /* Dev ops structure defined in mlx5.c */ extern const struct eth_dev_ops mlx5_dev_ops; extern const struct eth_dev_ops mlx5_dev_ops_isolate; -/* Pattern outer Layer bits. */ -#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0) -#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1) -#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2) -#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3) -#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4) -#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5) - -/* Pattern inner Layer bits. */ -#define MLX5_FLOW_LAYER_INNER_L2 (1u << 6) -#define MLX5_FLOW_LAYER_INNER_L3_IPV4 (1u << 7) -#define MLX5_FLOW_LAYER_INNER_L3_IPV6 (1u << 8) -#define MLX5_FLOW_LAYER_INNER_L4_UDP (1u << 9) -#define MLX5_FLOW_LAYER_INNER_L4_TCP (1u << 10) -#define MLX5_FLOW_LAYER_INNER_VLAN (1u << 11) - -/* Pattern tunnel Layer bits. */ -#define MLX5_FLOW_LAYER_VXLAN (1u << 12) -#define MLX5_FLOW_LAYER_VXLAN_GPE (1u << 13) -#define MLX5_FLOW_LAYER_GRE (1u << 14) -#define MLX5_FLOW_LAYER_MPLS (1u << 15) - -/* Outer Masks. 
*/ -#define MLX5_FLOW_LAYER_OUTER_L3 \ - (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6) -#define MLX5_FLOW_LAYER_OUTER_L4 \ - (MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP) -#define MLX5_FLOW_LAYER_OUTER \ - (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3 | \ - MLX5_FLOW_LAYER_OUTER_L4) - -/* Tunnel Masks. */ -#define MLX5_FLOW_LAYER_TUNNEL \ - (MLX5_FLOW_LAYER_VXLAN | MLX5_FLOW_LAYER_VXLAN_GPE | \ - MLX5_FLOW_LAYER_GRE | MLX5_FLOW_LAYER_MPLS) - -/* Inner Masks. */ -#define MLX5_FLOW_LAYER_INNER_L3 \ - (MLX5_FLOW_LAYER_INNER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV6) -#define MLX5_FLOW_LAYER_INNER_L4 \ - (MLX5_FLOW_LAYER_INNER_L4_UDP | MLX5_FLOW_LAYER_INNER_L4_TCP) -#define MLX5_FLOW_LAYER_INNER \ - (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3 | \ - MLX5_FLOW_LAYER_INNER_L4) - -/* Actions that modify the fate of matching traffic. */ -#define MLX5_FLOW_FATE_DROP (1u << 0) -#define MLX5_FLOW_FATE_QUEUE (1u << 1) -#define MLX5_FLOW_FATE_RSS (1u << 2) - -/* Modify a packet. */ -#define MLX5_FLOW_MOD_FLAG (1u << 0) -#define MLX5_FLOW_MOD_MARK (1u << 1) -#define MLX5_FLOW_MOD_COUNT (1u << 2) - -/* possible L3 layers protocols filtering. */ -#define MLX5_IP_PROTOCOL_TCP 6 -#define MLX5_IP_PROTOCOL_UDP 17 -#define MLX5_IP_PROTOCOL_GRE 47 -#define MLX5_IP_PROTOCOL_MPLS 147 - -/* Priority reserved for default flows. */ -#define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1) +/** Device flow drivers. */ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT +extern const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops; +#endif +extern const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops; +extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops; + +const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops; + +const struct mlx5_flow_driver_ops *flow_drv_ops[] = { + [MLX5_FLOW_TYPE_MIN] = &mlx5_flow_null_drv_ops, +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + [MLX5_FLOW_TYPE_DV] = &mlx5_flow_dv_drv_ops, +#endif + [MLX5_FLOW_TYPE_TCF] = &mlx5_flow_tcf_drv_ops, + [MLX5_FLOW_TYPE_VERBS] = &mlx5_flow_verbs_drv_ops, + [MLX5_FLOW_TYPE_MAX] = &mlx5_flow_null_drv_ops +}; enum mlx5_expansion { MLX5_EXPANSION_ROOT, @@ -270,53 +227,6 @@ static const struct rte_flow_expand_node mlx5_support_expansion[] = { }, }; -/** Handles information leading to a drop fate. */ -struct mlx5_flow_verbs { - LIST_ENTRY(mlx5_flow_verbs) next; - unsigned int size; /**< Size of the attribute. */ - struct { - struct ibv_flow_attr *attr; - /**< Pointer to the Specification buffer. */ - uint8_t *specs; /**< Pointer to the specifications. */ - }; - struct ibv_flow *flow; /**< Verbs flow pointer. */ - struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */ - uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */ -}; - -/* Counters information. */ -struct mlx5_flow_counter { - LIST_ENTRY(mlx5_flow_counter) next; /**< Pointer to the next counter. */ - uint32_t shared:1; /**< Share counter ID with other flow rules. */ - uint32_t ref_cnt:31; /**< Reference counter. */ - uint32_t id; /**< Counter ID. */ - struct ibv_counter_set *cs; /**< Holds the counters for the rule. */ - uint64_t hits; /**< Number of packets matched by the rule. */ - uint64_t bytes; /**< Number of bytes matched by the rule. */ -}; - -/* Flow structure. */ -struct rte_flow { - TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */ - struct rte_flow_attr attributes; /**< User flow attribute. */ - uint32_t l3_protocol_en:1; /**< Protocol filtering requested. */ - uint32_t layers; - /**< Bit-fields of present layers see MLX5_FLOW_LAYER_*. 
*/ - uint32_t modifier; - /**< Bit-fields of present modifier see MLX5_FLOW_MOD_*. */ - uint32_t fate; - /**< Bit-fields of present fate see MLX5_FLOW_FATE_*. */ - uint8_t l3_protocol; /**< valid when l3_protocol_en is set. */ - LIST_HEAD(verbs, mlx5_flow_verbs) verbs; /**< Verbs flows list. */ - struct mlx5_flow_verbs *cur_verbs; - /**< Current Verbs flow structure being filled. */ - struct mlx5_flow_counter *counter; /**< Holds Verbs flow counter. */ - struct rte_flow_action_rss rss;/**< RSS context. */ - uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */ - uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */ - void *nl_flow; /**< Netlink flow buffer if relevant. */ -}; - static const struct rte_flow_ops mlx5_flow_ops = { .validate = mlx5_flow_validate, .create = mlx5_flow_create, @@ -352,23 +262,6 @@ struct mlx5_fdir { struct rte_flow_action_queue queue; }; -/* Verbs specification header. */ -struct ibv_spec_header { - enum ibv_flow_spec_type type; - uint16_t size; -}; - -/* - * Number of sub priorities. - * For each kind of pattern matching i.e. L2, L3, L4 to have a correct - * matching on the NIC (firmware dependent) L4 most have the higher priority - * followed by L3 and ending with L2. - */ -#define MLX5_PRIORITY_MAP_L2 2 -#define MLX5_PRIORITY_MAP_L3 1 -#define MLX5_PRIORITY_MAP_L4 0 -#define MLX5_PRIORITY_MAP_MAX 3 - /* Map of Verbs to Flow priority with 8 Verbs priorities. */ static const uint32_t priority_map_3[][MLX5_PRIORITY_MAP_MAX] = { { 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 }, @@ -413,7 +306,7 @@ static struct mlx5_flow_tunnel_info tunnels_info[] = { * Discover the maximum number of priority available. * * @param[in] dev - * Pointer to Ethernet device. + * Pointer to the Ethernet device structure. * * @return * number of supported flow priority on success, a negative errno @@ -478,160 +371,33 @@ mlx5_flow_discover_priorities(struct rte_eth_dev *dev) } /** - * Adjust flow priority. + * Adjust flow priority based on the highest layer and the request priority. * - * @param dev - * Pointer to Ethernet device. - * @param flow - * Pointer to an rte flow. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] priority + * The rule base priority. + * @param[in] subpriority + * The priority based on the items. + * + * @return + * The new priority. */ -static void -mlx5_flow_adjust_priority(struct rte_eth_dev *dev, struct rte_flow *flow) +uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority, + uint32_t subpriority) { + uint32_t res = 0; struct priv *priv = dev->data->dev_private; - uint32_t priority = flow->attributes.priority; - uint32_t subpriority = flow->cur_verbs->attr->priority; switch (priv->config.flow_prio) { case RTE_DIM(priority_map_3): - priority = priority_map_3[priority][subpriority]; + res = priority_map_3[priority][subpriority]; break; case RTE_DIM(priority_map_5): - priority = priority_map_5[priority][subpriority]; + res = priority_map_5[priority][subpriority]; break; } - flow->cur_verbs->attr->priority = priority; -} - -/** - * Get a flow counter. - * - * @param[in] dev - * Pointer to Ethernet device. - * @param[in] shared - * Indicate if this counter is shared with other flows. - * @param[in] id - * Counter identifier. - * - * @return - * A pointer to the counter, NULL otherwise and rte_errno is set. 
- */ -static struct mlx5_flow_counter * -mlx5_flow_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id) -{ - struct priv *priv = dev->data->dev_private; - struct mlx5_flow_counter *cnt; - - LIST_FOREACH(cnt, &priv->flow_counters, next) { - if (!cnt->shared || cnt->shared != shared) - continue; - if (cnt->id != id) - continue; - cnt->ref_cnt++; - return cnt; - } -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - - struct mlx5_flow_counter tmpl = { - .shared = shared, - .id = id, - .cs = mlx5_glue->create_counter_set - (priv->ctx, - &(struct ibv_counter_set_init_attr){ - .counter_set_id = id, - }), - .hits = 0, - .bytes = 0, - }; - - if (!tmpl.cs) { - rte_errno = errno; - return NULL; - } - cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0); - if (!cnt) { - rte_errno = ENOMEM; - return NULL; - } - *cnt = tmpl; - LIST_INSERT_HEAD(&priv->flow_counters, cnt, next); - return cnt; -#endif - rte_errno = ENOTSUP; - return NULL; -} - -/** - * Release a flow counter. - * - * @param[in] counter - * Pointer to the counter handler. - */ -static void -mlx5_flow_counter_release(struct mlx5_flow_counter *counter) -{ - if (--counter->ref_cnt == 0) { - claim_zero(mlx5_glue->destroy_counter_set(counter->cs)); - LIST_REMOVE(counter, next); - rte_free(counter); - } -} - -/** - * Verify the @p attributes will be correctly understood by the NIC and store - * them in the @p flow if everything is correct. - * - * @param[in] dev - * Pointer to Ethernet device. - * @param[in] attributes - * Pointer to flow attributes - * @param[in, out] flow - * Pointer to the rte_flow structure. - * @param[out] error - * Pointer to error structure. - * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. - */ -static int -mlx5_flow_attributes(struct rte_eth_dev *dev, - const struct rte_flow_attr *attributes, - struct rte_flow *flow, - struct rte_flow_error *error) -{ - uint32_t priority_max = - ((struct priv *)dev->data->dev_private)->config.flow_prio - 1; - - if (attributes->group) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_GROUP, - NULL, - "groups is not supported"); - if (attributes->priority != MLX5_FLOW_PRIO_RSVD && - attributes->priority >= priority_max) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, - NULL, - "priority out of range"); - if (attributes->egress) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, - NULL, - "egress is not supported"); - if (attributes->transfer) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, - NULL, - "transfer is not supported"); - if (!attributes->ingress) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, - NULL, - "ingress attribute is mandatory"); - flow->attributes = *attributes; - if (attributes->priority == MLX5_FLOW_PRIO_RSVD) - flow->attributes.priority = priority_max; - return 0; + return res; } /** @@ -652,7 +418,7 @@ mlx5_flow_attributes(struct rte_eth_dev *dev, * @return * 0 on success, a negative errno value otherwise and rte_errno is set. 
*/ -static int +int mlx5_flow_item_acceptable(const struct rte_flow_item *item, const uint8_t *mask, const uint8_t *nic_mask, @@ -671,8 +437,7 @@ mlx5_flow_item_acceptable(const struct rte_flow_item *item, " bits"); if (!item->spec && (item->mask || item->last)) return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "mask/last without a spec is not" " supported"); if (item->spec && item->last) { @@ -687,206 +452,635 @@ mlx5_flow_item_acceptable(const struct rte_flow_item *item, } ret = memcmp(spec, last, size); if (ret != 0) - return rte_flow_error_set(error, ENOTSUP, + return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, - "range is not supported"); + "range is not valid"); } return 0; } /** - * Add a verbs item specification into @p flow. + * Adjust the hash fields according to the @p flow information. * - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] src - * Create specification. - * @param[in] size - * Size in bytes of the specification to copy. + * @param[in] dev_flow. + * Pointer to the mlx5_flow. + * @param[in] tunnel + * 1 when the hash field is for a tunnel item. + * @param[in] layer_types + * ETH_RSS_* types. + * @param[in] hash_fields + * Item hash fields. + * + * @return + * The hash fileds that should be used. + */ +uint64_t +mlx5_flow_hashfields_adjust(struct mlx5_flow *dev_flow, + int tunnel __rte_unused, uint64_t layer_types, + uint64_t hash_fields) +{ + struct rte_flow *flow = dev_flow->flow; +#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT + int rss_request_inner = flow->rss.level >= 2; + + /* Check RSS hash level for tunnel. */ + if (tunnel && rss_request_inner) + hash_fields |= IBV_RX_HASH_INNER; + else if (tunnel || rss_request_inner) + return 0; +#endif + /* Check if requested layer matches RSS hash fields. */ + if (!(flow->rss.types & layer_types)) + return 0; + return hash_fields; +} + +/** + * Lookup and set the ptype in the data Rx part. A single Ptype can be used, + * if several tunnel rules are used on this queue, the tunnel ptype will be + * cleared. + * + * @param rxq_ctrl + * Rx queue to update. */ static void -mlx5_flow_spec_verbs_add(struct rte_flow *flow, void *src, unsigned int size) +flow_rxq_tunnel_ptype_update(struct mlx5_rxq_ctrl *rxq_ctrl) { - struct mlx5_flow_verbs *verbs = flow->cur_verbs; + unsigned int i; + uint32_t tunnel_ptype = 0; + + /* Look up for the ptype to use. */ + for (i = 0; i != MLX5_FLOW_TUNNEL; ++i) { + if (!rxq_ctrl->flow_tunnels_n[i]) + continue; + if (!tunnel_ptype) { + tunnel_ptype = tunnels_info[i].ptype; + } else { + tunnel_ptype = 0; + break; + } + } + rxq_ctrl->rxq.tunnel = tunnel_ptype; +} + +/** + * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) according to the devive + * flow. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] dev_flow + * Pointer to device flow structure. 
+ */ +static void +flow_drv_rxq_flags_set(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow) +{ + struct priv *priv = dev->data->dev_private; + struct rte_flow *flow = dev_flow->flow; + const int mark = !!(flow->actions & + (MLX5_FLOW_ACTION_FLAG | MLX5_FLOW_ACTION_MARK)); + const int tunnel = !!(dev_flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int i; + + for (i = 0; i != flow->rss.queue_num; ++i) { + int idx = (*flow->queue)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, rxq); - if (verbs->specs) { - void *dst; + if (mark) { + rxq_ctrl->rxq.mark = 1; + rxq_ctrl->flow_mark_n++; + } + if (tunnel) { + unsigned int j; - dst = (void *)(verbs->specs + verbs->size); - memcpy(dst, src, size); - ++verbs->attr->num_of_specs; + /* Increase the counter matching the flow. */ + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { + if ((tunnels_info[j].tunnel & + dev_flow->layers) == + tunnels_info[j].tunnel) { + rxq_ctrl->flow_tunnels_n[j]++; + break; + } + } + flow_rxq_tunnel_ptype_update(rxq_ctrl); + } } - verbs->size += size; } /** - * Adjust verbs hash fields according to the @p flow information. + * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) for a flow * - * @param[in, out] flow. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] flow * Pointer to flow structure. - * @param[in] tunnel - * 1 when the hash field is for a tunnel item. - * @param[in] layer_types - * ETH_RSS_* types. - * @param[in] hash_fields - * Item hash fields. */ static void -mlx5_flow_verbs_hashfields_adjust(struct rte_flow *flow, - int tunnel __rte_unused, - uint32_t layer_types, uint64_t hash_fields) +flow_rxq_flags_set(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow *dev_flow; + + LIST_FOREACH(dev_flow, &flow->dev_flows, next) + flow_drv_rxq_flags_set(dev, dev_flow); +} + +/** + * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the + * device flow if no other flow uses it with the same kind of request. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] dev_flow + * Pointer to the device flow. + */ +static void +flow_drv_rxq_flags_trim(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow) +{ + struct priv *priv = dev->data->dev_private; + struct rte_flow *flow = dev_flow->flow; + const int mark = !!(flow->actions & + (MLX5_FLOW_ACTION_FLAG | MLX5_FLOW_ACTION_MARK)); + const int tunnel = !!(dev_flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int i; + + assert(dev->data->dev_started); + for (i = 0; i != flow->rss.queue_num; ++i) { + int idx = (*flow->queue)[i]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of((*priv->rxqs)[idx], + struct mlx5_rxq_ctrl, rxq); + + if (mark) { + rxq_ctrl->flow_mark_n--; + rxq_ctrl->rxq.mark = !!rxq_ctrl->flow_mark_n; + } + if (tunnel) { + unsigned int j; + + /* Decrease the counter matching the flow. */ + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { + if ((tunnels_info[j].tunnel & + dev_flow->layers) == + tunnels_info[j].tunnel) { + rxq_ctrl->flow_tunnels_n[j]--; + break; + } + } + flow_rxq_tunnel_ptype_update(rxq_ctrl); + } + } +} + +/** + * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the + * @p flow if no other flow uses it with the same kind of request. + * + * @param dev + * Pointer to Ethernet device. + * @param[in] flow + * Pointer to the flow. 
+ */ +static void +flow_rxq_flags_trim(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow *dev_flow; + + LIST_FOREACH(dev_flow, &flow->dev_flows, next) + flow_drv_rxq_flags_trim(dev, dev_flow); +} + +/** + * Clear the Mark/Flag and Tunnel ptype information in all Rx queues. + * + * @param dev + * Pointer to Ethernet device. + */ +static void +flow_rxq_flags_clear(struct rte_eth_dev *dev) +{ + struct priv *priv = dev->data->dev_private; + unsigned int i; + + for (i = 0; i != priv->rxqs_n; ++i) { + struct mlx5_rxq_ctrl *rxq_ctrl; + unsigned int j; + + if (!(*priv->rxqs)[i]) + continue; + rxq_ctrl = container_of((*priv->rxqs)[i], + struct mlx5_rxq_ctrl, rxq); + rxq_ctrl->flow_mark_n = 0; + rxq_ctrl->rxq.mark = 0; + for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) + rxq_ctrl->flow_tunnels_n[j] = 0; + rxq_ctrl->rxq.tunnel = 0; + } +} + +/* + * Validate the flag action. + * + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_validate_action_flag(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + + if (action_flags & MLX5_FLOW_ACTION_DROP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't drop and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't mark and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 flag" + " actions in same flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "flag action not supported for " + "egress"); + return 0; +} + +/* + * Validate the mark action. + * + * @param[in] action + * Pointer to the queue action. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +int +mlx5_flow_validate_action_mark(const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_action_mark *mark = action->conf; + + if (!mark) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "configuration cannot be null"); + if (mark->id >= MLX5_FLOW_MARK_MAX) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &mark->id, + "mark id must in 0 <= id < " + RTE_STR(MLX5_FLOW_MARK_MAX)); + if (action_flags & MLX5_FLOW_ACTION_DROP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't drop and mark in same flow"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't flag and mark in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 mark actions in same" + " flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "mark action not supported for " + "egress"); + return 0; +} + +/* + * Validate the drop action. + * + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +int +mlx5_flow_validate_action_drop(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't drop and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't drop and mark in same flow"); + if (action_flags & MLX5_FLOW_FATE_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions in" + " same flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "drop action not supported for " + "egress"); + return 0; +} + +/* + * Validate the queue action. + * + * @param[in] action + * Pointer to the queue action. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. 
+ */ +int +mlx5_flow_validate_action_queue(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) { + struct priv *priv = dev->data->dev_private; + const struct rte_flow_action_queue *queue = action->conf; + + if (action_flags & MLX5_FLOW_FATE_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions in" + " same flow"); + if (queue->index >= priv->rxqs_n) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &queue->index, + "queue index out of range"); + if (!(*priv->rxqs)[queue->index]) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &queue->index, + "queue is not configured"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "queue action not supported for " + "egress"); + return 0; +} + +/* + * Validate the rss action. + * + * @param[in] action + * Pointer to the queue action. + * @param[in] action_flags + * Bit-fields that holds the actions detected until now. + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +int +mlx5_flow_validate_action_rss(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + const struct rte_flow_action_rss *rss = action->conf; + unsigned int i; + + if (action_flags & MLX5_FLOW_FATE_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have 2 fate actions" + " in same flow"); + if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT && + rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->func, + "RSS hash function not supported"); #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT - hash_fields |= (tunnel ? 
IBV_RX_HASH_INNER : 0); - if (flow->rss.level == 2 && !tunnel) - hash_fields = 0; - else if (flow->rss.level < 2 && tunnel) - hash_fields = 0; + if (rss->level > 2) +#else + if (rss->level > 1) #endif - if (!(flow->rss.types & layer_types)) - hash_fields = 0; - flow->cur_verbs->hash_fields |= hash_fields; + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->level, + "tunnel RSS is not supported"); + if (rss->key_len < MLX5_RSS_HASH_KEY_LEN) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key too small"); + if (rss->key_len > MLX5_RSS_HASH_KEY_LEN) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->key_len, + "RSS hash key too large"); + if (rss->queue_num > priv->config.ind_table_max_size) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue_num, + "number of queues too large"); + if (rss->types & MLX5_RSS_HF_MASK) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->types, + "some RSS protocols are not" + " supported"); + for (i = 0; i != rss->queue_num; ++i) { + if (!(*priv->rxqs)[rss->queue[i]]) + return rte_flow_error_set + (error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &rss->queue[i], "queue is not configured"); + } + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "rss action not supported for " + "egress"); + return 0; +} + +/* + * Validate the count action. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Attributes of flow that includes this action. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +int +mlx5_flow_validate_action_count(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "count action not supported for " + "egress"); + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Verify the @p attributes will be correctly understood by the NIC and store + * them in the @p flow if everything is correct. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attributes + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
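The RSS checks above translate directly into constraints on the public rte_flow_action_rss layout: default or Toeplitz hash only, a key of exactly MLX5_RSS_HASH_KEY_LEN bytes, level 1 unless tunnel RSS is available, supported protocol types only, and every listed queue already configured. A sketch of a configuration that would pass, assuming the 40-byte key length used by mlx5 and two configured Rx queues:

#include <rte_ethdev.h>
#include <rte_flow.h>

/* Illustrative values: the key length must equal MLX5_RSS_HASH_KEY_LEN
 * (40 bytes on mlx5), level 1 selects outer-header RSS, and both queues
 * must already have been set up with rte_eth_rx_queue_setup(). */
static const uint8_t rss_key[40];		/* any 40-byte key */
static const uint16_t rss_queues[] = { 0, 1 };

static const struct rte_flow_action_rss rss_conf = {
	.func = RTE_ETH_HASH_FUNCTION_TOEPLITZ,
	.level = 1,
	.types = ETH_RSS_IP | ETH_RSS_UDP,
	.key_len = sizeof(rss_key),
	.key = rss_key,
	.queue_num = RTE_DIM(rss_queues),
	.queue = rss_queues,
};

static const struct rte_flow_action rss_actions[] = {
	{ .type = RTE_FLOW_ACTION_TYPE_RSS, .conf = &rss_conf },
	{ .type = RTE_FLOW_ACTION_TYPE_END },
};

Passing such an action list to rte_flow_validate() on an mlx5 port exercises mlx5_flow_validate_action_rss() through the PMD.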
+ */ +int +mlx5_flow_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + uint32_t priority_max = priv->config.flow_prio - 1; + + if (attributes->group) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, "groups is not supported"); + if (attributes->priority != MLX5_FLOW_PRIO_RSVD && + attributes->priority >= priority_max) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "priority out of range"); + if (attributes->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, + "egress is not supported"); + if (attributes->transfer) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, + NULL, "transfer is not supported"); + if (!attributes->ingress) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "ingress attribute is mandatory"); + return 0; +} + +/** + * Validate Ethernet item. * * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -mlx5_flow_item_eth(const struct rte_flow_item *item, struct rte_flow *flow, - const size_t flow_size, struct rte_flow_error *error) +int +mlx5_flow_validate_item_eth(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) { - const struct rte_flow_item_eth *spec = item->spec; const struct rte_flow_item_eth *mask = item->mask; const struct rte_flow_item_eth nic_mask = { .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", .type = RTE_BE16(0xffff), }; - const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); - const unsigned int size = sizeof(struct ibv_flow_spec_eth); - struct ibv_flow_spec_eth eth = { - .type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0), - .size = size, - }; int ret; + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); - if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L2 : - MLX5_FLOW_LAYER_OUTER_L2)) + if (item_flags & MLX5_FLOW_LAYER_OUTER_L2) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, - "L2 layers already configured"); + RTE_FLOW_ERROR_TYPE_ITEM, item, + "3 levels of l2 are not supported"); + if ((item_flags & MLX5_FLOW_LAYER_INNER_L2) && !tunnel) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "2 L2 without tunnel are not supported"); if (!mask) mask = &rte_flow_item_eth_mask; ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, (const uint8_t *)&nic_mask, sizeof(struct rte_flow_item_eth), error); - if (ret) - return ret; - flow->layers |= tunnel ? 
MLX5_FLOW_LAYER_INNER_L2 : - MLX5_FLOW_LAYER_OUTER_L2; - if (size > flow_size) - return size; - if (spec) { - unsigned int i; - - memcpy(ð.val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN); - memcpy(ð.val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN); - eth.val.ether_type = spec->type; - memcpy(ð.mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN); - memcpy(ð.mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN); - eth.mask.ether_type = mask->type; - /* Remove unwanted bits from values. */ - for (i = 0; i < ETHER_ADDR_LEN; ++i) { - eth.val.dst_mac[i] &= eth.mask.dst_mac[i]; - eth.val.src_mac[i] &= eth.mask.src_mac[i]; - } - eth.val.ether_type &= eth.mask.ether_type; - } - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; - mlx5_flow_spec_verbs_add(flow, ð, size); - return size; -} - -/** - * Update the VLAN tag in the Verbs Ethernet specification. - * - * @param[in, out] attr - * Pointer to Verbs attributes structure. - * @param[in] eth - * Verbs structure containing the VLAN information to copy. - */ -static void -mlx5_flow_item_vlan_update(struct ibv_flow_attr *attr, - struct ibv_flow_spec_eth *eth) -{ - unsigned int i; - const enum ibv_flow_spec_type search = eth->type; - struct ibv_spec_header *hdr = (struct ibv_spec_header *) - ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); - - for (i = 0; i != attr->num_of_specs; ++i) { - if (hdr->type == search) { - struct ibv_flow_spec_eth *e = - (struct ibv_flow_spec_eth *)hdr; - - e->val.vlan_tag = eth->val.vlan_tag; - e->mask.vlan_tag = eth->mask.vlan_tag; - e->val.ether_type = eth->val.ether_type; - e->mask.ether_type = eth->mask.ether_type; - break; - } - hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); - } + return ret; } /** - * Convert the @p item into @p flow (or by updating the already present - * Ethernet Verbs) specification after ensuring the NIC will understand and - * process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate VLAN item. * * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow, - const size_t flow_size, struct rte_flow_error *error) +int +mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, + int64_t item_flags, + struct rte_flow_error *error) { const struct rte_flow_item_vlan *spec = item->spec; const struct rte_flow_item_vlan *mask = item->mask; @@ -894,100 +1088,66 @@ mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow, .tci = RTE_BE16(0x0fff), .inner_type = RTE_BE16(0xffff), }; - unsigned int size = sizeof(struct ibv_flow_spec_eth); - const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); - struct ibv_flow_spec_eth eth = { - .type = IBV_FLOW_SPEC_ETH | (tunnel ? 
IBV_FLOW_SPEC_INNER : 0), - .size = size, - }; + uint16_t vlan_tag = 0; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); int ret; const uint32_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | MLX5_FLOW_LAYER_INNER_L4) : - (MLX5_FLOW_LAYER_OUTER_L3 | MLX5_FLOW_LAYER_OUTER_L4); + (MLX5_FLOW_LAYER_OUTER_L3 | + MLX5_FLOW_LAYER_OUTER_L4); const uint32_t vlanm = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : - MLX5_FLOW_LAYER_OUTER_VLAN; - const uint32_t l2m = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : - MLX5_FLOW_LAYER_OUTER_L2; + MLX5_FLOW_LAYER_OUTER_VLAN; - if (flow->layers & vlanm) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (item_flags & vlanm) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "VLAN layer already configured"); - else if ((flow->layers & l34m) != 0) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + else if ((item_flags & l34m) != 0) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "L2 layer cannot follow L3/L4 layer"); if (!mask) mask = &rte_flow_item_vlan_mask; - ret = mlx5_flow_item_acceptable - (item, (const uint8_t *)mask, - (const uint8_t *)&nic_mask, - sizeof(struct rte_flow_item_vlan), error); + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_vlan), + error); if (ret) return ret; if (spec) { - eth.val.vlan_tag = spec->tci; - eth.mask.vlan_tag = mask->tci; - eth.val.vlan_tag &= eth.mask.vlan_tag; - eth.val.ether_type = spec->inner_type; - eth.mask.ether_type = mask->inner_type; - eth.val.ether_type &= eth.mask.ether_type; + vlan_tag = spec->tci; + vlan_tag &= mask->tci; } /* * From verbs perspective an empty VLAN is equivalent * to a packet without VLAN layer. */ - if (!eth.mask.vlan_tag) + if (!vlan_tag) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM_SPEC, item->spec, "VLAN cannot be empty"); - if (!(flow->layers & l2m)) { - if (size <= flow_size) { - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; - mlx5_flow_spec_verbs_add(flow, ð, size); - } - } else { - if (flow->cur_verbs) - mlx5_flow_item_vlan_update(flow->cur_verbs->attr, - ð); - size = 0; /* Only an update is done in eth specification. */ - } - flow->layers |= tunnel ? - (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_VLAN) : - (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_VLAN); - return size; + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate IPV4 item. * * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. 
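A structural point of this refactoring is that item validators no longer read layer state from struct rte_flow; the caller accumulates an item_flags bitmap and threads it through each call. A hedged sketch of how a backend might chain the Ethernet validator above with the IPv4 validator that follows, assuming the MLX5_FLOW_LAYER_* definitions from the driver headers (error propagation kept, everything else trimmed):

/* Sketch only: a real caller walks the complete pattern array; the flag
 * names come from the driver headers introduced by this series. */
static int
validate_outer_eth_ipv4(const struct rte_flow_item *eth_item,
			const struct rte_flow_item *ipv4_item,
			uint64_t *item_flags,
			struct rte_flow_error *error)
{
	int ret;

	ret = mlx5_flow_validate_item_eth(eth_item, *item_flags, error);
	if (ret < 0)
		return ret;
	*item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
	ret = mlx5_flow_validate_item_ipv4(ipv4_item, *item_flags, error);
	if (ret < 0)
		return ret;
	*item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
	return 0;
}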
*/ -static int -mlx5_flow_item_ipv4(const struct rte_flow_item *item, struct rte_flow *flow, - const size_t flow_size, struct rte_flow_error *error) +int +mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, + int64_t item_flags, + struct rte_flow_error *error) { - const struct rte_flow_item_ipv4 *spec = item->spec; const struct rte_flow_item_ipv4 *mask = item->mask; const struct rte_flow_item_ipv4 nic_mask = { .hdr = { @@ -997,97 +1157,48 @@ mlx5_flow_item_ipv4(const struct rte_flow_item *item, struct rte_flow *flow, .next_proto_id = 0xff, }, }; - const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); - unsigned int size = sizeof(struct ibv_flow_spec_ipv4_ext); - struct ibv_flow_spec_ipv4_ext ipv4 = { - .type = IBV_FLOW_SPEC_IPV4_EXT | - (tunnel ? IBV_FLOW_SPEC_INNER : 0), - .size = size, - }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); int ret; - if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : - MLX5_FLOW_LAYER_OUTER_L3)) + if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3)) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "multiple L3 layers not supported"); - else if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : - MLX5_FLOW_LAYER_OUTER_L4)) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + else if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 cannot follow an L4 layer."); if (!mask) mask = &rte_flow_item_ipv4_mask; - ret = mlx5_flow_item_acceptable - (item, (const uint8_t *)mask, - (const uint8_t *)&nic_mask, - sizeof(struct rte_flow_item_ipv4), error); + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_ipv4), + error); if (ret < 0) return ret; - flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : - MLX5_FLOW_LAYER_OUTER_L3_IPV4; - if (spec) { - ipv4.val = (struct ibv_flow_ipv4_ext_filter){ - .src_ip = spec->hdr.src_addr, - .dst_ip = spec->hdr.dst_addr, - .proto = spec->hdr.next_proto_id, - .tos = spec->hdr.type_of_service, - }; - ipv4.mask = (struct ibv_flow_ipv4_ext_filter){ - .src_ip = mask->hdr.src_addr, - .dst_ip = mask->hdr.dst_addr, - .proto = mask->hdr.next_proto_id, - .tos = mask->hdr.type_of_service, - }; - /* Remove unwanted bits from values. */ - ipv4.val.src_ip &= ipv4.mask.src_ip; - ipv4.val.dst_ip &= ipv4.mask.dst_ip; - ipv4.val.proto &= ipv4.mask.proto; - ipv4.val.tos &= ipv4.mask.tos; - } - flow->l3_protocol_en = !!ipv4.mask.proto; - flow->l3_protocol = ipv4.val.proto; - if (size <= flow_size) { - mlx5_flow_verbs_hashfields_adjust - (flow, tunnel, - (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | - ETH_RSS_NONFRAG_IPV4_OTHER), - (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4)); - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L3; - mlx5_flow_spec_verbs_add(flow, &ipv4, size); - } - return size; + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate IPV6 item. * * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. 
+ * @param[in] item_flags + * Bit-fields that holds the items detected until now. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -mlx5_flow_item_ipv6(const struct rte_flow_item *item, struct rte_flow *flow, - const size_t flow_size, struct rte_flow_error *error) +int +mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) { - const struct rte_flow_item_ipv6 *spec = item->spec; const struct rte_flow_item_ipv6 *mask = item->mask; const struct rte_flow_item_ipv6 nic_mask = { .hdr = { @@ -1102,25 +1213,18 @@ mlx5_flow_item_ipv6(const struct rte_flow_item *item, struct rte_flow *flow, .hop_limits = 0xff, }, }; - const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); - unsigned int size = sizeof(struct ibv_flow_spec_ipv6); - struct ibv_flow_spec_ipv6 ipv6 = { - .type = IBV_FLOW_SPEC_IPV6 | (tunnel ? IBV_FLOW_SPEC_INNER : 0), - .size = size, - }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); int ret; - if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : - MLX5_FLOW_LAYER_OUTER_L3)) + if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3)) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "multiple L3 layers not supported"); - else if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : - MLX5_FLOW_LAYER_OUTER_L4)) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + else if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 cannot follow an L4 layer."); /* * IPv6 is not recognised by the NIC inside a GRE tunnel. @@ -1128,130 +1232,64 @@ mlx5_flow_item_ipv6(const struct rte_flow_item *item, struct rte_flow *flow, * accepted. Issue reproduced with Mellanox OFED 4.3-3.0.2.1 and * Mellanox OFED 4.4-1.0.0.0. */ - if (tunnel && flow->layers & MLX5_FLOW_LAYER_GRE) + if (tunnel && item_flags & MLX5_FLOW_LAYER_GRE) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "IPv6 inside a GRE tunnel is" " not recognised."); if (!mask) mask = &rte_flow_item_ipv6_mask; - ret = mlx5_flow_item_acceptable - (item, (const uint8_t *)mask, - (const uint8_t *)&nic_mask, - sizeof(struct rte_flow_item_ipv6), error); + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_ipv6), + error); if (ret < 0) return ret; - flow->layers |= tunnel ? 
MLX5_FLOW_LAYER_INNER_L3_IPV6 : - MLX5_FLOW_LAYER_OUTER_L3_IPV6; - if (spec) { - unsigned int i; - uint32_t vtc_flow_val; - uint32_t vtc_flow_mask; - - memcpy(&ipv6.val.src_ip, spec->hdr.src_addr, - RTE_DIM(ipv6.val.src_ip)); - memcpy(&ipv6.val.dst_ip, spec->hdr.dst_addr, - RTE_DIM(ipv6.val.dst_ip)); - memcpy(&ipv6.mask.src_ip, mask->hdr.src_addr, - RTE_DIM(ipv6.mask.src_ip)); - memcpy(&ipv6.mask.dst_ip, mask->hdr.dst_addr, - RTE_DIM(ipv6.mask.dst_ip)); - vtc_flow_val = rte_be_to_cpu_32(spec->hdr.vtc_flow); - vtc_flow_mask = rte_be_to_cpu_32(mask->hdr.vtc_flow); - ipv6.val.flow_label = - rte_cpu_to_be_32((vtc_flow_val & IPV6_HDR_FL_MASK) >> - IPV6_HDR_FL_SHIFT); - ipv6.val.traffic_class = (vtc_flow_val & IPV6_HDR_TC_MASK) >> - IPV6_HDR_TC_SHIFT; - ipv6.val.next_hdr = spec->hdr.proto; - ipv6.val.hop_limit = spec->hdr.hop_limits; - ipv6.mask.flow_label = - rte_cpu_to_be_32((vtc_flow_mask & IPV6_HDR_FL_MASK) >> - IPV6_HDR_FL_SHIFT); - ipv6.mask.traffic_class = (vtc_flow_mask & IPV6_HDR_TC_MASK) >> - IPV6_HDR_TC_SHIFT; - ipv6.mask.next_hdr = mask->hdr.proto; - ipv6.mask.hop_limit = mask->hdr.hop_limits; - /* Remove unwanted bits from values. */ - for (i = 0; i < RTE_DIM(ipv6.val.src_ip); ++i) { - ipv6.val.src_ip[i] &= ipv6.mask.src_ip[i]; - ipv6.val.dst_ip[i] &= ipv6.mask.dst_ip[i]; - } - ipv6.val.flow_label &= ipv6.mask.flow_label; - ipv6.val.traffic_class &= ipv6.mask.traffic_class; - ipv6.val.next_hdr &= ipv6.mask.next_hdr; - ipv6.val.hop_limit &= ipv6.mask.hop_limit; - } - flow->l3_protocol_en = !!ipv6.mask.next_hdr; - flow->l3_protocol = ipv6.val.next_hdr; - if (size <= flow_size) { - mlx5_flow_verbs_hashfields_adjust - (flow, tunnel, - (ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_OTHER), - (IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6)); - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L3; - mlx5_flow_spec_verbs_add(flow, &ipv6, size); - } - return size; + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate UDP item. * * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] target_protocol + * The next protocol in the previous item. + * @param[in] flow_mask + * mlx5 flow-specific (TCF, DV, verbs, etc.) supported header fields mask. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. 
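The target_protocol argument used by the L4 validators is the next-protocol value pinned down by the preceding L3 item, with 0xff meaning unconstrained. A sketch of how a caller might derive it from an IPv4 item before invoking mlx5_flow_validate_item_udp(); this mirrors the intent described in the parameter documentation and is an assumption about the calling code, which lies outside this hunk:

#include <netinet/in.h>		/* IPPROTO_UDP */
#include <rte_flow.h>

/* Sketch only: return the protocol the IPv4 item pins down, or 0xff when
 * the pattern leaves it unspecified.  mlx5_flow_validate_item_udp() then
 * rejects the UDP item unless this value is 0xff or IPPROTO_UDP. */
static uint8_t
ipv4_target_protocol(const struct rte_flow_item *item)
{
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask =
		item->mask ? item->mask : &rte_flow_item_ipv4_mask;

	if (!spec || mask->hdr.next_proto_id != 0xff)
		return 0xff;
	return spec->hdr.next_proto_id;
}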
*/ -static int -mlx5_flow_item_udp(const struct rte_flow_item *item, struct rte_flow *flow, - const size_t flow_size, struct rte_flow_error *error) +int +mlx5_flow_validate_item_udp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) { - const struct rte_flow_item_udp *spec = item->spec; const struct rte_flow_item_udp *mask = item->mask; - const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); - unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); - struct ibv_flow_spec_tcp_udp udp = { - .type = IBV_FLOW_SPEC_UDP | (tunnel ? IBV_FLOW_SPEC_INNER : 0), - .size = size, - }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); int ret; - if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_UDP) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (target_protocol != 0xff && target_protocol != IPPROTO_UDP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "protocol filtering not compatible" " with UDP layer"); - if (!(flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : - MLX5_FLOW_LAYER_OUTER_L3))) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, - "L3 is mandatory to filter" - " on L4"); - if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : - MLX5_FLOW_LAYER_OUTER_L4)) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, - "L4 layer is already" - " present"); + if (!(item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 is mandatory to filter on L4"); + if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L4 layer is already present"); if (!mask) mask = &rte_flow_item_udp_mask; ret = mlx5_flow_item_acceptable @@ -1260,178 +1298,118 @@ mlx5_flow_item_udp(const struct rte_flow_item *item, struct rte_flow *flow, sizeof(struct rte_flow_item_udp), error); if (ret < 0) return ret; - flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : - MLX5_FLOW_LAYER_OUTER_L4_UDP; - if (spec) { - udp.val.dst_port = spec->hdr.dst_port; - udp.val.src_port = spec->hdr.src_port; - udp.mask.dst_port = mask->hdr.dst_port; - udp.mask.src_port = mask->hdr.src_port; - /* Remove unwanted bits from values. */ - udp.val.src_port &= udp.mask.src_port; - udp.val.dst_port &= udp.mask.dst_port; - } - if (size <= flow_size) { - mlx5_flow_verbs_hashfields_adjust(flow, tunnel, ETH_RSS_UDP, - (IBV_RX_HASH_SRC_PORT_UDP | - IBV_RX_HASH_DST_PORT_UDP)); - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L4; - mlx5_flow_spec_verbs_add(flow, &udp, size); - } - return size; + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate TCP item. * * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] target_protocol + * The next protocol in the previous item. * @param[out] error * Pointer to error structure. 
* * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -mlx5_flow_item_tcp(const struct rte_flow_item *item, struct rte_flow *flow, - const size_t flow_size, struct rte_flow_error *error) +int +mlx5_flow_validate_item_tcp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + const struct rte_flow_item_tcp *flow_mask, + struct rte_flow_error *error) { - const struct rte_flow_item_tcp *spec = item->spec; const struct rte_flow_item_tcp *mask = item->mask; - const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); - unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); - struct ibv_flow_spec_tcp_udp tcp = { - .type = IBV_FLOW_SPEC_TCP | (tunnel ? IBV_FLOW_SPEC_INNER : 0), - .size = size, - }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); int ret; - if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_TCP) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + assert(flow_mask); + if (target_protocol != 0xff && target_protocol != IPPROTO_TCP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "protocol filtering not compatible" " with TCP layer"); - if (!(flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : - MLX5_FLOW_LAYER_OUTER_L3))) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (!(item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L3 : + MLX5_FLOW_LAYER_OUTER_L3))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 is mandatory to filter on L4"); - if (flow->layers & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : - MLX5_FLOW_LAYER_OUTER_L4)) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (item_flags & (tunnel ? MLX5_FLOW_LAYER_INNER_L4 : + MLX5_FLOW_LAYER_OUTER_L4)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "L4 layer is already present"); if (!mask) mask = &rte_flow_item_tcp_mask; ret = mlx5_flow_item_acceptable (item, (const uint8_t *)mask, - (const uint8_t *)&rte_flow_item_tcp_mask, + (const uint8_t *)flow_mask, sizeof(struct rte_flow_item_tcp), error); if (ret < 0) return ret; - flow->layers |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : - MLX5_FLOW_LAYER_OUTER_L4_TCP; - if (spec) { - tcp.val.dst_port = spec->hdr.dst_port; - tcp.val.src_port = spec->hdr.src_port; - tcp.mask.dst_port = mask->hdr.dst_port; - tcp.mask.src_port = mask->hdr.src_port; - /* Remove unwanted bits from values. */ - tcp.val.src_port &= tcp.mask.src_port; - tcp.val.dst_port &= tcp.mask.dst_port; - } - if (size <= flow_size) { - mlx5_flow_verbs_hashfields_adjust(flow, tunnel, ETH_RSS_TCP, - (IBV_RX_HASH_SRC_PORT_TCP | - IBV_RX_HASH_DST_PORT_TCP)); - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L4; - mlx5_flow_spec_verbs_add(flow, &tcp, size); - } - return size; + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate VXLAN item. * * @param[in] item * Item specification. 
- * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] target_protocol + * The next protocol in the previous item. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -mlx5_flow_item_vxlan(const struct rte_flow_item *item, struct rte_flow *flow, - const size_t flow_size, struct rte_flow_error *error) +int +mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error) { const struct rte_flow_item_vxlan *spec = item->spec; const struct rte_flow_item_vxlan *mask = item->mask; - unsigned int size = sizeof(struct ibv_flow_spec_tunnel); - struct ibv_flow_spec_tunnel vxlan = { - .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, - .size = size, - }; int ret; union vni { uint32_t vlan_id; uint8_t vni[4]; } id = { .vlan_id = 0, }; + uint32_t vlan_id = 0; + - if (flow->layers & MLX5_FLOW_LAYER_TUNNEL) + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "a tunnel is already present"); /* * Verify only UDPv4 is present as defined in * https://tools.ietf.org/html/rfc7348 */ - if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L4_UDP)) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "no outer UDP layer found"); if (!mask) mask = &rte_flow_item_vxlan_mask; ret = mlx5_flow_item_acceptable (item, (const uint8_t *)mask, (const uint8_t *)&rte_flow_item_vxlan_mask, - sizeof(struct rte_flow_item_vxlan), error); + sizeof(struct rte_flow_item_vxlan), + error); if (ret < 0) return ret; if (spec) { memcpy(&id.vni[1], spec->vni, 3); - vxlan.val.tunnel_id = id.vlan_id; + vlan_id = id.vlan_id; memcpy(&id.vni[1], mask->vni, 3); - vxlan.mask.tunnel_id = id.vlan_id; - /* Remove unwanted bits from values. */ - vxlan.val.tunnel_id &= vxlan.mask.tunnel_id; + vlan_id &= id.vlan_id; } /* * Tunnel id 0 is equivalent as not adding a VXLAN layer, if @@ -1442,109 +1420,88 @@ mlx5_flow_item_vxlan(const struct rte_flow_item *item, struct rte_flow *flow, * match this rule. To avoid such situation, VNI 0 is * currently refused. 
*/ - if (!vxlan.val.tunnel_id) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (!vlan_id) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, "VXLAN vni cannot be 0"); - if (!(flow->layers & MLX5_FLOW_LAYER_OUTER)) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, "VXLAN tunnel must be fully defined"); - if (size <= flow_size) { - mlx5_flow_spec_verbs_add(flow, &vxlan, size); - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; - } - flow->layers |= MLX5_FLOW_LAYER_VXLAN; - return size; + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate VXLAN_GPE item. * - * @param dev - * Pointer to Ethernet device. * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] priv + * Pointer to the private data structure. + * @param[in] target_protocol + * The next protocol in the previous item. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. 
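Taken together, the VXLAN rules above require a fully specified outer L2/L3/L4 stack and a non-zero VNI. An application-side pattern these checks would accept, using only public rte_flow item types (VNI 100 and UDP port 4789 are illustrative):

#include <rte_byteorder.h>
#include <rte_flow.h>

static const struct rte_flow_item_udp vxlan_udp_spec = {
	.hdr = { .dst_port = RTE_BE16(4789) },
};
static const struct rte_flow_item_vxlan vxlan_spec = {
	.vni = { 0x00, 0x00, 0x64 },	/* VNI 100; VNI 0 is refused */
};

static const struct rte_flow_item vxlan_pattern[] = {
	{ .type = RTE_FLOW_ITEM_TYPE_ETH },
	{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
	{ .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &vxlan_udp_spec },
	{ .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan_spec },
	{ .type = RTE_FLOW_ITEM_TYPE_END },
};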
*/ -static int -mlx5_flow_item_vxlan_gpe(struct rte_eth_dev *dev, - const struct rte_flow_item *item, - struct rte_flow *flow, const size_t flow_size, - struct rte_flow_error *error) +int +mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error) { + struct priv *priv = dev->data->dev_private; const struct rte_flow_item_vxlan_gpe *spec = item->spec; const struct rte_flow_item_vxlan_gpe *mask = item->mask; - unsigned int size = sizeof(struct ibv_flow_spec_tunnel); - struct ibv_flow_spec_tunnel vxlan_gpe = { - .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, - .size = size, - }; int ret; union vni { uint32_t vlan_id; uint8_t vni[4]; } id = { .vlan_id = 0, }; + uint32_t vlan_id = 0; - if (!((struct priv *)dev->data->dev_private)->config.l3_vxlan_en) + if (!priv->config.l3_vxlan_en) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 VXLAN is not enabled by device" " parameter and/or not configured in" " firmware"); - if (flow->layers & MLX5_FLOW_LAYER_TUNNEL) + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "a tunnel is already present"); /* * Verify only UDPv4 is present as defined in * https://tools.ietf.org/html/rfc7348 */ - if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L4_UDP)) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "no outer UDP layer found"); if (!mask) mask = &rte_flow_item_vxlan_gpe_mask; ret = mlx5_flow_item_acceptable (item, (const uint8_t *)mask, (const uint8_t *)&rte_flow_item_vxlan_gpe_mask, - sizeof(struct rte_flow_item_vxlan_gpe), error); + sizeof(struct rte_flow_item_vxlan_gpe), + error); if (ret < 0) return ret; if (spec) { + if (spec->protocol) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + item, + "VxLAN-GPE protocol" + " not supported"); memcpy(&id.vni[1], spec->vni, 3); - vxlan_gpe.val.tunnel_id = id.vlan_id; + vlan_id = id.vlan_id; memcpy(&id.vni[1], mask->vni, 3); - vxlan_gpe.mask.tunnel_id = id.vlan_id; - if (spec->protocol) - return rte_flow_error_set - (error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, - item, - "VxLAN-GPE protocol not supported"); - /* Remove unwanted bits from values. */ - vxlan_gpe.val.tunnel_id &= vxlan_gpe.mask.tunnel_id; + vlan_id &= id.vlan_id; } /* * Tunnel id 0 is equivalent as not adding a VXLAN layer, if only this @@ -1554,141 +1511,55 @@ mlx5_flow_item_vxlan_gpe(struct rte_eth_dev *dev, * before will also match this rule. To avoid such situation, VNI 0 * is currently refused. 
*/ - if (!vxlan_gpe.val.tunnel_id) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (!vlan_id) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, "VXLAN-GPE vni cannot be 0"); - if (!(flow->layers & MLX5_FLOW_LAYER_OUTER)) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, "VXLAN-GPE tunnel must be fully" " defined"); - if (size <= flow_size) { - mlx5_flow_spec_verbs_add(flow, &vxlan_gpe, size); - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; - } - flow->layers |= MLX5_FLOW_LAYER_VXLAN_GPE; - return size; -} - -/** - * Update the protocol in Verbs IPv4/IPv6 spec. - * - * @param[in, out] attr - * Pointer to Verbs attributes structure. - * @param[in] search - * Specification type to search in order to update the IP protocol. - * @param[in] protocol - * Protocol value to set if none is present in the specification. - */ -static void -mlx5_flow_item_gre_ip_protocol_update(struct ibv_flow_attr *attr, - enum ibv_flow_spec_type search, - uint8_t protocol) -{ - unsigned int i; - struct ibv_spec_header *hdr = (struct ibv_spec_header *) - ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); - - if (!attr) - return; - for (i = 0; i != attr->num_of_specs; ++i) { - if (hdr->type == search) { - union { - struct ibv_flow_spec_ipv4_ext *ipv4; - struct ibv_flow_spec_ipv6 *ipv6; - } ip; - - switch (search) { - case IBV_FLOW_SPEC_IPV4_EXT: - ip.ipv4 = (struct ibv_flow_spec_ipv4_ext *)hdr; - if (!ip.ipv4->val.proto) { - ip.ipv4->val.proto = protocol; - ip.ipv4->mask.proto = 0xff; - } - break; - case IBV_FLOW_SPEC_IPV6: - ip.ipv6 = (struct ibv_flow_spec_ipv6 *)hdr; - if (!ip.ipv6->val.next_hdr) { - ip.ipv6->val.next_hdr = protocol; - ip.ipv6->mask.next_hdr = 0xff; - } - break; - default: - break; - } - break; - } - hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); - } + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * It will also update the previous L3 layer with the protocol value matching - * the GRE. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate GRE item. * - * @param dev - * Pointer to Ethernet device. * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit flags to mark detected items. + * @param[in] target_protocol + * The next protocol in the previous item. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. 
*/ -static int -mlx5_flow_item_gre(const struct rte_flow_item *item, - struct rte_flow *flow, const size_t flow_size, - struct rte_flow_error *error) +int +mlx5_flow_validate_item_gre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error) { - struct mlx5_flow_verbs *verbs = flow->cur_verbs; - const struct rte_flow_item_gre *spec = item->spec; + const struct rte_flow_item_gre *spec __rte_unused = item->spec; const struct rte_flow_item_gre *mask = item->mask; -#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT - unsigned int size = sizeof(struct ibv_flow_spec_gre); - struct ibv_flow_spec_gre tunnel = { - .type = IBV_FLOW_SPEC_GRE, - .size = size, - }; -#else - unsigned int size = sizeof(struct ibv_flow_spec_tunnel); - struct ibv_flow_spec_tunnel tunnel = { - .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, - .size = size, - }; -#endif int ret; - if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_GRE) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (target_protocol != 0xff && target_protocol != IPPROTO_GRE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "protocol filtering not compatible" " with this GRE layer"); - if (flow->layers & MLX5_FLOW_LAYER_TUNNEL) + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "a tunnel is already present"); - if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L3)) + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3)) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "L3 Layer is missing"); if (!mask) mask = &rte_flow_item_gre_mask; @@ -1698,92 +1569,50 @@ mlx5_flow_item_gre(const struct rte_flow_item *item, sizeof(struct rte_flow_item_gre), error); if (ret < 0) return ret; -#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT - if (spec) { - tunnel.val.c_ks_res0_ver = spec->c_rsvd0_ver; - tunnel.val.protocol = spec->protocol; - tunnel.mask.c_ks_res0_ver = mask->c_rsvd0_ver; - tunnel.mask.protocol = mask->protocol; - /* Remove unwanted bits from values. */ - tunnel.val.c_ks_res0_ver &= tunnel.mask.c_ks_res0_ver; - tunnel.val.protocol &= tunnel.mask.protocol; - tunnel.val.key &= tunnel.mask.key; - } -#else +#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT if (spec && (spec->protocol & mask->protocol)) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "without MPLS support the" " specification cannot be used for" " filtering"); -#endif /* !HAVE_IBV_DEVICE_MPLS_SUPPORT */ - if (size <= flow_size) { - if (flow->layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4) - mlx5_flow_item_gre_ip_protocol_update - (verbs->attr, IBV_FLOW_SPEC_IPV4_EXT, - MLX5_IP_PROTOCOL_GRE); - else - mlx5_flow_item_gre_ip_protocol_update - (verbs->attr, IBV_FLOW_SPEC_IPV6, - MLX5_IP_PROTOCOL_GRE); - mlx5_flow_spec_verbs_add(flow, &tunnel, size); - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; - } - flow->layers |= MLX5_FLOW_LAYER_GRE; - return size; +#endif + return 0; } /** - * Convert the @p item into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. + * Validate MPLS item. * * @param[in] item * Item specification. - * @param[in, out] flow - * Pointer to flow structure. 
- * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. + * @param[in] item_flags + * Bit-fields that holds the items detected until now. + * @param[in] target_protocol + * The next protocol in the previous item. * @param[out] error * Pointer to error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p item has fully been converted, - * otherwise another call with this returned memory size should be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static int -mlx5_flow_item_mpls(const struct rte_flow_item *item __rte_unused, - struct rte_flow *flow __rte_unused, - const size_t flow_size __rte_unused, - struct rte_flow_error *error) +int +mlx5_flow_validate_item_mpls(const struct rte_flow_item *item __rte_unused, + uint64_t item_flags __rte_unused, + uint8_t target_protocol __rte_unused, + struct rte_flow_error *error) { #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT - const struct rte_flow_item_mpls *spec = item->spec; const struct rte_flow_item_mpls *mask = item->mask; - unsigned int size = sizeof(struct ibv_flow_spec_mpls); - struct ibv_flow_spec_mpls mpls = { - .type = IBV_FLOW_SPEC_MPLS, - .size = size, - }; int ret; - if (flow->l3_protocol_en && flow->l3_protocol != MLX5_IP_PROTOCOL_MPLS) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + if (target_protocol != 0xff && target_protocol != IPPROTO_MPLS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, "protocol filtering not compatible" " with MPLS layer"); - /* Multi-tunnel isn't allowed but MPLS over GRE is an exception. */ - if (flow->layers & MLX5_FLOW_LAYER_TUNNEL && - (flow->layers & MLX5_FLOW_LAYER_GRE) != MLX5_FLOW_LAYER_GRE) + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "a tunnel is already" " present"); if (!mask) @@ -1794,1056 +1623,298 @@ mlx5_flow_item_mpls(const struct rte_flow_item *item __rte_unused, sizeof(struct rte_flow_item_mpls), error); if (ret < 0) return ret; - if (spec) { - memcpy(&mpls.val.label, spec, sizeof(mpls.val.label)); - memcpy(&mpls.mask.label, mask, sizeof(mpls.mask.label)); - /* Remove unwanted bits from values. */ - mpls.val.label &= mpls.mask.label; - } - if (size <= flow_size) { - mlx5_flow_spec_verbs_add(flow, &mpls, size); - flow->cur_verbs->attr->priority = MLX5_PRIORITY_MAP_L2; - } - flow->layers |= MLX5_FLOW_LAYER_MPLS; - return size; -#endif /* !HAVE_IBV_DEVICE_MPLS_SUPPORT */ + return 0; +#endif return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - item, + RTE_FLOW_ERROR_TYPE_ITEM, item, "MPLS is not supported by Verbs, please" " update."); } -/** - * Convert the @p pattern into a Verbs specifications after ensuring the NIC - * will understand and process it correctly. - * The conversion is performed item per item, each of them is written into - * the @p flow if its size is lesser or equal to @p flow_size. - * Validation and memory consumption computation are still performed until the - * end of @p pattern, unless an error is encountered. - * - * @param[in] pattern - * Flow pattern. - * @param[in, out] flow - * Pointer to the rte_flow structure. 
- * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small some - * garbage may be present. - * @param[out] error - * Pointer to error structure. - * - * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @pattern has fully been - * converted, otherwise another call with this returned memory size should - * be done. - * On error, a negative errno value is returned and rte_errno is set. - */ static int -mlx5_flow_items(struct rte_eth_dev *dev, - const struct rte_flow_item pattern[], - struct rte_flow *flow, const size_t flow_size, - struct rte_flow_error *error) +flow_null_validate(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error __rte_unused) { - int remain = flow_size; - size_t size = 0; - - for (; pattern->type != RTE_FLOW_ITEM_TYPE_END; pattern++) { - int ret = 0; - - switch (pattern->type) { - case RTE_FLOW_ITEM_TYPE_VOID: - break; - case RTE_FLOW_ITEM_TYPE_ETH: - ret = mlx5_flow_item_eth(pattern, flow, remain, error); - break; - case RTE_FLOW_ITEM_TYPE_VLAN: - ret = mlx5_flow_item_vlan(pattern, flow, remain, error); - break; - case RTE_FLOW_ITEM_TYPE_IPV4: - ret = mlx5_flow_item_ipv4(pattern, flow, remain, error); - break; - case RTE_FLOW_ITEM_TYPE_IPV6: - ret = mlx5_flow_item_ipv6(pattern, flow, remain, error); - break; - case RTE_FLOW_ITEM_TYPE_UDP: - ret = mlx5_flow_item_udp(pattern, flow, remain, error); - break; - case RTE_FLOW_ITEM_TYPE_TCP: - ret = mlx5_flow_item_tcp(pattern, flow, remain, error); - break; - case RTE_FLOW_ITEM_TYPE_VXLAN: - ret = mlx5_flow_item_vxlan(pattern, flow, remain, - error); - break; - case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: - ret = mlx5_flow_item_vxlan_gpe(dev, pattern, flow, - remain, error); - break; - case RTE_FLOW_ITEM_TYPE_GRE: - ret = mlx5_flow_item_gre(pattern, flow, remain, error); - break; - case RTE_FLOW_ITEM_TYPE_MPLS: - ret = mlx5_flow_item_mpls(pattern, flow, remain, error); - break; - default: - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - pattern, - "item not supported"); - } - if (ret < 0) - return ret; - if (remain > ret) - remain -= ret; - else - remain = 0; - size += ret; - } - if (!flow->layers) { - const struct rte_flow_item item = { - .type = RTE_FLOW_ITEM_TYPE_ETH, - }; - - return mlx5_flow_item_eth(&item, flow, flow_size, error); - } - return size; + rte_errno = ENOTSUP; + return -rte_errno; } -/** - * Convert the @p action into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. - * - * @param[in] action - * Action configuration. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. - * @param[out] error - * Pointer to error structure. - * - * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p action has fully been - * converted, otherwise another call with this returned memory size should - * be done. - * On error, a negative errno value is returned and rte_errno is set. 
- */ -static int -mlx5_flow_action_drop(const struct rte_flow_action *action, - struct rte_flow *flow, const size_t flow_size, - struct rte_flow_error *error) +static struct mlx5_flow * +flow_null_prepare(const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + uint64_t *item_flags __rte_unused, + uint64_t *action_flags __rte_unused, + struct rte_flow_error *error __rte_unused) { - unsigned int size = sizeof(struct ibv_flow_spec_action_drop); - struct ibv_flow_spec_action_drop drop = { - .type = IBV_FLOW_SPEC_ACTION_DROP, - .size = size, - }; - - if (flow->fate) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "multiple fate actions are not" - " supported"); - if (flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK)) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "drop is not compatible with" - " flag/mark action"); - if (size < flow_size) - mlx5_flow_spec_verbs_add(flow, &drop, size); - flow->fate |= MLX5_FLOW_FATE_DROP; - return size; + rte_errno = ENOTSUP; + return NULL; } -/** - * Convert the @p action into @p flow after ensuring the NIC will understand - * and process it correctly. - * - * @param[in] dev - * Pointer to Ethernet device structure. - * @param[in] action - * Action configuration. - * @param[in, out] flow - * Pointer to flow structure. - * @param[out] error - * Pointer to error structure. - * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. - */ static int -mlx5_flow_action_queue(struct rte_eth_dev *dev, - const struct rte_flow_action *action, - struct rte_flow *flow, - struct rte_flow_error *error) +flow_null_translate(struct rte_eth_dev *dev __rte_unused, + struct mlx5_flow *dev_flow __rte_unused, + const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error __rte_unused) { - struct priv *priv = dev->data->dev_private; - const struct rte_flow_action_queue *queue = action->conf; - - if (flow->fate) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "multiple fate actions are not" - " supported"); - if (queue->index >= priv->rxqs_n) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &queue->index, - "queue index out of range"); - if (!(*priv->rxqs)[queue->index]) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &queue->index, - "queue is not configured"); - if (flow->queue) - (*flow->queue)[0] = queue->index; - flow->rss.queue_num = 1; - flow->fate |= MLX5_FLOW_FATE_QUEUE; - return 0; + rte_errno = ENOTSUP; + return -rte_errno; } -/** - * Ensure the @p action will be understood and used correctly by the NIC. - * - * @param dev - * Pointer to Ethernet device structure. - * @param action[in] - * Pointer to flow actions array. - * @param flow[in, out] - * Pointer to the rte_flow structure. - * @param error[in, out] - * Pointer to error structure. - * - * @return - * On success @p flow->queue array and @p flow->rss are filled and valid. - * On error, a negative errno value is returned and rte_errno is set. 
- */ static int -mlx5_flow_action_rss(struct rte_eth_dev *dev, - const struct rte_flow_action *action, - struct rte_flow *flow, - struct rte_flow_error *error) +flow_null_apply(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused, + struct rte_flow_error *error __rte_unused) { - struct priv *priv = dev->data->dev_private; - const struct rte_flow_action_rss *rss = action->conf; - unsigned int i; - - if (flow->fate) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "multiple fate actions are not" - " supported"); - if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT && - rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &rss->func, - "RSS hash function not supported"); -#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT - if (rss->level > 2) -#else - if (rss->level > 1) -#endif - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &rss->level, - "tunnel RSS is not supported"); - if (rss->key_len < MLX5_RSS_HASH_KEY_LEN) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &rss->key_len, - "RSS hash key too small"); - if (rss->key_len > MLX5_RSS_HASH_KEY_LEN) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &rss->key_len, - "RSS hash key too large"); - if (!rss->queue_num) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - rss, - "no queues were provided for RSS"); - if (rss->queue_num > priv->config.ind_table_max_size) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &rss->queue_num, - "number of queues too large"); - if (rss->types & MLX5_RSS_HF_MASK) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &rss->types, - "some RSS protocols are not" - " supported"); - for (i = 0; i != rss->queue_num; ++i) { - if (rss->queue[i] >= priv->rxqs_n) - return rte_flow_error_set - (error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - rss, - "queue index out of range"); - if (!(*priv->rxqs)[rss->queue[i]]) - return rte_flow_error_set - (error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &rss->queue[i], - "queue is not configured"); - } - if (flow->queue) - memcpy((*flow->queue), rss->queue, - rss->queue_num * sizeof(uint16_t)); - flow->rss.queue_num = rss->queue_num; - memcpy(flow->key, rss->key, MLX5_RSS_HASH_KEY_LEN); - flow->rss.types = rss->types; - flow->rss.level = rss->level; - flow->fate |= MLX5_FLOW_FATE_RSS; - return 0; + rte_errno = ENOTSUP; + return -rte_errno; } -/** - * Convert the @p action into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. - * - * @param[in] action - * Action configuration. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. - * @param[out] error - * Pointer to error structure. - * - * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p action has fully been - * converted, otherwise another call with this returned memory size should - * be done. - * On error, a negative errno value is returned and rte_errno is set. 
- */ -static int -mlx5_flow_action_flag(const struct rte_flow_action *action, - struct rte_flow *flow, const size_t flow_size, - struct rte_flow_error *error) -{ - unsigned int size = sizeof(struct ibv_flow_spec_action_tag); - struct ibv_flow_spec_action_tag tag = { - .type = IBV_FLOW_SPEC_ACTION_TAG, - .size = size, - .tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT), - }; - struct mlx5_flow_verbs *verbs = flow->cur_verbs; - - if (flow->modifier & MLX5_FLOW_MOD_FLAG) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "flag action already present"); - if (flow->fate & MLX5_FLOW_FATE_DROP) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "flag is not compatible with drop" - " action"); - if (flow->modifier & MLX5_FLOW_MOD_MARK) - size = 0; - else if (size <= flow_size && verbs) - mlx5_flow_spec_verbs_add(flow, &tag, size); - flow->modifier |= MLX5_FLOW_MOD_FLAG; - return size; +static void +flow_null_remove(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused) +{ } -/** - * Update verbs specification to modify the flag to mark. - * - * @param[in, out] verbs - * Pointer to the mlx5_flow_verbs structure. - * @param[in] mark_id - * Mark identifier to replace the flag. - */ static void -mlx5_flow_verbs_mark_update(struct mlx5_flow_verbs *verbs, uint32_t mark_id) +flow_null_destroy(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused) { - struct ibv_spec_header *hdr; - int i; - - if (!verbs) - return; - /* Update Verbs specification. */ - hdr = (struct ibv_spec_header *)verbs->specs; - if (!hdr) - return; - for (i = 0; i != verbs->attr->num_of_specs; ++i) { - if (hdr->type == IBV_FLOW_SPEC_ACTION_TAG) { - struct ibv_flow_spec_action_tag *t = - (struct ibv_flow_spec_action_tag *)hdr; - - t->tag_id = mlx5_flow_mark_set(mark_id); - } - hdr = (struct ibv_spec_header *)((uintptr_t)hdr + hdr->size); - } } -/** - * Convert the @p action into @p flow (or by updating the already present - * Flag Verbs specification) after ensuring the NIC will understand and - * process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. - * - * @param[in] action - * Action configuration. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. - * @param[out] error - * Pointer to error structure. - * - * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p action has fully been - * converted, otherwise another call with this returned memory size should - * be done. - * On error, a negative errno value is returned and rte_errno is set. 
- */ static int -mlx5_flow_action_mark(const struct rte_flow_action *action, - struct rte_flow *flow, const size_t flow_size, - struct rte_flow_error *error) +flow_null_query(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused, + const struct rte_flow_action *actions __rte_unused, + void *data __rte_unused, + struct rte_flow_error *error __rte_unused) { - const struct rte_flow_action_mark *mark = action->conf; - unsigned int size = sizeof(struct ibv_flow_spec_action_tag); - struct ibv_flow_spec_action_tag tag = { - .type = IBV_FLOW_SPEC_ACTION_TAG, - .size = size, - }; - struct mlx5_flow_verbs *verbs = flow->cur_verbs; - - if (!mark) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "configuration cannot be null"); - if (mark->id >= MLX5_FLOW_MARK_MAX) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION_CONF, - &mark->id, - "mark id must in 0 <= id < " - RTE_STR(MLX5_FLOW_MARK_MAX)); - if (flow->modifier & MLX5_FLOW_MOD_MARK) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "mark action already present"); - if (flow->fate & MLX5_FLOW_FATE_DROP) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "mark is not compatible with drop" - " action"); - if (flow->modifier & MLX5_FLOW_MOD_FLAG) { - mlx5_flow_verbs_mark_update(verbs, mark->id); - size = 0; - } else if (size <= flow_size) { - tag.tag_id = mlx5_flow_mark_set(mark->id); - mlx5_flow_spec_verbs_add(flow, &tag, size); - } - flow->modifier |= MLX5_FLOW_MOD_MARK; - return size; + rte_errno = ENOTSUP; + return -rte_errno; } +/* Void driver to protect from null pointer reference. */ +const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops = { + .validate = flow_null_validate, + .prepare = flow_null_prepare, + .translate = flow_null_translate, + .apply = flow_null_apply, + .remove = flow_null_remove, + .destroy = flow_null_destroy, + .query = flow_null_query, +}; + /** - * Convert the @p action into a Verbs specification after ensuring the NIC - * will understand and process it correctly. - * If the necessary size for the conversion is greater than the @p flow_size, - * nothing is written in @p flow, the validation is still performed. - * - * @param action[in] - * Action configuration. - * @param flow[in, out] - * Pointer to flow structure. - * @param flow_size[in] - * Size in bytes of the available space in @p flow, if too small, nothing is - * written. - * @param error[int, out] - * Pointer to error structure. + * Select flow driver type according to flow attributes and device + * configuration. + * + * @param[in] dev + * Pointer to the dev structure. + * @param[in] attr + * Pointer to the flow attributes. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p action has fully been - * converted, otherwise another call with this returned memory size should - * be done. - * On error, a negative errno value is returned and rte_errno is set. + * flow driver type, MLX5_FLOW_TYPE_MAX otherwise. 
*/ -static int -mlx5_flow_action_count(struct rte_eth_dev *dev, - const struct rte_flow_action *action, - struct rte_flow *flow, - const size_t flow_size __rte_unused, - struct rte_flow_error *error) -{ - const struct rte_flow_action_count *count = action->conf; -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - unsigned int size = sizeof(struct ibv_flow_spec_counter_action); - struct ibv_flow_spec_counter_action counter = { - .type = IBV_FLOW_SPEC_ACTION_COUNT, - .size = size, - }; -#endif +static enum mlx5_flow_drv_type +flow_get_drv_type(struct rte_eth_dev *dev, const struct rte_flow_attr *attr) +{ + struct priv *priv = dev->data->dev_private; + enum mlx5_flow_drv_type type = MLX5_FLOW_TYPE_MAX; - if (!flow->counter) { - flow->counter = mlx5_flow_counter_new(dev, count->shared, - count->id); - if (!flow->counter) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "cannot get counter" - " context."); - } - if (!((struct priv *)dev->data->dev_private)->config.flow_counter_en) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - action, - "flow counters are not supported."); - flow->modifier |= MLX5_FLOW_MOD_COUNT; -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - counter.counter_set_handle = flow->counter->cs->handle; - if (size <= flow_size) - mlx5_flow_spec_verbs_add(flow, &counter, size); - return size; -#endif - return 0; + if (attr->transfer) + type = MLX5_FLOW_TYPE_TCF; + else + type = priv->config.dv_flow_en ? MLX5_FLOW_TYPE_DV : + MLX5_FLOW_TYPE_VERBS; + return type; } +#define flow_get_drv_ops(type) flow_drv_ops[type] + /** - * Convert the @p action into @p flow after ensuring the NIC will understand - * and process it correctly. - * The conversion is performed action per action, each of them is written into - * the @p flow if its size is lesser or equal to @p flow_size. - * Validation and memory consumption computation are still performed until the - * end of @p action, unless an error is encountered. + * Flow driver validation API. This abstracts calling driver specific functions. + * The type of flow driver is determined according to flow attributes. * * @param[in] dev - * Pointer to Ethernet device structure. + * Pointer to the dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. * @param[in] actions - * Pointer to flow actions array. - * @param[in, out] flow - * Pointer to the rte_flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small some - * garbage may be present. + * Pointer to the list of actions. * @param[out] error - * Pointer to error structure. + * Pointer to the error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the @p actions has fully been - * converted, otherwise another call with this returned memory size should - * be done. - * On error, a negative errno value is returned and rte_errno is set. + * 0 on success, a negative errno value otherwise and rte_ernno is set. 
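Illustrative sketch (not part of the patch): flow_get_drv_ops() above indexes a per-type dispatch table which is defined elsewhere in mlx5_flow.c and does not appear in this hunk. Assuming the backend ops are exported as mlx5_flow_dv_drv_ops, mlx5_flow_tcf_drv_ops and mlx5_flow_verbs_drv_ops, the table is expected to look roughly like this, with the null driver guarding the unused MIN/MAX slots:

	static const struct mlx5_flow_driver_ops *flow_drv_ops[] = {
		[MLX5_FLOW_TYPE_MIN] = &mlx5_flow_null_drv_ops,
	#ifdef HAVE_IBV_FLOW_DV_SUPPORT
		[MLX5_FLOW_TYPE_DV] = &mlx5_flow_dv_drv_ops,
	#endif
		[MLX5_FLOW_TYPE_TCF] = &mlx5_flow_tcf_drv_ops,
		[MLX5_FLOW_TYPE_VERBS] = &mlx5_flow_verbs_drv_ops,
		/* Catch out-of-range types instead of dereferencing NULL. */
		[MLX5_FLOW_TYPE_MAX] = &mlx5_flow_null_drv_ops,
	};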
*/ -static int -mlx5_flow_actions(struct rte_eth_dev *dev, +static inline int +flow_drv_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], const struct rte_flow_action actions[], - struct rte_flow *flow, const size_t flow_size, struct rte_flow_error *error) { - size_t size = 0; - int remain = flow_size; - int ret = 0; + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow_get_drv_type(dev, attr); - for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { - switch (actions->type) { - case RTE_FLOW_ACTION_TYPE_VOID: - break; - case RTE_FLOW_ACTION_TYPE_FLAG: - ret = mlx5_flow_action_flag(actions, flow, remain, - error); - break; - case RTE_FLOW_ACTION_TYPE_MARK: - ret = mlx5_flow_action_mark(actions, flow, remain, - error); - break; - case RTE_FLOW_ACTION_TYPE_DROP: - ret = mlx5_flow_action_drop(actions, flow, remain, - error); - break; - case RTE_FLOW_ACTION_TYPE_QUEUE: - ret = mlx5_flow_action_queue(dev, actions, flow, error); - break; - case RTE_FLOW_ACTION_TYPE_RSS: - ret = mlx5_flow_action_rss(dev, actions, flow, error); - break; - case RTE_FLOW_ACTION_TYPE_COUNT: - ret = mlx5_flow_action_count(dev, actions, flow, remain, - error); - break; - default: - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "action not supported"); - } - if (ret < 0) - return ret; - if (remain > ret) - remain -= ret; - else - remain = 0; - size += ret; - } - if (!flow->fate) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "no fate action found"); - return size; + fops = flow_get_drv_ops(type); + return fops->validate(dev, attr, items, actions, error); } /** - * Validate flow rule and fill flow structure accordingly. + * Flow driver preparation API. This abstracts calling driver specific + * functions. Parent flow (rte_flow) should have driver type (drv_type). It + * calculates the size of memory required for device flow, allocates the memory, + * initializes the device flow and returns the pointer. * - * @param dev - * Pointer to Ethernet device. - * @param[out] flow - * Pointer to flow structure. - * @param flow_size - * Size of allocated space for @p flow. * @param[in] attr - * Flow rule attributes. - * @param[in] pattern - * Pattern specification (list terminated by the END pattern item). + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. * @param[in] actions - * Associated actions (list terminated by the END action). + * Pointer to the list of actions. + * @param[out] item_flags + * Pointer to bit mask of all items detected. + * @param[out] action_flags + * Pointer to bit mask of all actions detected. * @param[out] error - * Perform verbose error reporting if not NULL. + * Pointer to the error structure. * * @return - * A positive value representing the size of the flow object in bytes - * regardless of @p flow_size on success, a negative errno value otherwise - * and rte_errno is set. + * Pointer to device flow on success, otherwise NULL and rte_ernno is set. 
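Illustrative sketch (not part of the patch): per the description above, a backend's prepare() callback only has to size and allocate the struct mlx5_flow container that translate() will later fill. A minimal hypothetical implementation matching the mlx5_flow_prepare_t signature introduced in mlx5_flow.h could look as follows (the name flow_sample_prepare is invented; the real DV/Verbs versions additionally reserve space for their specification buffers):

	static struct mlx5_flow *
	flow_sample_prepare(const struct rte_flow_attr *attr __rte_unused,
			    const struct rte_flow_item items[] __rte_unused,
			    const struct rte_flow_action actions[] __rte_unused,
			    uint64_t *item_flags __rte_unused,
			    uint64_t *action_flags __rte_unused,
			    struct rte_flow_error *error)
	{
		struct mlx5_flow *dev_flow;

		/* Allocate the device flow that translate() will fill in. */
		dev_flow = rte_calloc(__func__, 1, sizeof(*dev_flow), 0);
		if (!dev_flow) {
			rte_flow_error_set(error, ENOMEM,
					   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
					   "not enough memory for device flow");
			return NULL;
		}
		return dev_flow;
	}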
*/ -static int -mlx5_flow_merge_switch(struct rte_eth_dev *dev, - struct rte_flow *flow, - size_t flow_size, - const struct rte_flow_attr *attr, - const struct rte_flow_item pattern[], - const struct rte_flow_action actions[], - struct rte_flow_error *error) -{ - unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0); - uint16_t port_id[!n + n]; - struct mlx5_nl_flow_ptoi ptoi[!n + n + 1]; - size_t off = RTE_ALIGN_CEIL(sizeof(*flow), alignof(max_align_t)); - unsigned int i; - unsigned int own = 0; - int ret; - - /* At least one port is needed when no switch domain is present. */ - if (!n) { - n = 1; - port_id[0] = dev->data->port_id; - } else { - n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n); - } - for (i = 0; i != n; ++i) { - struct rte_eth_dev_info dev_info; - - rte_eth_dev_info_get(port_id[i], &dev_info); - if (port_id[i] == dev->data->port_id) - own = i; - ptoi[i].port_id = port_id[i]; - ptoi[i].ifindex = dev_info.if_index; - } - /* Ensure first entry of ptoi[] is the current device. */ - if (own) { - ptoi[n] = ptoi[0]; - ptoi[0] = ptoi[own]; - ptoi[own] = ptoi[n]; - } - /* An entry with zero ifindex terminates ptoi[]. */ - ptoi[n].port_id = 0; - ptoi[n].ifindex = 0; - if (flow_size < off) - flow_size = 0; - ret = mlx5_nl_flow_transpose((uint8_t *)flow + off, - flow_size ? flow_size - off : 0, - ptoi, attr, pattern, actions, error); - if (ret < 0) - return ret; - if (flow_size) { - *flow = (struct rte_flow){ - .attributes = *attr, - .nl_flow = (uint8_t *)flow + off, - }; - /* - * Generate a reasonably unique handle based on the address - * of the target buffer. - * - * This is straightforward on 32-bit systems where the flow - * pointer can be used directly. Otherwise, its least - * significant part is taken after shifting it by the - * previous power of two of the pointed buffer size. - */ - if (sizeof(flow) <= 4) - mlx5_nl_flow_brand(flow->nl_flow, (uintptr_t)flow); - else - mlx5_nl_flow_brand - (flow->nl_flow, - (uintptr_t)flow >> - rte_log2_u32(rte_align32prevpow2(flow_size))); - } - return off + ret; -} - -static unsigned int -mlx5_find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level) +static inline struct mlx5_flow * +flow_drv_prepare(struct rte_flow *flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + uint64_t *item_flags, + uint64_t *action_flags, + struct rte_flow_error *error) { - const struct rte_flow_item *item; - unsigned int has_vlan = 0; + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; - for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { - if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) { - has_vlan = 1; - break; - } - } - if (has_vlan) - return rss_level < 2 ? MLX5_EXPANSION_ROOT_ETH_VLAN : - MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN; - return rss_level < 2 ? MLX5_EXPANSION_ROOT : - MLX5_EXPANSION_ROOT_OUTER; + assert(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + return fops->prepare(attr, items, actions, item_flags, action_flags, + error); } /** - * Convert the @p attributes, @p pattern, @p action, into an flow for the NIC - * after ensuring the NIC will understand and process it correctly. - * The conversion is only performed item/action per item/action, each of - * them is written into the @p flow if its size is lesser or equal to @p - * flow_size. 
- * Validation and memory consumption computation are still performed until the - * end, unless an error is encountered. + * Flow driver translation API. This abstracts calling driver specific + * functions. Parent flow (rte_flow) should have driver type (drv_type). It + * translates a generic flow into a driver flow. flow_drv_prepare() must + * precede. + * * * @param[in] dev - * Pointer to Ethernet device. - * @param[in, out] flow - * Pointer to flow structure. - * @param[in] flow_size - * Size in bytes of the available space in @p flow, if too small some - * garbage may be present. - * @param[in] attributes - * Flow rule attributes. - * @param[in] pattern - * Pattern specification (list terminated by the END pattern item). + * Pointer to the rte dev structure. + * @param[in, out] dev_flow + * Pointer to the mlx5 flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. * @param[in] actions - * Associated actions (list terminated by the END action). + * Pointer to the list of actions. * @param[out] error - * Perform verbose error reporting if not NULL. + * Pointer to the error structure. * * @return - * On success the number of bytes consumed/necessary, if the returned value - * is lesser or equal to @p flow_size, the flow has fully been converted and - * can be applied, otherwise another call with this returned memory size - * should be done. - * On error, a negative errno value is returned and rte_errno is set. - */ -static int -mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow, - const size_t flow_size, - const struct rte_flow_attr *attributes, - const struct rte_flow_item pattern[], - const struct rte_flow_action actions[], - struct rte_flow_error *error) -{ - struct rte_flow local_flow = { .layers = 0, }; - size_t size = sizeof(*flow); - union { - struct rte_flow_expand_rss buf; - uint8_t buffer[2048]; - } expand_buffer; - struct rte_flow_expand_rss *buf = &expand_buffer.buf; - struct mlx5_flow_verbs *original_verbs = NULL; - size_t original_verbs_size = 0; - uint32_t original_layers = 0; - int expanded_pattern_idx = 0; - int ret; - uint32_t i; - - if (attributes->transfer) - return mlx5_flow_merge_switch(dev, flow, flow_size, - attributes, pattern, - actions, error); - if (size > flow_size) - flow = &local_flow; - ret = mlx5_flow_attributes(dev, attributes, flow, error); - if (ret < 0) - return ret; - ret = mlx5_flow_actions(dev, actions, &local_flow, 0, error); - if (ret < 0) - return ret; - if (local_flow.rss.types) { - unsigned int graph_root; - - graph_root = mlx5_find_graph_root(pattern, - local_flow.rss.level); - ret = rte_flow_expand_rss(buf, sizeof(expand_buffer.buffer), - pattern, local_flow.rss.types, - mlx5_support_expansion, - graph_root); - assert(ret > 0 && - (unsigned int)ret < sizeof(expand_buffer.buffer)); - } else { - buf->entries = 1; - buf->entry[0].pattern = (void *)(uintptr_t)pattern; - } - size += RTE_ALIGN_CEIL(local_flow.rss.queue_num * sizeof(uint16_t), - sizeof(void *)); - if (size <= flow_size) - flow->queue = (void *)(flow + 1); - LIST_INIT(&flow->verbs); - flow->layers = 0; - flow->modifier = 0; - flow->fate = 0; - for (i = 0; i != buf->entries; ++i) { - size_t off = size; - size_t off2; - - flow->layers = original_layers; - size += sizeof(struct ibv_flow_attr) + - sizeof(struct mlx5_flow_verbs); - off2 = size; - if (size < flow_size) { - flow->cur_verbs = (void *)((uintptr_t)flow + off); - flow->cur_verbs->attr = (void *)(flow->cur_verbs + 1); - flow->cur_verbs->specs = - (void 
*)(flow->cur_verbs->attr + 1); - } - /* First iteration convert the pattern into Verbs. */ - if (i == 0) { - /* Actions don't need to be converted several time. */ - ret = mlx5_flow_actions(dev, actions, flow, - (size < flow_size) ? - flow_size - size : 0, - error); - if (ret < 0) - return ret; - size += ret; - } else { - /* - * Next iteration means the pattern has already been - * converted and an expansion is necessary to match - * the user RSS request. For that only the expanded - * items will be converted, the common part with the - * user pattern are just copied into the next buffer - * zone. - */ - size += original_verbs_size; - if (size < flow_size) { - rte_memcpy(flow->cur_verbs->attr, - original_verbs->attr, - original_verbs_size + - sizeof(struct ibv_flow_attr)); - flow->cur_verbs->size = original_verbs_size; - } - } - ret = mlx5_flow_items - (dev, - (const struct rte_flow_item *) - &buf->entry[i].pattern[expanded_pattern_idx], - flow, - (size < flow_size) ? flow_size - size : 0, error); - if (ret < 0) - return ret; - size += ret; - if (size <= flow_size) { - mlx5_flow_adjust_priority(dev, flow); - LIST_INSERT_HEAD(&flow->verbs, flow->cur_verbs, next); - } - /* - * Keep a pointer of the first verbs conversion and the layers - * it has encountered. - */ - if (i == 0) { - original_verbs = flow->cur_verbs; - original_verbs_size = size - off2; - original_layers = flow->layers; - /* - * move the index of the expanded pattern to the - * first item not addressed yet. - */ - if (pattern->type == RTE_FLOW_ITEM_TYPE_END) { - expanded_pattern_idx++; - } else { - const struct rte_flow_item *item = pattern; - - for (item = pattern; - item->type != RTE_FLOW_ITEM_TYPE_END; - ++item) - expanded_pattern_idx++; - } - } - } - /* Restore the origin layers in the flow. */ - flow->layers = original_layers; - return size; -} - -/** - * Lookup and set the ptype in the data Rx part. A single Ptype can be used, - * if several tunnel rules are used on this queue, the tunnel ptype will be - * cleared. - * - * @param rxq_ctrl - * Rx queue to update. + * 0 on success, a negative errno value otherwise and rte_ernno is set. */ -static void -mlx5_flow_rxq_tunnel_ptype_update(struct mlx5_rxq_ctrl *rxq_ctrl) +static inline int +flow_drv_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) { - unsigned int i; - uint32_t tunnel_ptype = 0; + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = dev_flow->flow->drv_type; - /* Look up for the ptype to use. */ - for (i = 0; i != MLX5_FLOW_TUNNEL; ++i) { - if (!rxq_ctrl->flow_tunnels_n[i]) - continue; - if (!tunnel_ptype) { - tunnel_ptype = tunnels_info[i].ptype; - } else { - tunnel_ptype = 0; - break; - } - } - rxq_ctrl->rxq.tunnel = tunnel_ptype; + assert(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + return fops->translate(dev, dev_flow, attr, items, actions, error); } /** - * Set the Rx queue flags (Mark/Flag and Tunnel Ptypes) according to the flow. + * Flow driver apply API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It applies + * translated driver flows on to device. flow_drv_translate() must precede. * * @param[in] dev - * Pointer to Ethernet device. - * @param[in] flow + * Pointer to Ethernet device structure. + * @param[in, out] flow * Pointer to flow structure. 
+ * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static void -mlx5_flow_rxq_flags_set(struct rte_eth_dev *dev, struct rte_flow *flow) +static inline int +flow_drv_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; - const int mark = !!(flow->modifier & - (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK)); - const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); - unsigned int i; + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; - for (i = 0; i != flow->rss.queue_num; ++i) { - int idx = (*flow->queue)[i]; - struct mlx5_rxq_ctrl *rxq_ctrl = - container_of((*priv->rxqs)[idx], - struct mlx5_rxq_ctrl, rxq); - - if (mark) { - rxq_ctrl->rxq.mark = 1; - rxq_ctrl->flow_mark_n++; - } - if (tunnel) { - unsigned int j; - - /* Increase the counter matching the flow. */ - for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { - if ((tunnels_info[j].tunnel & flow->layers) == - tunnels_info[j].tunnel) { - rxq_ctrl->flow_tunnels_n[j]++; - break; - } - } - mlx5_flow_rxq_tunnel_ptype_update(rxq_ctrl); - } - } + assert(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + return fops->apply(dev, flow, error); } /** - * Clear the Rx queue flags (Mark/Flag and Tunnel Ptype) associated with the - * @p flow if no other flow uses it with the same kind of request. + * Flow driver remove API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It removes a flow + * on device. All the resources of the flow should be freed by calling + * flow_dv_destroy(). * - * @param dev + * @param[in] dev * Pointer to Ethernet device. - * @param[in] flow - * Pointer to the flow. + * @param[in, out] flow + * Pointer to flow structure. */ -static void -mlx5_flow_rxq_flags_trim(struct rte_eth_dev *dev, struct rte_flow *flow) +static inline void +flow_drv_remove(struct rte_eth_dev *dev, struct rte_flow *flow) { - struct priv *priv = dev->data->dev_private; - const int mark = !!(flow->modifier & - (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK)); - const int tunnel = !!(flow->layers & MLX5_FLOW_LAYER_TUNNEL); - unsigned int i; - - assert(dev->data->dev_started); - for (i = 0; i != flow->rss.queue_num; ++i) { - int idx = (*flow->queue)[i]; - struct mlx5_rxq_ctrl *rxq_ctrl = - container_of((*priv->rxqs)[idx], - struct mlx5_rxq_ctrl, rxq); - - if (mark) { - rxq_ctrl->flow_mark_n--; - rxq_ctrl->rxq.mark = !!rxq_ctrl->flow_mark_n; - } - if (tunnel) { - unsigned int j; + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; - /* Decrease the counter matching the flow. */ - for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) { - if ((tunnels_info[j].tunnel & flow->layers) == - tunnels_info[j].tunnel) { - rxq_ctrl->flow_tunnels_n[j]--; - break; - } - } - mlx5_flow_rxq_tunnel_ptype_update(rxq_ctrl); - } - } + assert(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + fops->remove(dev, flow); } /** - * Clear the Mark/Flag and Tunnel ptype information in all Rx queues. + * Flow driver destroy API. This abstracts calling driver specific functions. + * Parent flow (rte_flow) should have driver type (drv_type). It removes a flow + * on device and releases resources of the flow. * - * @param dev + * @param[in] dev * Pointer to Ethernet device. 
+ * @param[in, out] flow + * Pointer to flow structure. */ -static void -mlx5_flow_rxq_flags_clear(struct rte_eth_dev *dev) +static inline void +flow_drv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) { - struct priv *priv = dev->data->dev_private; - unsigned int i; - - for (i = 0; i != priv->rxqs_n; ++i) { - struct mlx5_rxq_ctrl *rxq_ctrl; - unsigned int j; + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow->drv_type; - if (!(*priv->rxqs)[i]) - continue; - rxq_ctrl = container_of((*priv->rxqs)[i], - struct mlx5_rxq_ctrl, rxq); - rxq_ctrl->flow_mark_n = 0; - rxq_ctrl->rxq.mark = 0; - for (j = 0; j != MLX5_FLOW_TUNNEL; ++j) - rxq_ctrl->flow_tunnels_n[j] = 0; - rxq_ctrl->rxq.tunnel = 0; - } + assert(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(type); + fops->destroy(dev, flow); } /** @@ -2859,134 +1930,55 @@ mlx5_flow_validate(struct rte_eth_dev *dev, const struct rte_flow_action actions[], struct rte_flow_error *error) { - int ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error); + int ret; + ret = flow_drv_validate(dev, attr, items, actions, error); if (ret < 0) return ret; return 0; } /** - * Remove the flow. + * Get RSS action from the action list. * - * @param[in] dev - * Pointer to Ethernet device. - * @param[in, out] flow - * Pointer to flow structure. + * @param[in] actions + * Pointer to the list of actions. + * + * @return + * Pointer to the RSS action if exist, else return NULL. */ -static void -mlx5_flow_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +static const struct rte_flow_action_rss* +flow_get_rss_action(const struct rte_flow_action actions[]) { - struct priv *priv = dev->data->dev_private; - struct mlx5_flow_verbs *verbs; - - if (flow->nl_flow && priv->mnl_socket) - mlx5_nl_flow_destroy(priv->mnl_socket, flow->nl_flow, NULL); - LIST_FOREACH(verbs, &flow->verbs, next) { - if (verbs->flow) { - claim_zero(mlx5_glue->destroy_flow(verbs->flow)); - verbs->flow = NULL; - } - if (verbs->hrxq) { - if (flow->fate & MLX5_FLOW_FATE_DROP) - mlx5_hrxq_drop_release(dev); - else - mlx5_hrxq_release(dev, verbs->hrxq); - verbs->hrxq = NULL; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_RSS: + return (const struct rte_flow_action_rss *) + actions->conf; + default: + break; } } - if (flow->counter) { - mlx5_flow_counter_release(flow->counter); - flow->counter = NULL; - } + return NULL; } -/** - * Apply the flow. - * - * @param[in] dev - * Pointer to Ethernet device structure. - * @param[in, out] flow - * Pointer to flow structure. - * @param[out] error - * Pointer to error structure. - * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. 
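Illustrative sketch (not part of the patch): from the application side, the attr/items/actions triplets that reach mlx5_flow_validate() and flow_list_create() are built through the generic rte_flow API. A minimal hypothetical rule directing IPv4 traffic to Rx queue 0 (port_id and queue index are placeholders):

	uint16_t port_id = 0;                        /* hypothetical port */
	struct rte_flow_error err;
	struct rte_flow_attr attr = { .ingress = 1 };
	struct rte_flow_action_queue queue = { .index = 0 };
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};
	struct rte_flow *flow = NULL;

	if (!rte_flow_validate(port_id, &attr, pattern, actions, &err))
		flow = rte_flow_create(port_id, &attr, pattern, actions, &err);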
- */ -static int -mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow, - struct rte_flow_error *error) +static unsigned int +find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level) { - struct priv *priv = dev->data->dev_private; - struct mlx5_flow_verbs *verbs; - int err; - - LIST_FOREACH(verbs, &flow->verbs, next) { - if (flow->fate & MLX5_FLOW_FATE_DROP) { - verbs->hrxq = mlx5_hrxq_drop_new(dev); - if (!verbs->hrxq) { - rte_flow_error_set - (error, errno, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "cannot get drop hash queue"); - goto error; - } - } else { - struct mlx5_hrxq *hrxq; - - hrxq = mlx5_hrxq_get(dev, flow->key, - MLX5_RSS_HASH_KEY_LEN, - verbs->hash_fields, - (*flow->queue), - flow->rss.queue_num); - if (!hrxq) - hrxq = mlx5_hrxq_new(dev, flow->key, - MLX5_RSS_HASH_KEY_LEN, - verbs->hash_fields, - (*flow->queue), - flow->rss.queue_num, - !!(flow->layers & - MLX5_FLOW_LAYER_TUNNEL)); - if (!hrxq) { - rte_flow_error_set - (error, rte_errno, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "cannot get hash queue"); - goto error; - } - verbs->hrxq = hrxq; - } - verbs->flow = - mlx5_glue->create_flow(verbs->hrxq->qp, verbs->attr); - if (!verbs->flow) { - rte_flow_error_set(error, errno, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "hardware refuses to create flow"); - goto error; - } - } - if (flow->nl_flow && - priv->mnl_socket && - mlx5_nl_flow_create(priv->mnl_socket, flow->nl_flow, error)) - goto error; - return 0; -error: - err = rte_errno; /* Save rte_errno before cleanup. */ - LIST_FOREACH(verbs, &flow->verbs, next) { - if (verbs->hrxq) { - if (flow->fate & MLX5_FLOW_FATE_DROP) - mlx5_hrxq_drop_release(dev); - else - mlx5_hrxq_release(dev, verbs->hrxq); - verbs->hrxq = NULL; + const struct rte_flow_item *item; + unsigned int has_vlan = 0; + + for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { + if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) { + has_vlan = 1; + break; } } - rte_errno = err; /* Restore rte_errno. */ - return -rte_errno; + if (has_vlan) + return rss_level < 2 ? MLX5_EXPANSION_ROOT_ETH_VLAN : + MLX5_EXPANSION_ROOT_OUTER_ETH_VLAN; + return rss_level < 2 ? MLX5_EXPANSION_ROOT : + MLX5_EXPANSION_ROOT_OUTER; } /** @@ -3009,50 +2001,90 @@ error: * A flow on success, NULL otherwise and rte_errno is set. 
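Illustrative sketch (not part of the patch): find_graph_root() above and the rte_flow_expand_rss() call in flow_list_create() below exist because a single RSS rule may require several hardware flows. For example, an RSS action such as the following, combined with a bare "eth / end" pattern, is expanded into roughly "eth", "eth / ipv4" and "eth / ipv4 / udp" entries, and one mlx5_flow is prepared and translated per entry (values are hypothetical):

	uint16_t queues[2] = { 0, 1 };               /* hypothetical Rx queues */
	struct rte_flow_action_rss rss = {
		.types = ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_UDP,
		.key_len = 0,                        /* default RSS key */
		.queue_num = 2,
		.queue = queues,
	};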
*/ static struct rte_flow * -mlx5_flow_list_create(struct rte_eth_dev *dev, - struct mlx5_flows *list, - const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - const struct rte_flow_action actions[], - struct rte_flow_error *error) +flow_list_create(struct rte_eth_dev *dev, struct mlx5_flows *list, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) { struct rte_flow *flow = NULL; - size_t size = 0; + struct mlx5_flow *dev_flow; + uint64_t action_flags = 0; + uint64_t item_flags = 0; + const struct rte_flow_action_rss *rss; + union { + struct rte_flow_expand_rss buf; + uint8_t buffer[2048]; + } expand_buffer; + struct rte_flow_expand_rss *buf = &expand_buffer.buf; int ret; + uint32_t i; + uint32_t flow_size; - ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error); + ret = flow_drv_validate(dev, attr, items, actions, error); if (ret < 0) return NULL; - size = ret; - flow = rte_calloc(__func__, 1, size, 0); - if (!flow) { - rte_flow_error_set(error, ENOMEM, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "not enough memory to create flow"); - return NULL; + flow_size = sizeof(struct rte_flow); + rss = flow_get_rss_action(actions); + if (rss) + flow_size += RTE_ALIGN_CEIL(rss->queue_num * sizeof(uint16_t), + sizeof(void *)); + else + flow_size += RTE_ALIGN_CEIL(sizeof(uint16_t), sizeof(void *)); + flow = rte_calloc(__func__, 1, flow_size, 0); + flow->drv_type = flow_get_drv_type(dev, attr); + assert(flow->drv_type > MLX5_FLOW_TYPE_MIN && + flow->drv_type < MLX5_FLOW_TYPE_MAX); + flow->queue = (void *)(flow + 1); + LIST_INIT(&flow->dev_flows); + if (rss && rss->types) { + unsigned int graph_root; + + graph_root = find_graph_root(items, rss->level); + ret = rte_flow_expand_rss(buf, sizeof(expand_buffer.buffer), + items, rss->types, + mlx5_support_expansion, + graph_root); + assert(ret > 0 && + (unsigned int)ret < sizeof(expand_buffer.buffer)); + } else { + buf->entries = 1; + buf->entry[0].pattern = (void *)(uintptr_t)items; } - ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error); - if (ret < 0) { - rte_free(flow); - return NULL; + for (i = 0; i < buf->entries; ++i) { + dev_flow = flow_drv_prepare(flow, attr, buf->entry[i].pattern, + actions, &item_flags, &action_flags, + error); + if (!dev_flow) + goto error; + dev_flow->flow = flow; + dev_flow->layers = item_flags; + /* Store actions once as expanded flows have same actions. */ + if (i == 0) + flow->actions = action_flags; + assert(flow->actions == action_flags); + LIST_INSERT_HEAD(&flow->dev_flows, dev_flow, next); + ret = flow_drv_translate(dev, dev_flow, attr, + buf->entry[i].pattern, + actions, error); + if (ret < 0) + goto error; } - assert((size_t)ret == size); if (dev->data->dev_started) { - ret = mlx5_flow_apply(dev, flow, error); - if (ret < 0) { - ret = rte_errno; /* Save rte_errno before cleanup. */ - if (flow) { - mlx5_flow_remove(dev, flow); - rte_free(flow); - } - rte_errno = ret; /* Restore rte_errno. */ - return NULL; - } + ret = flow_drv_apply(dev, flow, error); + if (ret < 0) + goto error; } TAILQ_INSERT_TAIL(list, flow, next); - mlx5_flow_rxq_flags_set(dev, flow); + flow_rxq_flags_set(dev, flow); return flow; +error: + ret = rte_errno; /* Save rte_errno before cleanup. */ + assert(flow); + flow_drv_destroy(dev, flow); + rte_free(flow); + rte_errno = ret; /* Restore rte_errno. 
*/ + return NULL; } /** @@ -3068,9 +2100,9 @@ mlx5_flow_create(struct rte_eth_dev *dev, const struct rte_flow_action actions[], struct rte_flow_error *error) { - return mlx5_flow_list_create - (dev, &((struct priv *)dev->data->dev_private)->flows, - attr, items, actions, error); + return flow_list_create(dev, + &((struct priv *)dev->data->dev_private)->flows, + attr, items, actions, error); } /** @@ -3084,17 +2116,17 @@ mlx5_flow_create(struct rte_eth_dev *dev, * Flow to destroy. */ static void -mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list, - struct rte_flow *flow) +flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list, + struct rte_flow *flow) { - mlx5_flow_remove(dev, flow); + flow_drv_destroy(dev, flow); TAILQ_REMOVE(list, flow, next); /* * Update RX queue flags only if port is started, otherwise it is * already clean. */ if (dev->data->dev_started) - mlx5_flow_rxq_flags_trim(dev, flow); + flow_rxq_flags_trim(dev, flow); rte_free(flow); } @@ -3113,7 +2145,7 @@ mlx5_flow_list_flush(struct rte_eth_dev *dev, struct mlx5_flows *list) struct rte_flow *flow; flow = TAILQ_FIRST(list); - mlx5_flow_list_destroy(dev, list, flow); + flow_list_destroy(dev, list, flow); } } @@ -3131,8 +2163,8 @@ mlx5_flow_stop(struct rte_eth_dev *dev, struct mlx5_flows *list) struct rte_flow *flow; TAILQ_FOREACH_REVERSE(flow, list, mlx5_flows, next) - mlx5_flow_remove(dev, flow); - mlx5_flow_rxq_flags_clear(dev); + flow_drv_remove(dev, flow); + flow_rxq_flags_clear(dev); } /** @@ -3154,10 +2186,10 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list) int ret = 0; TAILQ_FOREACH(flow, list, next) { - ret = mlx5_flow_apply(dev, flow, &error); + ret = flow_drv_apply(dev, flow, &error); if (ret < 0) goto error; - mlx5_flow_rxq_flags_set(dev, flow); + flow_rxq_flags_set(dev, flow); } return 0; error: @@ -3228,7 +2260,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, }, { .type = (vlan_spec) ? RTE_FLOW_ITEM_TYPE_VLAN : - RTE_FLOW_ITEM_TYPE_END, + RTE_FLOW_ITEM_TYPE_END, .spec = vlan_spec, .last = NULL, .mask = vlan_mask, @@ -3266,8 +2298,8 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, } for (i = 0; i != priv->reta_idx_n; ++i) queue[i] = (*priv->reta_idx)[i]; - flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items, - actions, &error); + flow = flow_list_create(dev, &priv->ctrl_flows, + &attr, items, actions, &error); if (!flow) return -rte_errno; return 0; @@ -3307,7 +2339,7 @@ mlx5_flow_destroy(struct rte_eth_dev *dev, { struct priv *priv = dev->data->dev_private; - mlx5_flow_list_destroy(dev, &priv->flows, flow); + flow_list_destroy(dev, &priv->flows, flow); return 0; } @@ -3356,92 +2388,45 @@ mlx5_flow_isolate(struct rte_eth_dev *dev, } /** - * Query flow counter. - * - * @param flow - * Pointer to the flow. + * Query a flow. * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. 
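Illustrative sketch (not part of the patch): the open-coded counter read below is rerouted through the per-driver query() callback via flow_drv_query(). From the application's point of view nothing changes; assuming a port_id and a previously created flow, reading the COUNT action still looks like:

	struct rte_flow_query_count cnt = { .reset = 1 };
	const struct rte_flow_action count_action[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_COUNT },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};
	struct rte_flow_error err;

	if (!rte_flow_query(port_id, flow, count_action, &cnt, &err)) {
		/* cnt.hits / cnt.bytes now hold the counter values;
		 * .reset = 1 clears the counter after reading. */
	}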
+ * @see rte_flow_query() + * @see rte_flow_ops */ static int -mlx5_flow_query_count(struct rte_flow *flow __rte_unused, - void *data __rte_unused, - struct rte_flow_error *error) -{ -#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT - if (flow->modifier & MLX5_FLOW_MOD_COUNT) { - struct rte_flow_query_count *qc = data; - uint64_t counters[2] = {0, 0}; - struct ibv_query_counter_set_attr query_cs_attr = { - .cs = flow->counter->cs, - .query_flags = IBV_COUNTER_SET_FORCE_UPDATE, - }; - struct ibv_counter_set_data query_out = { - .out = counters, - .outlen = 2 * sizeof(uint64_t), - }; - int err = mlx5_glue->query_counter_set(&query_cs_attr, - &query_out); +flow_drv_query(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type ftype = flow->drv_type; - if (err) - return rte_flow_error_set - (error, err, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "cannot read counter"); - qc->hits_set = 1; - qc->bytes_set = 1; - qc->hits = counters[0] - flow->counter->hits; - qc->bytes = counters[1] - flow->counter->bytes; - if (qc->reset) { - flow->counter->hits = counters[0]; - flow->counter->bytes = counters[1]; - } - return 0; - } - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "flow does not have counter"); -#endif - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, - "counters are not available"); + assert(ftype > MLX5_FLOW_TYPE_MIN && ftype < MLX5_FLOW_TYPE_MAX); + fops = flow_get_drv_ops(ftype); + + return fops->query(dev, flow, actions, data, error); } /** - * Query a flows. + * Query a flow. * * @see rte_flow_query() * @see rte_flow_ops */ int -mlx5_flow_query(struct rte_eth_dev *dev __rte_unused, +mlx5_flow_query(struct rte_eth_dev *dev, struct rte_flow *flow, const struct rte_flow_action *actions, void *data, struct rte_flow_error *error) { - int ret = 0; + int ret; - for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { - switch (actions->type) { - case RTE_FLOW_ACTION_TYPE_VOID: - break; - case RTE_FLOW_ACTION_TYPE_COUNT: - ret = mlx5_flow_query_count(flow, data, error); - break; - default: - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ACTION, - actions, - "action not supported"); - } - if (ret < 0) - return ret; - } + ret = flow_drv_query(dev, flow, actions, data, error); + if (ret < 0) + return ret; return 0; } @@ -3511,7 +2496,6 @@ mlx5_fdir_filter_convert(struct rte_eth_dev *dev, .dst_addr = input->flow.ip4_flow.dst_ip, .time_to_live = input->flow.ip4_flow.ttl, .type_of_service = input->flow.ip4_flow.tos, - .next_proto_id = input->flow.ip4_flow.proto, }; attributes->l3_mask.ipv4.hdr = (struct ipv4_hdr){ .src_addr = mask->ipv4_mask.src_ip, @@ -3663,9 +2647,8 @@ mlx5_fdir_filter_add(struct rte_eth_dev *dev, ret = mlx5_fdir_filter_convert(dev, fdir_filter, &attributes); if (ret) return ret; - flow = mlx5_flow_list_create(dev, &priv->flows, &attributes.attr, - attributes.items, attributes.actions, - &error); + flow = flow_list_create(dev, &priv->flows, &attributes.attr, + attributes.items, attributes.actions, &error); if (flow) { DRV_LOG(DEBUG, "port %u FDIR created %p", dev->data->port_id, (void *)flow); diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h new file mode 100644 index 00000000..61299d66 --- /dev/null +++ b/drivers/net/mlx5/mlx5_flow.h @@ -0,0 +1,375 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 
Mellanox Technologies, Ltd + */ + +#ifndef RTE_PMD_MLX5_FLOW_H_ +#define RTE_PMD_MLX5_FLOW_H_ + +#include <netinet/in.h> +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +/* Pattern outer Layer bits. */ +#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0) +#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1) +#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2) +#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3) +#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4) +#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5) + +/* Pattern inner Layer bits. */ +#define MLX5_FLOW_LAYER_INNER_L2 (1u << 6) +#define MLX5_FLOW_LAYER_INNER_L3_IPV4 (1u << 7) +#define MLX5_FLOW_LAYER_INNER_L3_IPV6 (1u << 8) +#define MLX5_FLOW_LAYER_INNER_L4_UDP (1u << 9) +#define MLX5_FLOW_LAYER_INNER_L4_TCP (1u << 10) +#define MLX5_FLOW_LAYER_INNER_VLAN (1u << 11) + +/* Pattern tunnel Layer bits. */ +#define MLX5_FLOW_LAYER_VXLAN (1u << 12) +#define MLX5_FLOW_LAYER_VXLAN_GPE (1u << 13) +#define MLX5_FLOW_LAYER_GRE (1u << 14) +#define MLX5_FLOW_LAYER_MPLS (1u << 15) + +/* General pattern items bits. */ +#define MLX5_FLOW_ITEM_METADATA (1u << 16) + +/* Outer Masks. */ +#define MLX5_FLOW_LAYER_OUTER_L3 \ + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6) +#define MLX5_FLOW_LAYER_OUTER_L4 \ + (MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP) +#define MLX5_FLOW_LAYER_OUTER \ + (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3 | \ + MLX5_FLOW_LAYER_OUTER_L4) + +/* Tunnel Masks. */ +#define MLX5_FLOW_LAYER_TUNNEL \ + (MLX5_FLOW_LAYER_VXLAN | MLX5_FLOW_LAYER_VXLAN_GPE | \ + MLX5_FLOW_LAYER_GRE | MLX5_FLOW_LAYER_MPLS) + +/* Inner Masks. */ +#define MLX5_FLOW_LAYER_INNER_L3 \ + (MLX5_FLOW_LAYER_INNER_L3_IPV4 | MLX5_FLOW_LAYER_INNER_L3_IPV6) +#define MLX5_FLOW_LAYER_INNER_L4 \ + (MLX5_FLOW_LAYER_INNER_L4_UDP | MLX5_FLOW_LAYER_INNER_L4_TCP) +#define MLX5_FLOW_LAYER_INNER \ + (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3 | \ + MLX5_FLOW_LAYER_INNER_L4) + +/* Actions */ +#define MLX5_FLOW_ACTION_DROP (1u << 0) +#define MLX5_FLOW_ACTION_QUEUE (1u << 1) +#define MLX5_FLOW_ACTION_RSS (1u << 2) +#define MLX5_FLOW_ACTION_FLAG (1u << 3) +#define MLX5_FLOW_ACTION_MARK (1u << 4) +#define MLX5_FLOW_ACTION_COUNT (1u << 5) +#define MLX5_FLOW_ACTION_PORT_ID (1u << 6) +#define MLX5_FLOW_ACTION_OF_POP_VLAN (1u << 7) +#define MLX5_FLOW_ACTION_OF_PUSH_VLAN (1u << 8) +#define MLX5_FLOW_ACTION_OF_SET_VLAN_VID (1u << 9) +#define MLX5_FLOW_ACTION_OF_SET_VLAN_PCP (1u << 10) +#define MLX5_FLOW_ACTION_SET_IPV4_SRC (1u << 11) +#define MLX5_FLOW_ACTION_SET_IPV4_DST (1u << 12) +#define MLX5_FLOW_ACTION_SET_IPV6_SRC (1u << 13) +#define MLX5_FLOW_ACTION_SET_IPV6_DST (1u << 14) +#define MLX5_FLOW_ACTION_SET_TP_SRC (1u << 15) +#define MLX5_FLOW_ACTION_SET_TP_DST (1u << 16) +#define MLX5_FLOW_ACTION_JUMP (1u << 17) +#define MLX5_FLOW_ACTION_SET_TTL (1u << 18) +#define MLX5_FLOW_ACTION_DEC_TTL (1u << 19) +#define MLX5_FLOW_ACTION_SET_MAC_SRC (1u << 20) +#define MLX5_FLOW_ACTION_SET_MAC_DST (1u << 21) + +#define MLX5_FLOW_FATE_ACTIONS \ + (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | MLX5_FLOW_ACTION_RSS) + +#ifndef IPPROTO_MPLS +#define IPPROTO_MPLS 137 +#endif + +/* UDP port numbers for VxLAN. 
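Illustrative sketch (not part of the patch): the layer and action bits above are accumulated while a rule is parsed, and the compound masks answer questions such as "is this a tunnel flow" or "does the rule carry a fate action". For a hypothetical VXLAN flow with MARK and QUEUE actions:

	uint64_t item_flags = MLX5_FLOW_LAYER_OUTER_L2 |
			      MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
			      MLX5_FLOW_LAYER_OUTER_L4_UDP |
			      MLX5_FLOW_LAYER_VXLAN |
			      MLX5_FLOW_LAYER_INNER_L2;
	uint64_t action_flags = MLX5_FLOW_ACTION_MARK | MLX5_FLOW_ACTION_QUEUE;
	int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL);     /* 1, VXLAN bit */
	int has_fate = !!(action_flags & MLX5_FLOW_FATE_ACTIONS); /* 1, QUEUE bit */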
*/ +#define MLX5_UDP_PORT_VXLAN 4789 +#define MLX5_UDP_PORT_VXLAN_GPE 4790 + +/* Priority reserved for default flows. */ +#define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1) + +/* + * Number of sub priorities. + * For each kind of pattern matching i.e. L2, L3, L4 to have a correct + * matching on the NIC (firmware dependent) L4 most have the higher priority + * followed by L3 and ending with L2. + */ +#define MLX5_PRIORITY_MAP_L2 2 +#define MLX5_PRIORITY_MAP_L3 1 +#define MLX5_PRIORITY_MAP_L4 0 +#define MLX5_PRIORITY_MAP_MAX 3 + +/* Valid layer type for IPV4 RSS. */ +#define MLX5_IPV4_LAYER_TYPES \ + (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 | \ + ETH_RSS_NONFRAG_IPV4_TCP | ETH_RSS_NONFRAG_IPV4_UDP | \ + ETH_RSS_NONFRAG_IPV4_OTHER) + +/* IBV hash source bits for IPV4. */ +#define MLX5_IPV4_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4) + +/* Valid layer type for IPV6 RSS. */ +#define MLX5_IPV6_LAYER_TYPES \ + (ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 | ETH_RSS_NONFRAG_IPV6_TCP | \ + ETH_RSS_NONFRAG_IPV6_UDP | ETH_RSS_IPV6_EX | ETH_RSS_IPV6_TCP_EX | \ + ETH_RSS_IPV6_UDP_EX | ETH_RSS_NONFRAG_IPV6_OTHER) + +/* IBV hash source bits for IPV6. */ +#define MLX5_IPV6_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6) + +/* Max number of actions per DV flow. */ +#define MLX5_DV_MAX_NUMBER_OF_ACTIONS 8 + +enum mlx5_flow_drv_type { + MLX5_FLOW_TYPE_MIN, + MLX5_FLOW_TYPE_DV, + MLX5_FLOW_TYPE_TCF, + MLX5_FLOW_TYPE_VERBS, + MLX5_FLOW_TYPE_MAX, +}; + +/* Matcher PRM representation */ +struct mlx5_flow_dv_match_params { + size_t size; + /**< Size of match value. Do NOT split size and key! */ + uint32_t buf[MLX5_ST_SZ_DW(fte_match_param)]; + /**< Matcher value. This value is used as the mask or as a key. */ +}; + +#define MLX5_DV_MAX_NUMBER_OF_ACTIONS 8 + +/* Matcher structure. */ +struct mlx5_flow_dv_matcher { + LIST_ENTRY(mlx5_flow_dv_matcher) next; + /* Pointer to the next element. */ + rte_atomic32_t refcnt; /**< Reference counter. */ + void *matcher_object; /**< Pointer to DV matcher */ + uint16_t crc; /**< CRC of key. */ + uint16_t priority; /**< Priority of matcher. */ + uint8_t egress; /**< Egress matcher. */ + struct mlx5_flow_dv_match_params mask; /**< Matcher mask. */ +}; + +/* DV flows structure. */ +struct mlx5_flow_dv { + uint64_t hash_fields; /**< Fields that participate in the hash. */ + struct mlx5_hrxq *hrxq; /**< Hash Rx queues. */ + /* Flow DV api: */ + struct mlx5_flow_dv_matcher *matcher; /**< Cache to matcher. */ + struct mlx5_flow_dv_match_params value; + /**< Holds the value that the packet is compared to. */ + struct ibv_flow *flow; /**< Installed flow. */ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + struct mlx5dv_flow_action_attr actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS]; + /**< Action list. */ +#endif + int actions_n; /**< number of actions. */ +}; + +/** Linux TC flower driver for E-Switch flow. */ +struct mlx5_flow_tcf { + struct nlmsghdr *nlh; + struct tcmsg *tcm; +}; + +/* Verbs specification header. */ +struct ibv_spec_header { + enum ibv_flow_spec_type type; + uint16_t size; +}; + +/** Handles information leading to a drop fate. */ +struct mlx5_flow_verbs { + LIST_ENTRY(mlx5_flow_verbs) next; + unsigned int size; /**< Size of the attribute. */ + struct { + struct ibv_flow_attr *attr; + /**< Pointer to the Specification buffer. */ + uint8_t *specs; /**< Pointer to the specifications. */ + }; + struct ibv_flow *flow; /**< Verbs flow pointer. */ + struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */ + uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. 
*/ +}; + +/** Device flow structure. */ +struct mlx5_flow { + LIST_ENTRY(mlx5_flow) next; + struct rte_flow *flow; /**< Pointer to the main flow. */ + uint64_t layers; + /**< Bit-fields of present layers, see MLX5_FLOW_LAYER_*. */ + union { +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + struct mlx5_flow_dv dv; +#endif + struct mlx5_flow_tcf tcf; + struct mlx5_flow_verbs verbs; + }; +}; + +/* Counters information. */ +struct mlx5_flow_counter { + LIST_ENTRY(mlx5_flow_counter) next; /**< Pointer to the next counter. */ + uint32_t shared:1; /**< Share counter ID with other flow rules. */ + uint32_t ref_cnt:31; /**< Reference counter. */ + uint32_t id; /**< Counter ID. */ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct ibv_counter_set *cs; /**< Holds the counters for the rule. */ +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + struct ibv_counters *cs; /**< Holds the counters for the rule. */ +#endif + uint64_t hits; /**< Number of packets matched by the rule. */ + uint64_t bytes; /**< Number of bytes matched by the rule. */ +}; + +/* Flow structure. */ +struct rte_flow { + TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */ + enum mlx5_flow_drv_type drv_type; /**< Drvier type. */ + struct mlx5_flow_counter *counter; /**< Holds flow counter. */ + struct rte_flow_action_rss rss;/**< RSS context. */ + uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */ + uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */ + LIST_HEAD(dev_flows, mlx5_flow) dev_flows; + /**< Device flows that are part of the flow. */ + uint64_t actions; + /**< Bit-fields of detected actions, see MLX5_FLOW_ACTION_*. */ +}; +typedef int (*mlx5_flow_validate_t)(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +typedef struct mlx5_flow *(*mlx5_flow_prepare_t) + (const struct rte_flow_attr *attr, const struct rte_flow_item items[], + const struct rte_flow_action actions[], uint64_t *item_flags, + uint64_t *action_flags, struct rte_flow_error *error); +typedef int (*mlx5_flow_translate_t)(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error); +typedef int (*mlx5_flow_apply_t)(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error); +typedef void (*mlx5_flow_remove_t)(struct rte_eth_dev *dev, + struct rte_flow *flow); +typedef void (*mlx5_flow_destroy_t)(struct rte_eth_dev *dev, + struct rte_flow *flow); +typedef int (*mlx5_flow_query_t)(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error); +struct mlx5_flow_driver_ops { + mlx5_flow_validate_t validate; + mlx5_flow_prepare_t prepare; + mlx5_flow_translate_t translate; + mlx5_flow_apply_t apply; + mlx5_flow_remove_t remove; + mlx5_flow_destroy_t destroy; + mlx5_flow_query_t query; +}; + +/* mlx5_flow.c */ + +uint64_t mlx5_flow_hashfields_adjust(struct mlx5_flow *dev_flow, int tunnel, + uint64_t layer_types, + uint64_t hash_fields); +uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority, + uint32_t subpriority); +int mlx5_flow_validate_action_count(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_drop(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct 
rte_flow_error *error); +int mlx5_flow_validate_action_flag(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_mark(const struct rte_flow_action *action, + uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_queue(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_action_rss(const struct rte_flow_action *action, + uint64_t action_flags, + struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +int mlx5_flow_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + struct rte_flow_error *error); +int mlx5_flow_item_acceptable(const struct rte_flow_item *item, + const uint8_t *mask, + const uint8_t *nic_mask, + unsigned int size, + struct rte_flow_error *error); +int mlx5_flow_validate_item_eth(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_gre(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_ipv4(const struct rte_flow_item *item, + int64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_mpls(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_tcp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + const struct rte_flow_item_tcp *flow_mask, + struct rte_flow_error *error); +int mlx5_flow_validate_item_udp(const struct rte_flow_item *item, + uint64_t item_flags, + uint8_t target_protocol, + struct rte_flow_error *error); +int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item, + int64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_flow_error *error); +int mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, + uint64_t item_flags, + struct rte_eth_dev *dev, + struct rte_flow_error *error); + +/* mlx5_flow_tcf.c */ + +int mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, + unsigned int ifindex, struct rte_flow_error *error); +struct mlx5_flow_tcf_context *mlx5_flow_tcf_context_create(void); +void mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx); + +#endif /* RTE_PMD_MLX5_FLOW_H_ */ diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c new file mode 100644 index 00000000..8f729f44 --- /dev/null +++ b/drivers/net/mlx5/mlx5_flow_dv.c @@ -0,0 +1,1492 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. 
*/ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_eth_ctrl.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> +#include <rte_ip.h> + +#include "mlx5.h" +#include "mlx5_defs.h" +#include "mlx5_prm.h" +#include "mlx5_glue.h" +#include "mlx5_flow.h" + +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + +/** + * Validate META item. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] item + * Item specification. + * @param[in] attr + * Attributes of flow that includes this item. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_item_meta(struct rte_eth_dev *dev, + const struct rte_flow_item *item, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + const struct rte_flow_item_meta *spec = item->spec; + const struct rte_flow_item_meta *mask = item->mask; + const struct rte_flow_item_meta nic_mask = { + .data = RTE_BE32(UINT32_MAX) + }; + int ret; + uint64_t offloads = dev->data->dev_conf.txmode.offloads; + + if (!(offloads & DEV_TX_OFFLOAD_MATCH_METADATA)) + return rte_flow_error_set(error, EPERM, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, + "match on metadata offload " + "configuration is off for this port"); + if (!spec) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "data cannot be empty"); + if (!spec->data) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + NULL, + "data cannot be zero"); + if (!mask) + mask = &rte_flow_item_meta_mask; + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_meta), + error); + if (ret < 0) + return ret; + if (attr->ingress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + NULL, + "pattern not supported for ingress"); + return 0; +} + +/** + * Verify the @p attributes will be correctly understood by the NIC and store + * them in the @p flow if everything is correct. + * + * @param[in] dev + * Pointer to dev struct. + * @param[in] attributes + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_validate_attributes(struct rte_eth_dev *dev, + const struct rte_flow_attr *attributes, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + uint32_t priority_max = priv->config.flow_prio - 1; + + if (attributes->group) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, + "groups is not supported"); + if (attributes->priority != MLX5_FLOW_PRIO_RSVD && + attributes->priority >= priority_max) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, + "priority out of range"); + if (attributes->transfer) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, + NULL, + "transfer is not supported"); + if (!(attributes->egress ^ attributes->ingress)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR, NULL, + "must specify exactly one of " + "ingress or egress"); + return 0; +} + +/** + * Internal validation function. 
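Illustrative sketch (not part of the patch): flow_dv_validate_item_meta() above only accepts the META item on egress rules, with a non-zero spec and with DEV_TX_OFFLOAD_MATCH_METADATA enabled in the port's txmode.offloads. A hypothetical pattern that passes this validation (the 0xcafe value is a placeholder):

	struct rte_flow_attr attr = { .egress = 1 };
	struct rte_flow_item_meta meta_spec = { .data = RTE_BE32(0xcafe) };
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{
			.type = RTE_FLOW_ITEM_TYPE_META,
			.spec = &meta_spec,
			.mask = &rte_flow_item_meta_mask,
		},
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};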
For validating both actions and items. + * + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +static int +flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + int ret; + uint64_t action_flags = 0; + uint64_t item_flags = 0; + int tunnel = 0; + uint8_t next_protocol = 0xff; + int actions_n = 0; + + if (items == NULL) + return -1; + ret = flow_dv_validate_attributes(dev, attr, error); + if (ret < 0) + return ret; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_validate_item_eth(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + ret = mlx5_flow_validate_item_vlan(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ret = mlx5_flow_validate_item_ipv4(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + items->mask)->hdr.next_proto_id) + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ret = mlx5_flow_validate_item_ipv6(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + ret = mlx5_flow_validate_item_tcp + (items, item_flags, + next_protocol, + &rte_flow_item_tcp_mask, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_validate_item_udp(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? 
MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + case RTE_FLOW_ITEM_TYPE_NVGRE: + ret = mlx5_flow_validate_item_gre(items, item_flags, + next_protocol, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + ret = mlx5_flow_validate_item_vxlan(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + ret = mlx5_flow_validate_item_vxlan_gpe(items, + item_flags, dev, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_META: + ret = flow_dv_validate_item_meta(dev, items, attr, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_ITEM_METADATA; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "item not supported"); + } + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + if (actions_n == MLX5_DV_MAX_NUMBER_OF_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, "too many actions"); + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + ret = mlx5_flow_validate_action_flag(action_flags, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_FLAG; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + ret = mlx5_flow_validate_action_mark(actions, + action_flags, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_MARK; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + ret = mlx5_flow_validate_action_drop(action_flags, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_DROP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + ret = mlx5_flow_validate_action_queue(actions, + action_flags, dev, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + ret = mlx5_flow_validate_action_rss(actions, + action_flags, dev, + attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_RSS; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = mlx5_flow_validate_action_count(dev, attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; + ++actions_n; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + if (!(action_flags & MLX5_FLOW_FATE_ACTIONS) && attr->ingress) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "no fate action is found"); + return 0; +} + +/** + * Internal preparation function. Allocates the DV flow size, + * this size is constant. + * + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] item_flags + * Pointer to bit mask of all items detected. + * @param[out] action_flags + * Pointer to bit mask of all actions detected. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to mlx5_flow object on success, + * otherwise NULL and rte_ernno is set. 
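+ * + * Only the match value size is set here (MLX5_ST_SZ_DB(fte_match_param) + * bytes); pattern items are translated into the value buffer later, so no + * per-item sizing is needed.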
+ */ +static struct mlx5_flow * +flow_dv_prepare(const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + uint64_t *item_flags __rte_unused, + uint64_t *action_flags __rte_unused, + struct rte_flow_error *error) +{ + uint32_t size = sizeof(struct mlx5_flow); + struct mlx5_flow *flow; + + flow = rte_calloc(__func__, 1, size, 0); + if (!flow) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not enough memory to create flow"); + return NULL; + } + flow->dv.value.size = MLX5_ST_SZ_DB(fte_match_param); + return flow; +} + +/** + * Add Ethernet item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_eth(void *matcher, void *key, + const struct rte_flow_item *item, int inner) +{ + const struct rte_flow_item_eth *eth_m = item->mask; + const struct rte_flow_item_eth *eth_v = item->spec; + const struct rte_flow_item_eth nic_mask = { + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .type = RTE_BE16(0xffff), + }; + void *headers_m; + void *headers_v; + char *l24_v; + unsigned int i; + + if (!eth_v) + return; + if (!eth_m) + eth_m = &nic_mask; + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, dmac_47_16), + &eth_m->dst, sizeof(eth_m->dst)); + /* The value must be in the range of the mask. */ + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, dmac_47_16); + for (i = 0; i < sizeof(eth_m->dst); ++i) + l24_v[i] = eth_m->dst.addr_bytes[i] & eth_v->dst.addr_bytes[i]; + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, smac_47_16), + &eth_m->src, sizeof(eth_m->src)); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, smac_47_16); + /* The value must be in the range of the mask. */ + for (i = 0; i < sizeof(eth_m->dst); ++i) + l24_v[i] = eth_m->src.addr_bytes[i] & eth_v->src.addr_bytes[i]; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype, + rte_be_to_cpu_16(eth_m->type)); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, ethertype); + *(uint16_t *)(l24_v) = eth_m->type & eth_v->type; +} + +/** + * Add VLAN item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern.
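+ * + * The VLAN TCI is split into the first_vid (bits 0-11), first_cfi (bit 12) + * and first_prio (bits 13-15) fields of the layer 2-4 match set, and + * cvlan_tag is forced to 1 in both the mask and the value.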
+ */ +static void +flow_dv_translate_item_vlan(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_vlan *vlan_m = item->mask; + const struct rte_flow_item_vlan *vlan_v = item->spec; + const struct rte_flow_item_vlan nic_mask = { + .tci = RTE_BE16(0x0fff), + .inner_type = RTE_BE16(0xffff), + }; + void *headers_m; + void *headers_v; + uint16_t tci_m; + uint16_t tci_v; + + if (!vlan_v) + return; + if (!vlan_m) + vlan_m = &nic_mask; + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + tci_m = rte_be_to_cpu_16(vlan_m->tci); + tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_vid, tci_m); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, tci_v); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_cfi, tci_m >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_cfi, tci_v >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_prio, tci_m >> 13); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, tci_v >> 13); +} + +/** + * Add IPV4 item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_ipv4(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_ipv4 *ipv4_m = item->mask; + const struct rte_flow_item_ipv4 *ipv4_v = item->spec; + const struct rte_flow_item_ipv4 nic_mask = { + .hdr = { + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + .type_of_service = 0xff, + .next_proto_id = 0xff, + }, + }; + void *headers_m; + void *headers_v; + char *l24_m; + char *l24_v; + uint8_t tos; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, 4); + if (!ipv4_v) + return; + if (!ipv4_m) + ipv4_m = &nic_mask; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + *(uint32_t *)l24_m = ipv4_m->hdr.dst_addr; + *(uint32_t *)l24_v = ipv4_m->hdr.dst_addr & ipv4_v->hdr.dst_addr; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + *(uint32_t *)l24_m = ipv4_m->hdr.src_addr; + *(uint32_t *)l24_v = ipv4_m->hdr.src_addr & ipv4_v->hdr.src_addr; + tos = ipv4_m->hdr.type_of_service & ipv4_v->hdr.type_of_service; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ecn, + ipv4_m->hdr.type_of_service); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, tos); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_dscp, + 
ipv4_m->hdr.type_of_service >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, tos >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, + ipv4_m->hdr.next_proto_id); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + ipv4_v->hdr.next_proto_id & ipv4_m->hdr.next_proto_id); +} + +/** + * Add IPV6 item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_ipv6(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_ipv6 *ipv6_m = item->mask; + const struct rte_flow_item_ipv6 *ipv6_v = item->spec; + const struct rte_flow_item_ipv6 nic_mask = { + .hdr = { + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, + .hop_limits = 0xff, + }, + }; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + char *l24_m; + char *l24_v; + uint32_t vtc_m; + uint32_t vtc_v; + int i; + int size; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, 6); + if (!ipv6_v) + return; + if (!ipv6_m) + ipv6_m = &nic_mask; + size = sizeof(ipv6_m->hdr.dst_addr); + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + memcpy(l24_m, ipv6_m->hdr.dst_addr, size); + for (i = 0; i < size; ++i) + l24_v[i] = l24_m[i] & ipv6_v->hdr.dst_addr[i]; + l24_m = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_m, + src_ipv4_src_ipv6.ipv6_layout.ipv6); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6); + memcpy(l24_m, ipv6_m->hdr.src_addr, size); + for (i = 0; i < size; ++i) + l24_v[i] = l24_m[i] & ipv6_v->hdr.src_addr[i]; + /* TOS. */ + vtc_m = rte_be_to_cpu_32(ipv6_m->hdr.vtc_flow); + vtc_v = rte_be_to_cpu_32(ipv6_m->hdr.vtc_flow & ipv6_v->hdr.vtc_flow); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_ecn, vtc_m >> 20); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, vtc_v >> 20); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_dscp, vtc_m >> 22); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, vtc_v >> 22); + /* Label. */ + if (inner) { + MLX5_SET(fte_match_set_misc, misc_m, inner_ipv6_flow_label, + vtc_m); + MLX5_SET(fte_match_set_misc, misc_v, inner_ipv6_flow_label, + vtc_v); + } else { + MLX5_SET(fte_match_set_misc, misc_m, outer_ipv6_flow_label, + vtc_m); + MLX5_SET(fte_match_set_misc, misc_v, outer_ipv6_flow_label, + vtc_v); + } + /* Protocol. */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, + ipv6_m->hdr.proto); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + ipv6_v->hdr.proto & ipv6_m->hdr.proto); +} + +/** + * Add TCP item to matcher and to the value. 
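+ * Matching on TCP also pins ip_protocol to IPPROTO_TCP (mask 0xff) in the + * selected header set, even when no item spec is provided.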
+ * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_tcp(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_tcp *tcp_m = item->mask; + const struct rte_flow_item_tcp *tcp_v = item->spec; + void *headers_m; + void *headers_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_TCP); + if (!tcp_v) + return; + if (!tcp_m) + tcp_m = &rte_flow_item_tcp_mask; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_sport, + rte_be_to_cpu_16(tcp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport, + rte_be_to_cpu_16(tcp_v->hdr.src_port & tcp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, tcp_dport, + rte_be_to_cpu_16(tcp_m->hdr.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport, + rte_be_to_cpu_16(tcp_v->hdr.dst_port & tcp_m->hdr.dst_port)); +} + +/** + * Add UDP item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_udp(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_udp *udp_m = item->mask; + const struct rte_flow_item_udp *udp_v = item->spec; + void *headers_m; + void *headers_v; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP); + if (!udp_v) + return; + if (!udp_m) + udp_m = &rte_flow_item_udp_mask; + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_sport, + rte_be_to_cpu_16(udp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, + rte_be_to_cpu_16(udp_v->hdr.src_port & udp_m->hdr.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, + rte_be_to_cpu_16(udp_m->hdr.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + rte_be_to_cpu_16(udp_v->hdr.dst_port & udp_m->hdr.dst_port)); +} + +/** + * Add GRE item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. 
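+ * + * ip_protocol is pinned to IPPROTO_GRE and the GRE protocol field is + * matched through the misc parameters (gre_protocol).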
+ */ +static void +flow_dv_translate_item_gre(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_gre *gre_m = item->mask; + const struct rte_flow_item_gre *gre_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_GRE); + if (!gre_v) + return; + if (!gre_m) + gre_m = &rte_flow_item_gre_mask; + MLX5_SET(fte_match_set_misc, misc_m, gre_protocol, + rte_be_to_cpu_16(gre_m->protocol)); + MLX5_SET(fte_match_set_misc, misc_v, gre_protocol, + rte_be_to_cpu_16(gre_v->protocol & gre_m->protocol)); +} + +/** + * Add NVGRE item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_nvgre(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_nvgre *nvgre_m = item->mask; + const struct rte_flow_item_nvgre *nvgre_v = item->spec; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + const char *tni_flow_id_m = (const char *)nvgre_m->tni; + const char *tni_flow_id_v = (const char *)nvgre_v->tni; + char *gre_key_m; + char *gre_key_v; + int size; + int i; + + flow_dv_translate_item_gre(matcher, key, item, inner); + if (!nvgre_v) + return; + if (!nvgre_m) + nvgre_m = &rte_flow_item_nvgre_mask; + size = sizeof(nvgre_m->tni) + sizeof(nvgre_m->flow_id); + gre_key_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, gre_key_h); + gre_key_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, gre_key_h); + memcpy(gre_key_m, tni_flow_id_m, size); + for (i = 0; i < size; ++i) + gre_key_v[i] = gre_key_m[i] & tni_flow_id_v[i]; +} + +/** + * Add VXLAN item to matcher and to the value. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_vxlan(void *matcher, void *key, + const struct rte_flow_item *item, + int inner) +{ + const struct rte_flow_item_vxlan *vxlan_m = item->mask; + const struct rte_flow_item_vxlan *vxlan_v = item->spec; + void *headers_m; + void *headers_v; + void *misc_m = MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + char *vni_m; + char *vni_v; + uint16_t dport; + int size; + int i; + + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); + } else { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); + } + dport = item->type == RTE_FLOW_ITEM_TYPE_VXLAN ? 
+ MLX5_UDP_PORT_VXLAN : MLX5_UDP_PORT_VXLAN_GPE; + if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); + } + if (!vxlan_v) + return; + if (!vxlan_m) + vxlan_m = &rte_flow_item_vxlan_mask; + size = sizeof(vxlan_m->vni); + vni_m = MLX5_ADDR_OF(fte_match_set_misc, misc_m, vxlan_vni); + vni_v = MLX5_ADDR_OF(fte_match_set_misc, misc_v, vxlan_vni); + memcpy(vni_m, vxlan_m->vni, size); + for (i = 0; i < size; ++i) + vni_v[i] = vni_m[i] & vxlan_v->vni[i]; +} + +/** + * Add META item to matcher + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in] inner + * Item is inner pattern. + */ +static void +flow_dv_translate_item_meta(void *matcher, void *key, + const struct rte_flow_item *item) +{ + const struct rte_flow_item_meta *meta_m; + const struct rte_flow_item_meta *meta_v; + void *misc2_m = + MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2); + void *misc2_v = + MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2); + + meta_m = (const void *)item->mask; + if (!meta_m) + meta_m = &rte_flow_item_meta_mask; + meta_v = (const void *)item->spec; + if (meta_v) { + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_a, + rte_be_to_cpu_32(meta_m->data)); + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a, + rte_be_to_cpu_32(meta_v->data & meta_m->data)); + } +} + +/** + * Update the matcher and the value based the selected item. + * + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. + * @param[in] inner + * Item is inner pattern. 
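+ * + * Besides filling the matcher mask and value, each item may also raise the + * matcher priority (L2/L3/L4 priority map) and accumulate the Rx hash + * fields later used by the RSS action.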
+ */ +static void +flow_dv_create_item(void *matcher, void *key, + const struct rte_flow_item *item, + struct mlx5_flow *dev_flow, + int inner) +{ + struct mlx5_flow_dv_matcher *tmatcher = matcher; + + switch (item->type) { + case RTE_FLOW_ITEM_TYPE_ETH: + flow_dv_translate_item_eth(tmatcher->mask.buf, key, item, + inner); + tmatcher->priority = MLX5_PRIORITY_MAP_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + flow_dv_translate_item_vlan(tmatcher->mask.buf, key, item, + inner); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + flow_dv_translate_item_ipv4(tmatcher->mask.buf, key, item, + inner); + tmatcher->priority = MLX5_PRIORITY_MAP_L3; + dev_flow->dv.hash_fields |= + mlx5_flow_hashfields_adjust(dev_flow, inner, + MLX5_IPV4_LAYER_TYPES, + MLX5_IPV4_IBV_RX_HASH); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + flow_dv_translate_item_ipv6(tmatcher->mask.buf, key, item, + inner); + tmatcher->priority = MLX5_PRIORITY_MAP_L3; + dev_flow->dv.hash_fields |= + mlx5_flow_hashfields_adjust(dev_flow, inner, + MLX5_IPV6_LAYER_TYPES, + MLX5_IPV6_IBV_RX_HASH); + break; + case RTE_FLOW_ITEM_TYPE_TCP: + flow_dv_translate_item_tcp(tmatcher->mask.buf, key, item, + inner); + tmatcher->priority = MLX5_PRIORITY_MAP_L4; + dev_flow->dv.hash_fields |= + mlx5_flow_hashfields_adjust(dev_flow, inner, + ETH_RSS_TCP, + (IBV_RX_HASH_SRC_PORT_TCP | + IBV_RX_HASH_DST_PORT_TCP)); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + flow_dv_translate_item_udp(tmatcher->mask.buf, key, item, + inner); + tmatcher->priority = MLX5_PRIORITY_MAP_L4; + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust(dev_flow, inner, + ETH_RSS_UDP, + (IBV_RX_HASH_SRC_PORT_UDP | + IBV_RX_HASH_DST_PORT_UDP)); + break; + case RTE_FLOW_ITEM_TYPE_GRE: + flow_dv_translate_item_gre(tmatcher->mask.buf, key, item, + inner); + break; + case RTE_FLOW_ITEM_TYPE_NVGRE: + flow_dv_translate_item_nvgre(tmatcher->mask.buf, key, item, + inner); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + flow_dv_translate_item_vxlan(tmatcher->mask.buf, key, item, + inner); + break; + case RTE_FLOW_ITEM_TYPE_META: + flow_dv_translate_item_meta(tmatcher->mask.buf, key, item); + break; + default: + break; + } +} + +/** + * Store the requested actions in an array. + * + * @param[in] action + * Flow action to translate. + * @param[in, out] dev_flow + * Pointer to the mlx5_flow. 
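+ * + * Queue and RSS only record their parameters here; the destination queue + * action is appended at apply time, once the drop or hash Rx queue is + * available (see flow_dv_apply()).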
+ */ +static void +flow_dv_create_action(const struct rte_flow_action *action, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; + int actions_n = dev_flow->dv.actions_n; + struct rte_flow *flow = dev_flow->flow; + + switch (action->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + dev_flow->dv.actions[actions_n].type = MLX5DV_FLOW_ACTION_TAG; + dev_flow->dv.actions[actions_n].tag_value = + mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT); + actions_n++; + flow->actions |= MLX5_FLOW_ACTION_FLAG; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + dev_flow->dv.actions[actions_n].type = MLX5DV_FLOW_ACTION_TAG; + dev_flow->dv.actions[actions_n].tag_value = + mlx5_flow_mark_set + (((const struct rte_flow_action_mark *) + (action->conf))->id); + flow->actions |= MLX5_FLOW_ACTION_MARK; + actions_n++; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + dev_flow->dv.actions[actions_n].type = MLX5DV_FLOW_ACTION_DROP; + flow->actions |= MLX5_FLOW_ACTION_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + queue = action->conf; + flow->rss.queue_num = 1; + (*flow->queue)[0] = queue->index; + flow->actions |= MLX5_FLOW_ACTION_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + rss = action->conf; + if (flow->queue) + memcpy((*flow->queue), rss->queue, + rss->queue_num * sizeof(uint16_t)); + flow->rss.queue_num = rss->queue_num; + memcpy(flow->key, rss->key, MLX5_RSS_HASH_KEY_LEN); + flow->rss.types = rss->types; + flow->rss.level = rss->level; + /* Added to array only in apply since we need the QP */ + flow->actions |= MLX5_FLOW_ACTION_RSS; + break; + default: + break; + } + dev_flow->dv.actions_n = actions_n; +} + +static uint32_t matcher_zero[MLX5_ST_SZ_DW(fte_match_param)] = { 0 }; + +#define HEADER_IS_ZERO(match_criteria, headers) \ + !(memcmp(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ + matcher_zero, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ + +/** + * Calculate flow matcher enable bitmap. + * + * @param match_criteria + * Pointer to flow matcher criteria. + * + * @return + * Bitmap of enabled fields. + */ +static uint8_t +flow_dv_matcher_enable(uint32_t *match_criteria) +{ + uint8_t match_criteria_enable; + + match_criteria_enable = + (!HEADER_IS_ZERO(match_criteria, outer_headers)) << + MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << + MLX5_MATCH_CRITERIA_ENABLE_MISC_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, inner_headers)) << + MLX5_MATCH_CRITERIA_ENABLE_INNER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) << + MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT; + + return match_criteria_enable; +} + +/** + * Register the flow matcher. + * + * @param dev[in, out] + * Pointer to rte_eth_dev structure. + * @param[in, out] matcher + * Pointer to flow matcher. + * @parm[in, out] dev_flow + * Pointer to the dev_flow. + * @param[out] error + * pointer to error structure. + * + * @return + * 0 on success otherwise -errno and errno is set. + */ +static int +flow_dv_matcher_register(struct rte_eth_dev *dev, + struct mlx5_flow_dv_matcher *matcher, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_dv_matcher *cache_matcher; + struct mlx5dv_flow_matcher_attr dv_attr = { + .type = IBV_FLOW_ATTR_NORMAL, + .match_mask = (void *)&matcher->mask, + }; + + /* Lookup from cache. 
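+ * Matchers are shared across flows: an entry with the same mask CRC, + * priority and direction is reused and only its reference counter is + * incremented.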
*/ + LIST_FOREACH(cache_matcher, &priv->matchers, next) { + if (matcher->crc == cache_matcher->crc && + matcher->priority == cache_matcher->priority && + matcher->egress == cache_matcher->egress && + !memcmp((const void *)matcher->mask.buf, + (const void *)cache_matcher->mask.buf, + cache_matcher->mask.size)) { + DRV_LOG(DEBUG, + "priority %hd use %s matcher %p: refcnt %d++", + cache_matcher->priority, + cache_matcher->egress ? "tx" : "rx", + (void *)cache_matcher, + rte_atomic32_read(&cache_matcher->refcnt)); + rte_atomic32_inc(&cache_matcher->refcnt); + dev_flow->dv.matcher = cache_matcher; + return 0; + } + } + /* Register new matcher. */ + cache_matcher = rte_calloc(__func__, 1, sizeof(*cache_matcher), 0); + if (!cache_matcher) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate matcher memory"); + *cache_matcher = *matcher; + dv_attr.match_criteria_enable = + flow_dv_matcher_enable(cache_matcher->mask.buf); + dv_attr.priority = matcher->priority; + if (matcher->egress) + dv_attr.flags |= IBV_FLOW_ATTR_FLAGS_EGRESS; + cache_matcher->matcher_object = + mlx5_glue->dv_create_flow_matcher(priv->ctx, &dv_attr); + if (!cache_matcher->matcher_object) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "cannot create matcher"); + rte_atomic32_inc(&cache_matcher->refcnt); + LIST_INSERT_HEAD(&priv->matchers, cache_matcher, next); + dev_flow->dv.matcher = cache_matcher; + DRV_LOG(DEBUG, "priority %hd new %s matcher %p: refcnt %d", + cache_matcher->priority, + cache_matcher->egress ? "tx" : "rx", (void *)cache_matcher, + rte_atomic32_read(&cache_matcher->refcnt)); + return 0; +} + + +/** + * Fill the flow with DV spec. + * + * @param[in] dev + * Pointer to rte_eth_dev structure. + * @param[in, out] dev_flow + * Pointer to the sub flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +static int +flow_dv_translate(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[] __rte_unused, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + uint64_t priority = attr->priority; + struct mlx5_flow_dv_matcher matcher = { + .mask = { + .size = sizeof(matcher.mask.buf), + }, + }; + void *match_value = dev_flow->dv.value.buf; + int tunnel = 0; + + if (priority == MLX5_FLOW_PRIO_RSVD) + priority = priv->config.flow_prio - 1; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + tunnel = !!(dev_flow->layers & MLX5_FLOW_LAYER_TUNNEL); + flow_dv_create_item(&matcher, match_value, items, dev_flow, + tunnel); + } + matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf, + matcher.mask.size); + if (priority == MLX5_FLOW_PRIO_RSVD) + priority = priv->config.flow_prio - 1; + matcher.priority = mlx5_flow_adjust_priority(dev, priority, + matcher.priority); + matcher.egress = attr->egress; + if (flow_dv_matcher_register(dev, &matcher, dev_flow, error)) + return -rte_errno; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) + flow_dv_create_action(actions, dev_flow); + return 0; +} + +/** + * Apply the flow to the NIC. + * + * @param[in] dev + * Pointer to the Ethernet device structure. 
+ * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_dv_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_dv *dv; + struct mlx5_flow *dev_flow; + int n; + int err; + + LIST_FOREACH(dev_flow, &flow->dev_flows, next) { + dv = &dev_flow->dv; + n = dv->actions_n; + if (flow->actions & MLX5_FLOW_ACTION_DROP) { + dv->hrxq = mlx5_hrxq_drop_new(dev); + if (!dv->hrxq) { + rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get drop hash queue"); + goto error; + } + dv->actions[n].type = MLX5DV_FLOW_ACTION_DEST_IBV_QP; + dv->actions[n].qp = dv->hrxq->qp; + n++; + } else if (flow->actions & + (MLX5_FLOW_ACTION_QUEUE | MLX5_FLOW_ACTION_RSS)) { + struct mlx5_hrxq *hrxq; + hrxq = mlx5_hrxq_get(dev, flow->key, + MLX5_RSS_HASH_KEY_LEN, + dv->hash_fields, + (*flow->queue), + flow->rss.queue_num); + if (!hrxq) + hrxq = mlx5_hrxq_new + (dev, flow->key, MLX5_RSS_HASH_KEY_LEN, + dv->hash_fields, (*flow->queue), + flow->rss.queue_num, + !!(dev_flow->layers & + MLX5_FLOW_LAYER_TUNNEL)); + if (!hrxq) { + rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get hash queue"); + goto error; + } + dv->hrxq = hrxq; + dv->actions[n].type = MLX5DV_FLOW_ACTION_DEST_IBV_QP; + dv->actions[n].qp = hrxq->qp; + n++; + } + dv->flow = + mlx5_glue->dv_create_flow(dv->matcher->matcher_object, + (void *)&dv->value, n, + dv->actions); + if (!dv->flow) { + rte_flow_error_set(error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "hardware refuses to create flow"); + goto error; + } + } + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + LIST_FOREACH(dev_flow, &flow->dev_flows, next) { + struct mlx5_flow_dv *dv = &dev_flow->dv; + if (dv->hrxq) { + if (flow->actions & MLX5_FLOW_ACTION_DROP) + mlx5_hrxq_drop_release(dev); + else + mlx5_hrxq_release(dev, dv->hrxq); + dv->hrxq = NULL; + } + } + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Release the flow matcher. + * + * @param dev + * Pointer to Ethernet device. + * @param flow + * Pointer to mlx5_flow. + * + * @return + * 1 while a reference on it exists, 0 when freed. + */ +static int +flow_dv_matcher_release(struct rte_eth_dev *dev, + struct mlx5_flow *flow) +{ + struct mlx5_flow_dv_matcher *matcher = flow->dv.matcher; + + assert(matcher->matcher_object); + DRV_LOG(DEBUG, "port %u matcher %p: refcnt %d--", + dev->data->port_id, (void *)matcher, + rte_atomic32_read(&matcher->refcnt)); + if (rte_atomic32_dec_and_test(&matcher->refcnt)) { + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (matcher->matcher_object)); + LIST_REMOVE(matcher, next); + rte_free(matcher); + DRV_LOG(DEBUG, "port %u matcher %p: removed", + dev->data->port_id, (void *)matcher); + return 0; + } + return 1; +} + +/** + * Remove the flow from the NIC but keeps it in memory. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to flow structure. 
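+ * + * The DV flow handles and Rx queue references are released while the + * rte_flow and its mlx5_flow list stay allocated, so the rule can be + * re-applied later; flow_dv_destroy() below also frees the memory.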
+ */ +static void +flow_dv_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow_dv *dv; + struct mlx5_flow *dev_flow; + + if (!flow) + return; + LIST_FOREACH(dev_flow, &flow->dev_flows, next) { + dv = &dev_flow->dv; + if (dv->flow) { + claim_zero(mlx5_glue->destroy_flow(dv->flow)); + dv->flow = NULL; + } + if (dv->hrxq) { + if (flow->actions & MLX5_FLOW_ACTION_DROP) + mlx5_hrxq_drop_release(dev); + else + mlx5_hrxq_release(dev, dv->hrxq); + dv->hrxq = NULL; + } + } + if (flow->counter) + flow->counter = NULL; +} + +/** + * Remove the flow from the NIC and the memory. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +flow_dv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow *dev_flow; + + if (!flow) + return; + flow_dv_remove(dev, flow); + while (!LIST_EMPTY(&flow->dev_flows)) { + dev_flow = LIST_FIRST(&flow->dev_flows); + LIST_REMOVE(dev_flow, next); + if (dev_flow->dv.matcher) + flow_dv_matcher_release(dev, dev_flow); + rte_free(dev_flow); + } +} + +/** + * Query a flow. + * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_dv_query(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow __rte_unused, + const struct rte_flow_action *actions __rte_unused, + void *data __rte_unused, + struct rte_flow_error *error __rte_unused) +{ + rte_errno = ENOTSUP; + return -rte_errno; +} + + +const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops = { + .validate = flow_dv_validate, + .prepare = flow_dv_prepare, + .translate = flow_dv_translate, + .apply = flow_dv_apply, + .remove = flow_dv_remove, + .destroy = flow_dv_destroy, + .query = flow_dv_query, +}; + +#endif /* HAVE_IBV_FLOW_DV_SUPPORT */ diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c new file mode 100644 index 00000000..719fb106 --- /dev/null +++ b/drivers/net/mlx5/mlx5_flow_tcf.c @@ -0,0 +1,2913 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 6WIND S.A. 
+ * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <assert.h> +#include <errno.h> +#include <libmnl/libmnl.h> +#include <linux/gen_stats.h> +#include <linux/if_ether.h> +#include <linux/netlink.h> +#include <linux/pkt_cls.h> +#include <linux/pkt_sched.h> +#include <linux/rtnetlink.h> +#include <linux/tc_act/tc_gact.h> +#include <linux/tc_act/tc_mirred.h> +#include <netinet/in.h> +#include <stdalign.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <sys/socket.h> + +#include <rte_byteorder.h> +#include <rte_errno.h> +#include <rte_ether.h> +#include <rte_flow.h> +#include <rte_malloc.h> +#include <rte_common.h> + +#include "mlx5.h" +#include "mlx5_flow.h" +#include "mlx5_autoconf.h" + +#ifdef HAVE_TC_ACT_VLAN + +#include <linux/tc_act/tc_vlan.h> + +#else /* HAVE_TC_ACT_VLAN */ + +#define TCA_VLAN_ACT_POP 1 +#define TCA_VLAN_ACT_PUSH 2 +#define TCA_VLAN_ACT_MODIFY 3 +#define TCA_VLAN_PARMS 2 +#define TCA_VLAN_PUSH_VLAN_ID 3 +#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4 +#define TCA_VLAN_PAD 5 +#define TCA_VLAN_PUSH_VLAN_PRIORITY 6 + +struct tc_vlan { + tc_gen; + int v_action; +}; + +#endif /* HAVE_TC_ACT_VLAN */ + +#ifdef HAVE_TC_ACT_PEDIT + +#include <linux/tc_act/tc_pedit.h> + +#else /* HAVE_TC_ACT_VLAN */ + +enum { + TCA_PEDIT_UNSPEC, + TCA_PEDIT_TM, + TCA_PEDIT_PARMS, + TCA_PEDIT_PAD, + TCA_PEDIT_PARMS_EX, + TCA_PEDIT_KEYS_EX, + TCA_PEDIT_KEY_EX, + __TCA_PEDIT_MAX +}; + +enum { + TCA_PEDIT_KEY_EX_HTYPE = 1, + TCA_PEDIT_KEY_EX_CMD = 2, + __TCA_PEDIT_KEY_EX_MAX +}; + +enum pedit_header_type { + TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0, + TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1, + TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2, + TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3, + TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4, + TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5, + __PEDIT_HDR_TYPE_MAX, +}; + +enum pedit_cmd { + TCA_PEDIT_KEY_EX_CMD_SET = 0, + TCA_PEDIT_KEY_EX_CMD_ADD = 1, + __PEDIT_CMD_MAX, +}; + +struct tc_pedit_key { + __u32 mask; /* AND */ + __u32 val; /*XOR */ + __u32 off; /*offset */ + __u32 at; + __u32 offmask; + __u32 shift; +}; + +__extension__ +struct tc_pedit_sel { + tc_gen; + unsigned char nkeys; + unsigned char flags; + struct tc_pedit_key keys[0]; +}; + +#endif /* HAVE_TC_ACT_VLAN */ + +/* Normally found in linux/netlink.h. */ +#ifndef NETLINK_CAP_ACK +#define NETLINK_CAP_ACK 10 +#endif + +/* Normally found in linux/pkt_sched.h. */ +#ifndef TC_H_MIN_INGRESS +#define TC_H_MIN_INGRESS 0xfff2u +#endif + +/* Normally found in linux/pkt_cls.h. 
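+ * The definitions below provide fallback values for kernels whose headers + * predate these TC flower attributes; availability of the real definitions + * is detected at build time through the HAVE_* macros from mlx5_autoconf.h.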
*/ +#ifndef TCA_CLS_FLAGS_SKIP_SW +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) +#endif +#ifndef HAVE_TCA_CHAIN +#define TCA_CHAIN 11 +#endif +#ifndef HAVE_TCA_FLOWER_ACT +#define TCA_FLOWER_ACT 3 +#endif +#ifndef HAVE_TCA_FLOWER_FLAGS +#define TCA_FLOWER_FLAGS 22 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE +#define TCA_FLOWER_KEY_ETH_TYPE 8 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST +#define TCA_FLOWER_KEY_ETH_DST 4 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK +#define TCA_FLOWER_KEY_ETH_DST_MASK 5 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC +#define TCA_FLOWER_KEY_ETH_SRC 6 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK +#define TCA_FLOWER_KEY_ETH_SRC_MASK 7 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO +#define TCA_FLOWER_KEY_IP_PROTO 9 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC +#define TCA_FLOWER_KEY_IPV4_SRC 10 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK +#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST +#define TCA_FLOWER_KEY_IPV4_DST 12 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK +#define TCA_FLOWER_KEY_IPV4_DST_MASK 13 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC +#define TCA_FLOWER_KEY_IPV6_SRC 14 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK +#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST +#define TCA_FLOWER_KEY_IPV6_DST 16 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK +#define TCA_FLOWER_KEY_IPV6_DST_MASK 17 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC +#define TCA_FLOWER_KEY_TCP_SRC 18 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK +#define TCA_FLOWER_KEY_TCP_SRC_MASK 35 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST +#define TCA_FLOWER_KEY_TCP_DST 19 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK +#define TCA_FLOWER_KEY_TCP_DST_MASK 36 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC +#define TCA_FLOWER_KEY_UDP_SRC 20 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK +#define TCA_FLOWER_KEY_UDP_SRC_MASK 37 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST +#define TCA_FLOWER_KEY_UDP_DST 21 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK +#define TCA_FLOWER_KEY_UDP_DST_MASK 38 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID +#define TCA_FLOWER_KEY_VLAN_ID 23 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO +#define TCA_FLOWER_KEY_VLAN_PRIO 24 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE +#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS +#define TCA_FLOWER_KEY_TCP_FLAGS 71 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK +#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72 +#endif +#ifndef HAVE_TC_ACT_GOTO_CHAIN +#define TC_ACT_GOTO_CHAIN 0x20000000 +#endif + +#ifndef IPV6_ADDR_LEN +#define IPV6_ADDR_LEN 16 +#endif + +#ifndef IPV4_ADDR_LEN +#define IPV4_ADDR_LEN 4 +#endif + +#ifndef TP_PORT_LEN +#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */ +#endif + +#ifndef TTL_LEN +#define TTL_LEN 1 +#endif + +#ifndef TCA_ACT_MAX_PRIO +#define TCA_ACT_MAX_PRIO 32 +#endif + +/** + * Structure for holding netlink context. + * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE. + * Using this (8KB) buffer size ensures that netlink messages will never be + * truncated. + */ +struct mlx5_flow_tcf_context { + struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */ + uint32_t seq; /* Message sequence number. */ + uint32_t buf_size; /* Message buffer size. */ + uint8_t *buf; /* Message buffer. */ +}; + +/** Structure used when extracting the values of a flow counters + * from a netlink message. 
+ */ +struct flow_tcf_stats_basic { + bool valid; + struct gnet_stats_basic counters; +}; + +/** Empty masks for known item types. */ +static const union { + struct rte_flow_item_port_id port_id; + struct rte_flow_item_eth eth; + struct rte_flow_item_vlan vlan; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_udp udp; +} flow_tcf_mask_empty; + +/** Supported masks for known item types. */ +static const struct { + struct rte_flow_item_port_id port_id; + struct rte_flow_item_eth eth; + struct rte_flow_item_vlan vlan; + struct rte_flow_item_ipv4 ipv4; + struct rte_flow_item_ipv6 ipv6; + struct rte_flow_item_tcp tcp; + struct rte_flow_item_udp udp; +} flow_tcf_mask_supported = { + .port_id = { + .id = 0xffffffff, + }, + .eth = { + .type = RTE_BE16(0xffff), + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + .vlan = { + /* PCP and VID only, no DEI. */ + .tci = RTE_BE16(0xefff), + .inner_type = RTE_BE16(0xffff), + }, + .ipv4.hdr = { + .next_proto_id = 0xff, + .src_addr = RTE_BE32(0xffffffff), + .dst_addr = RTE_BE32(0xffffffff), + }, + .ipv6.hdr = { + .proto = 0xff, + .src_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .dst_addr = + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + }, + .tcp.hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + .tcp_flags = 0xff, + }, + .udp.hdr = { + .src_port = RTE_BE16(0xffff), + .dst_port = RTE_BE16(0xffff), + }, +}; + +#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr)) +#define SZ_NLATTR_NEST SZ_NLATTR_HDR +#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len)) +#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ)) +#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1) + +#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2) + +/** DPDK port to network interface index (ifindex) conversion. */ +struct flow_tcf_ptoi { + uint16_t port_id; /**< DPDK port ID. */ + unsigned int ifindex; /**< Network interface index. */ +}; + +/* Due to a limitation on driver/FW. */ +#define MLX5_TCF_GROUP_ID_MAX 3 +#define MLX5_TCF_GROUP_PRIORITY_MAX 14 + +#define MLX5_TCF_FATE_ACTIONS \ + (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \ + MLX5_FLOW_ACTION_JUMP) + +#define MLX5_TCF_VLAN_ACTIONS \ + (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \ + MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP) + +#define MLX5_TCF_PEDIT_ACTIONS \ + (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \ + MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \ + MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \ + MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \ + MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST) + +#define MLX5_TCF_CONFIG_ACTIONS \ + (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \ + MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \ + MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \ + (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL)) + +#define MAX_PEDIT_KEYS 128 +#define SZ_PEDIT_KEY_VAL 4 + +#define NUM_OF_PEDIT_KEYS(sz) \ + (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 
1 : 0)) + +struct pedit_key_ex { + enum pedit_header_type htype; + enum pedit_cmd cmd; +}; + +struct pedit_parser { + struct tc_pedit_sel sel; + struct tc_pedit_key keys[MAX_PEDIT_KEYS]; + struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS]; +}; + +/** + * Create space for using the implicitly created TC flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * + * @return + * A pointer to the counter data structure, NULL otherwise and + * rte_errno is set. + */ +static struct mlx5_flow_counter * +flow_tcf_counter_new(void) +{ + struct mlx5_flow_counter *cnt; + + /* + * eswitch counter cannot be shared and its id is unknown. + * currently returning all with id 0. + * in the future maybe better to switch to unique numbers. + */ + struct mlx5_flow_counter tmpl = { + .ref_cnt = 1, + }; + cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0); + if (!cnt) { + rte_errno = ENOMEM; + return NULL; + } + *cnt = tmpl; + /* Implicit counter, do not add to list. */ + return cnt; +} + +/** + * Set pedit key of MAC address + * + * @param[in] actions + * pointer to action specification + * @param[in,out] p_parser + * pointer to pedit_parser + */ +static void +flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions, + struct pedit_parser *p_parser) +{ + int idx = p_parser->sel.nkeys; + uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? + offsetof(struct ether_hdr, s_addr) : + offsetof(struct ether_hdr, d_addr); + const struct rte_flow_action_set_mac *conf = + (const struct rte_flow_action_set_mac *)actions->conf; + + p_parser->keys[idx].off = off; + p_parser->keys[idx].mask = ~UINT32_MAX; + p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH; + p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; + memcpy(&p_parser->keys[idx].val, + conf->mac_addr, SZ_PEDIT_KEY_VAL); + idx++; + p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL; + p_parser->keys[idx].mask = 0xFFFF0000; + p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH; + p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; + memcpy(&p_parser->keys[idx].val, + conf->mac_addr + SZ_PEDIT_KEY_VAL, + ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL); + p_parser->sel.nkeys = (++idx); +} + +/** + * Set pedit key of decrease/set ttl + * + * @param[in] actions + * pointer to action specification + * @param[in,out] p_parser + * pointer to pedit_parser + * @param[in] item_flags + * flags of all items presented + */ +static void +flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions, + struct pedit_parser *p_parser, + uint64_t item_flags) +{ + int idx = p_parser->sel.nkeys; + + p_parser->keys[idx].mask = 0xFFFFFF00; + if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) { + p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4; + p_parser->keys[idx].off = + offsetof(struct ipv4_hdr, time_to_live); + } + if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) { + p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6; + p_parser->keys[idx].off = + offsetof(struct ipv6_hdr, hop_limits); + } + if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) { + p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD; + p_parser->keys[idx].val = 0x000000FF; + } else { + p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; + p_parser->keys[idx].val = + (__u32)((const struct rte_flow_action_set_ttl *) + actions->conf)->ttl_value; + } + p_parser->sel.nkeys = (++idx); +} + +/** + * Set pedit key of transport (TCP/UDP) port value + * + * @param[in] actions + * pointer to action specification + * @param[in,out] p_parser + * 
pointer to pedit_parser + * @param[in] item_flags + * flags of all items presented + */ +static void +flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions, + struct pedit_parser *p_parser, + uint64_t item_flags) +{ + int idx = p_parser->sel.nkeys; + + if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP) + p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP; + if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP) + p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP; + p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; + /* offset of src/dst port is same for TCP and UDP */ + p_parser->keys[idx].off = + actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ? + offsetof(struct tcp_hdr, src_port) : + offsetof(struct tcp_hdr, dst_port); + p_parser->keys[idx].mask = 0xFFFF0000; + p_parser->keys[idx].val = + (__u32)((const struct rte_flow_action_set_tp *) + actions->conf)->port; + p_parser->sel.nkeys = (++idx); +} + +/** + * Set pedit key of ipv6 address + * + * @param[in] actions + * pointer to action specification + * @param[in,out] p_parser + * pointer to pedit_parser + */ +static void +flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions, + struct pedit_parser *p_parser) +{ + int idx = p_parser->sel.nkeys; + int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN); + int off_base = + actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? + offsetof(struct ipv6_hdr, src_addr) : + offsetof(struct ipv6_hdr, dst_addr); + const struct rte_flow_action_set_ipv6 *conf = + (const struct rte_flow_action_set_ipv6 *)actions->conf; + + for (int i = 0; i < keys; i++, idx++) { + p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6; + p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; + p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL; + p_parser->keys[idx].mask = ~UINT32_MAX; + memcpy(&p_parser->keys[idx].val, + conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL, + SZ_PEDIT_KEY_VAL); + } + p_parser->sel.nkeys += keys; +} + +/** + * Set pedit key of ipv4 address + * + * @param[in] actions + * pointer to action specification + * @param[in,out] p_parser + * pointer to pedit_parser + */ +static void +flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions, + struct pedit_parser *p_parser) +{ + int idx = p_parser->sel.nkeys; + + p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4; + p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; + p_parser->keys[idx].off = + actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? + offsetof(struct ipv4_hdr, src_addr) : + offsetof(struct ipv4_hdr, dst_addr); + p_parser->keys[idx].mask = ~UINT32_MAX; + p_parser->keys[idx].val = + ((const struct rte_flow_action_set_ipv4 *) + actions->conf)->ipv4_addr; + p_parser->sel.nkeys = (++idx); +} + +/** + * Create the pedit's na attribute in netlink message + * on pre-allocate message buffer + * + * @param[in,out] nl + * pointer to pre-allocated netlink message buffer + * @param[in,out] actions + * pointer to pointer of actions specification. 
+ * @param[in,out] action_flags + * pointer to actions flags + * @param[in] item_flags + * flags of all item presented + */ +static void +flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl, + const struct rte_flow_action **actions, + uint64_t item_flags) +{ + struct pedit_parser p_parser; + struct nlattr *na_act_options; + struct nlattr *na_pedit_keys; + + memset(&p_parser, 0, sizeof(p_parser)); + mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit"); + na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS); + /* all modify header actions should be in one tc-pedit action */ + for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) { + switch ((*actions)->type) { + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser); + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser); + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + flow_tcf_pedit_key_set_tp_port(*actions, + &p_parser, item_flags); + break; + case RTE_FLOW_ACTION_TYPE_SET_TTL: + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + flow_tcf_pedit_key_set_dec_ttl(*actions, + &p_parser, item_flags); + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + flow_tcf_pedit_key_set_mac(*actions, &p_parser); + break; + default: + goto pedit_mnl_msg_done; + } + } +pedit_mnl_msg_done: + p_parser.sel.action = TC_ACT_PIPE; + mnl_attr_put(nl, TCA_PEDIT_PARMS_EX, + sizeof(p_parser.sel) + + p_parser.sel.nkeys * sizeof(struct tc_pedit_key), + &p_parser); + na_pedit_keys = + mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED); + for (int i = 0; i < p_parser.sel.nkeys; i++) { + struct nlattr *na_pedit_key = + mnl_attr_nest_start(nl, + TCA_PEDIT_KEY_EX | NLA_F_NESTED); + mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE, + p_parser.keys_ex[i].htype); + mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD, + p_parser.keys_ex[i].cmd); + mnl_attr_nest_end(nl, na_pedit_key); + } + mnl_attr_nest_end(nl, na_pedit_keys); + mnl_attr_nest_end(nl, na_act_options); + (*actions)--; +} + +/** + * Calculate max memory size of one TC-pedit actions. + * One TC-pedit action can contain set of keys each defining + * a rewrite element (rte_flow action) + * + * @param[in,out] actions + * actions specification. + * @param[in,out] action_flags + * actions flags + * @param[in,out] size + * accumulated size + * @return + * Max memory size of one TC-pedit action + */ +static int +flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions, + uint64_t *action_flags) +{ + int pedit_size = 0; + int keys = 0; + uint64_t flags = 0; + + pedit_size += SZ_NLATTR_NEST + /* na_act_index. */ + SZ_NLATTR_STRZ_OF("pedit") + + SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. 
*/ + for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) { + switch ((*actions)->type) { + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN); + flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN); + flags |= MLX5_FLOW_ACTION_SET_IPV4_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN); + flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN); + flags |= MLX5_FLOW_ACTION_SET_IPV6_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + /* TCP is as same as UDP */ + keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN); + flags |= MLX5_FLOW_ACTION_SET_TP_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + /* TCP is as same as UDP */ + keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN); + flags |= MLX5_FLOW_ACTION_SET_TP_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_TTL: + keys += NUM_OF_PEDIT_KEYS(TTL_LEN); + flags |= MLX5_FLOW_ACTION_SET_TTL; + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + keys += NUM_OF_PEDIT_KEYS(TTL_LEN); + flags |= MLX5_FLOW_ACTION_DEC_TTL; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN); + flags |= MLX5_FLOW_ACTION_SET_MAC_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN); + flags |= MLX5_FLOW_ACTION_SET_MAC_DST; + break; + default: + goto get_pedit_action_size_done; + } + } +get_pedit_action_size_done: + /* TCA_PEDIT_PARAMS_EX */ + pedit_size += + SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) + + keys * sizeof(struct tc_pedit_key)); + pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */ + pedit_size += keys * + /* TCA_PEDIT_KEY_EX + HTYPE + CMD */ + (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) + + SZ_NLATTR_DATA_OF(2)); + (*action_flags) |= flags; + (*actions)--; + return pedit_size; +} + +/** + * Retrieve mask for pattern item. + * + * This function does basic sanity checks on a pattern item in order to + * return the most appropriate mask for it. + * + * @param[in] item + * Item specification. + * @param[in] mask_default + * Default mask for pattern item as specified by the flow API. + * @param[in] mask_supported + * Mask fields supported by the implementation. + * @param[in] mask_empty + * Empty mask to return when there is no specification. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * Either @p item->mask or one of the mask parameters on success, NULL + * otherwise and rte_errno is set. + */ +static const void * +flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default, + const void *mask_supported, const void *mask_empty, + size_t mask_size, struct rte_flow_error *error) +{ + const uint8_t *mask; + size_t i; + + /* item->last and item->mask cannot exist without item->spec. */ + if (!item->spec && (item->mask || item->last)) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "\"mask\" or \"last\" field provided without" + " a corresponding \"spec\""); + return NULL; + } + /* No spec, no mask, no problem. */ + if (!item->spec) + return mask_empty; + mask = item->mask ? item->mask : mask_default; + assert(mask); + /* + * Single-pass check to make sure that: + * - Mask is supported, no bits are set outside mask_supported. + * - Both item->spec and item->last are included in mask. 
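+	 *   E.g. with a full 0xffff TCP port mask, spec.hdr.dst_port = 80 and last.hdr.dst_port = 443 differ under the mask, so the range is rejected as not comprised in "mask".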
+ */ + for (i = 0; i != mask_size; ++i) { + if (!mask[i]) + continue; + if ((mask[i] | ((const uint8_t *)mask_supported)[i]) != + ((const uint8_t *)mask_supported)[i]) { + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "unsupported field found" + " in \"mask\""); + return NULL; + } + if (item->last && + (((const uint8_t *)item->spec)[i] & mask[i]) != + (((const uint8_t *)item->last)[i] & mask[i])) { + rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM_LAST, + item->last, + "range between \"spec\" and \"last\"" + " not comprised in \"mask\""); + return NULL; + } + } + return mask; +} + +/** + * Build a conversion table between port ID and ifindex. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[out] ptoi + * Pointer to ptoi table. + * @param[in] len + * Size of ptoi table provided. + * + * @return + * Size of ptoi table filled. + */ +static unsigned int +flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi, + unsigned int len) +{ + unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0); + uint16_t port_id[n + 1]; + unsigned int i; + unsigned int own = 0; + + /* At least one port is needed when no switch domain is present. */ + if (!n) { + n = 1; + port_id[0] = dev->data->port_id; + } else { + n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n); + } + if (n > len) + return 0; + for (i = 0; i != n; ++i) { + struct rte_eth_dev_info dev_info; + + rte_eth_dev_info_get(port_id[i], &dev_info); + if (port_id[i] == dev->data->port_id) + own = i; + ptoi[i].port_id = port_id[i]; + ptoi[i].ifindex = dev_info.if_index; + } + /* Ensure first entry of ptoi[] is the current device. */ + if (own) { + ptoi[n] = ptoi[0]; + ptoi[0] = ptoi[own]; + ptoi[own] = ptoi[n]; + } + /* An entry with zero ifindex terminates ptoi[]. */ + ptoi[n].port_id = 0; + ptoi[n].ifindex = 0; + return n; +} + +/** + * Verify the @p attr will be correctly understood by the E-switch. + * + * @param[in] attr + * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_tcf_validate_attributes(const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + /* + * Supported attributes: groups, some priorities and ingress only. + * group is supported only if kernel supports chain. Don't care about + * transfer as it is the caller's problem. + */ + if (attr->group > MLX5_TCF_GROUP_ID_MAX) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr, + "group ID larger than " + RTE_STR(MLX5_TCF_GROUP_ID_MAX) + " isn't supported"); + else if (attr->group > 0 && + attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + attr, + "lowest priority level is " + RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX) + " when group is configured"); + else if (attr->priority > 0xfffe) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + attr, + "lowest priority level is 0xfffe"); + if (!attr->ingress) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + attr, "only ingress is supported"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + attr, "egress is not supported"); + return 0; +} + +/** + * Validate flow for E-Switch. + * + * @param[in] priv + * Pointer to the priv structure. 
+ * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +static int +flow_tcf_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + union { + const struct rte_flow_item_port_id *port_id; + const struct rte_flow_item_eth *eth; + const struct rte_flow_item_vlan *vlan; + const struct rte_flow_item_ipv4 *ipv4; + const struct rte_flow_item_ipv6 *ipv6; + const struct rte_flow_item_tcp *tcp; + const struct rte_flow_item_udp *udp; + } spec, mask; + union { + const struct rte_flow_action_port_id *port_id; + const struct rte_flow_action_jump *jump; + const struct rte_flow_action_of_push_vlan *of_push_vlan; + const struct rte_flow_action_of_set_vlan_vid * + of_set_vlan_vid; + const struct rte_flow_action_of_set_vlan_pcp * + of_set_vlan_pcp; + const struct rte_flow_action_set_ipv4 *set_ipv4; + const struct rte_flow_action_set_ipv6 *set_ipv6; + } conf; + uint64_t item_flags = 0; + uint64_t action_flags = 0; + uint8_t next_protocol = -1; + unsigned int tcm_ifindex = 0; + uint8_t pedit_validated = 0; + struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)]; + struct rte_eth_dev *port_id_dev = NULL; + bool in_port_id_set; + int ret; + + claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi, + PTOI_TABLE_SZ_MAX(dev))); + ret = flow_tcf_validate_attributes(attr, error); + if (ret < 0) + return ret; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + unsigned int i; + + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_PORT_ID: + mask.port_id = flow_tcf_item_mask + (items, &rte_flow_item_port_id_mask, + &flow_tcf_mask_supported.port_id, + &flow_tcf_mask_empty.port_id, + sizeof(flow_tcf_mask_supported.port_id), + error); + if (!mask.port_id) + return -rte_errno; + if (mask.port_id == &flow_tcf_mask_empty.port_id) { + in_port_id_set = 1; + break; + } + spec.port_id = items->spec; + if (mask.port_id->id && mask.port_id->id != 0xffffffff) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.port_id, + "no support for partial mask on" + " \"id\" field"); + if (!mask.port_id->id) + i = 0; + else + for (i = 0; ptoi[i].ifindex; ++i) + if (ptoi[i].port_id == spec.port_id->id) + break; + if (!ptoi[i].ifindex) + return rte_flow_error_set + (error, ENODEV, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + spec.port_id, + "missing data to convert port ID to" + " ifindex"); + if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + spec.port_id, + "cannot match traffic for" + " several port IDs through" + " a single flow rule"); + tcm_ifindex = ptoi[i].ifindex; + in_port_id_set = 1; + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_validate_item_eth(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L2; + /* TODO: + * Redundant check due to different supported mask. + * Same for the rest of items. 
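+			 * (flow_tcf_item_mask() below re-checks the item against the narrower flow_tcf_mask_supported set after the generic validation above.)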
+ */ + mask.eth = flow_tcf_item_mask + (items, &rte_flow_item_eth_mask, + &flow_tcf_mask_supported.eth, + &flow_tcf_mask_empty.eth, + sizeof(flow_tcf_mask_supported.eth), + error); + if (!mask.eth) + return -rte_errno; + if (mask.eth->type && mask.eth->type != + RTE_BE16(0xffff)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.eth, + "no support for partial mask on" + " \"type\" field"); + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + ret = mlx5_flow_validate_item_vlan(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN; + mask.vlan = flow_tcf_item_mask + (items, &rte_flow_item_vlan_mask, + &flow_tcf_mask_supported.vlan, + &flow_tcf_mask_empty.vlan, + sizeof(flow_tcf_mask_supported.vlan), + error); + if (!mask.vlan) + return -rte_errno; + if ((mask.vlan->tci & RTE_BE16(0xe000) && + (mask.vlan->tci & RTE_BE16(0xe000)) != + RTE_BE16(0xe000)) || + (mask.vlan->tci & RTE_BE16(0x0fff) && + (mask.vlan->tci & RTE_BE16(0x0fff)) != + RTE_BE16(0x0fff)) || + (mask.vlan->inner_type && + mask.vlan->inner_type != RTE_BE16(0xffff))) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.vlan, + "no support for partial masks on" + " \"tci\" (PCP and VID parts) and" + " \"inner_type\" fields"); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ret = mlx5_flow_validate_item_ipv4(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + mask.ipv4 = flow_tcf_item_mask + (items, &rte_flow_item_ipv4_mask, + &flow_tcf_mask_supported.ipv4, + &flow_tcf_mask_empty.ipv4, + sizeof(flow_tcf_mask_supported.ipv4), + error); + if (!mask.ipv4) + return -rte_errno; + if (mask.ipv4->hdr.next_proto_id && + mask.ipv4->hdr.next_proto_id != 0xff) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.ipv4, + "no support for partial mask on" + " \"hdr.next_proto_id\" field"); + else if (mask.ipv4->hdr.next_proto_id) + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ret = mlx5_flow_validate_item_ipv6(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + mask.ipv6 = flow_tcf_item_mask + (items, &rte_flow_item_ipv6_mask, + &flow_tcf_mask_supported.ipv6, + &flow_tcf_mask_empty.ipv6, + sizeof(flow_tcf_mask_supported.ipv6), + error); + if (!mask.ipv6) + return -rte_errno; + if (mask.ipv6->hdr.proto && + mask.ipv6->hdr.proto != 0xff) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, + mask.ipv6, + "no support for partial mask on" + " \"hdr.proto\" field"); + else if (mask.ipv6->hdr.proto) + next_protocol = + ((const struct rte_flow_item_ipv6 *) + (items->spec))->hdr.proto; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_validate_item_udp(items, item_flags, + next_protocol, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; + mask.udp = flow_tcf_item_mask + (items, &rte_flow_item_udp_mask, + &flow_tcf_mask_supported.udp, + &flow_tcf_mask_empty.udp, + sizeof(flow_tcf_mask_supported.udp), + error); + if (!mask.udp) + return -rte_errno; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + ret = mlx5_flow_validate_item_tcp + (items, item_flags, + next_protocol, + &flow_tcf_mask_supported.tcp, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; + mask.tcp = flow_tcf_item_mask + (items, &rte_flow_item_tcp_mask, + 
&flow_tcf_mask_supported.tcp, + &flow_tcf_mask_empty.tcp, + sizeof(flow_tcf_mask_supported.tcp), + error); + if (!mask.tcp) + return -rte_errno; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "item not supported"); + } + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + unsigned int i; + uint64_t current_action_flag = 0; + + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_PORT_ID: + current_action_flag = MLX5_FLOW_ACTION_PORT_ID; + if (!actions->conf) + break; + conf.port_id = actions->conf; + if (conf.port_id->original) + i = 0; + else + for (i = 0; ptoi[i].ifindex; ++i) + if (ptoi[i].port_id == conf.port_id->id) + break; + if (!ptoi[i].ifindex) + return rte_flow_error_set + (error, ENODEV, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + conf.port_id, + "missing data to convert port ID to" + " ifindex"); + port_id_dev = &rte_eth_devices[conf.port_id->id]; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + current_action_flag = MLX5_FLOW_ACTION_JUMP; + if (!actions->conf) + break; + conf.jump = actions->conf; + if (attr->group >= conf.jump->group) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "can jump only to a group forward"); + break; + case RTE_FLOW_ACTION_TYPE_DROP: + current_action_flag = MLX5_FLOW_ACTION_DROP; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + break; + case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: + current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "vlan modify is not supported," + " set action must follow push action"); + current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID; + break; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: + if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "vlan modify is not supported," + " set action must follow push action"); + current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST; + break; + case RTE_FLOW_ACTION_TYPE_SET_TTL: + current_action_flag = MLX5_FLOW_ACTION_SET_TTL; + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + current_action_flag = MLX5_FLOW_ACTION_DEC_TTL; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) { + if (!actions->conf) + return 
rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + actions, + "action configuration not set"); + } + if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) && + pedit_validated) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "set actions should be " + "listed successively"); + if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) && + (action_flags & MLX5_TCF_PEDIT_ACTIONS)) + pedit_validated = 1; + if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) && + (action_flags & MLX5_TCF_FATE_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "can't have multiple fate" + " actions"); + action_flags |= current_action_flag; + } + if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) && + (action_flags & MLX5_FLOW_ACTION_DROP)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "set action is not compatible with " + "drop action"); + if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) && + !(action_flags & MLX5_FLOW_ACTION_PORT_ID)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "set action must be followed by " + "port_id action"); + if (action_flags & + (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) { + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no ipv4 item found in" + " pattern"); + } + if (action_flags & + (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) { + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no ipv6 item found in" + " pattern"); + } + if (action_flags & + (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) { + if (!(item_flags & + (MLX5_FLOW_LAYER_OUTER_L4_UDP | + MLX5_FLOW_LAYER_OUTER_L4_TCP))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no TCP/UDP item found in" + " pattern"); + } + /* + * FW syndrome (0xA9C090): + * set_flow_table_entry: push vlan action fte in fdb can ONLY be + * forward to the uplink. + */ + if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) && + (action_flags & MLX5_FLOW_ACTION_PORT_ID) && + ((struct priv *)port_id_dev->data->dev_private)->representor) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "vlan push can only be applied" + " when forwarding to uplink port"); + /* + * FW syndrome (0x294609): + * set_flow_table_entry: modify/pop/push actions in fdb flow table + * are supported only while forwarding to vport. 
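+	 * Hence VLAN and pedit (set) actions are accepted only in combination with a port_id fate action.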
+ */ + if ((action_flags & MLX5_TCF_VLAN_ACTIONS) && + !(action_flags & MLX5_FLOW_ACTION_PORT_ID)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "vlan actions are supported" + " only with port_id action"); + if (!(action_flags & MLX5_TCF_FATE_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "no fate action is found"); + if (action_flags & + (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) { + if (!(item_flags & + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | + MLX5_FLOW_LAYER_OUTER_L3_IPV6))) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no IP found in pattern"); + } + if (action_flags & + (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) { + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "no ethernet found in" + " pattern"); + } + return 0; +} + +/** + * Calculate maximum size of memory for flow items of Linux TC flower and + * extract specified items. + * + * @param[in] items + * Pointer to the list of items. + * @param[out] item_flags + * Pointer to the detected items. + * + * @return + * Maximum size of memory for items. + */ +static int +flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + uint64_t *item_flags) +{ + int size = 0; + uint64_t flags = 0; + + size += SZ_NLATTR_STRZ_OF("flower") + + SZ_NLATTR_NEST + /* TCA_OPTIONS. */ + SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */ + if (attr->group > 0) + size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */ + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_PORT_ID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ + SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4; + /* dst/src MAC addr and mask. */ + flags |= MLX5_FLOW_LAYER_OUTER_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ + SZ_NLATTR_TYPE_OF(uint16_t) + + /* VLAN Ether type. */ + SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */ + SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */ + flags |= MLX5_FLOW_LAYER_OUTER_VLAN; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ + SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ + SZ_NLATTR_TYPE_OF(uint32_t) * 4; + /* dst/src IP addr and mask. */ + flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ + SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ + SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4; + /* dst/src IP addr and mask. */ + flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ + SZ_NLATTR_TYPE_OF(uint16_t) * 4; + /* dst/src port and mask. */ + flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ + SZ_NLATTR_TYPE_OF(uint16_t) * 4; + /* dst/src port and mask. 
*/ + flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + default: + DRV_LOG(WARNING, + "unsupported item %p type %d," + " items must be validated before flow creation", + (const void *)items, items->type); + break; + } + } + *item_flags = flags; + return size; +} + +/** + * Calculate maximum size of memory for flow actions of Linux TC flower and + * extract specified actions. + * + * @param[in] actions + * Pointer to the list of actions. + * @param[out] action_flags + * Pointer to the detected actions. + * + * @return + * Maximum size of memory for actions. + */ +static int +flow_tcf_get_actions_and_size(const struct rte_flow_action actions[], + uint64_t *action_flags) +{ + int size = 0; + uint64_t flags = 0; + + size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_PORT_ID: + size += SZ_NLATTR_NEST + /* na_act_index. */ + SZ_NLATTR_STRZ_OF("mirred") + + SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */ + SZ_NLATTR_TYPE_OF(struct tc_mirred); + flags |= MLX5_FLOW_ACTION_PORT_ID; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + size += SZ_NLATTR_NEST + /* na_act_index. */ + SZ_NLATTR_STRZ_OF("gact") + + SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */ + SZ_NLATTR_TYPE_OF(struct tc_gact); + flags |= MLX5_FLOW_ACTION_JUMP; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + size += SZ_NLATTR_NEST + /* na_act_index. */ + SZ_NLATTR_STRZ_OF("gact") + + SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */ + SZ_NLATTR_TYPE_OF(struct tc_gact); + flags |= MLX5_FLOW_ACTION_DROP; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + break; + case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: + flags |= MLX5_FLOW_ACTION_OF_POP_VLAN; + goto action_of_vlan; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN; + goto action_of_vlan; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID; + goto action_of_vlan; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: + flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP; + goto action_of_vlan; +action_of_vlan: + size += SZ_NLATTR_NEST + /* na_act_index. */ + SZ_NLATTR_STRZ_OF("vlan") + + SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */ + SZ_NLATTR_TYPE_OF(struct tc_vlan) + + SZ_NLATTR_TYPE_OF(uint16_t) + + /* VLAN protocol. */ + SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */ + SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */ + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + case RTE_FLOW_ACTION_TYPE_SET_TTL: + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + size += flow_tcf_get_pedit_actions_size(&actions, + &flags); + break; + default: + DRV_LOG(WARNING, + "unsupported action %p type %d," + " items must be validated before flow creation", + (const void *)actions, actions->type); + break; + } + } + *action_flags = flags; + return size; +} + +/** + * Brand rtnetlink buffer with unique handle. + * + * This handle should be unique for a given network interface to avoid + * collisions. + * + * @param nlh + * Pointer to Netlink message. + * @param handle + * Unique 32-bit handle to use. 
+ */ +static void +flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle) +{ + struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh); + + tcm->tcm_handle = handle; + DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x", + (void *)nlh, handle); +} + +/** + * Prepare a flow object for Linux TC flower. It calculates the maximum size of + * memory required, allocates the memory, initializes Netlink message headers + * and set unique TC message handle. + * + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] item_flags + * Pointer to bit mask of all items detected. + * @param[out] action_flags + * Pointer to bit mask of all actions detected. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to mlx5_flow object on success, + * otherwise NULL and rte_ernno is set. + */ +static struct mlx5_flow * +flow_tcf_prepare(const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + uint64_t *item_flags, uint64_t *action_flags, + struct rte_flow_error *error) +{ + size_t size = sizeof(struct mlx5_flow) + + MNL_ALIGN(sizeof(struct nlmsghdr)) + + MNL_ALIGN(sizeof(struct tcmsg)); + struct mlx5_flow *dev_flow; + struct nlmsghdr *nlh; + struct tcmsg *tcm; + + size += flow_tcf_get_items_and_size(attr, items, item_flags); + size += flow_tcf_get_actions_and_size(actions, action_flags); + dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO); + if (!dev_flow) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not enough memory to create E-Switch flow"); + return NULL; + } + nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1)); + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + *dev_flow = (struct mlx5_flow){ + .tcf = (struct mlx5_flow_tcf){ + .nlh = nlh, + .tcm = tcm, + }, + }; + /* + * Generate a reasonably unique handle based on the address of the + * target buffer. + * + * This is straightforward on 32-bit systems where the flow pointer can + * be used directly. Otherwise, its least significant part is taken + * after shifting it by the previous power of two of the pointed buffer + * size. + */ + if (sizeof(dev_flow) <= 4) + flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow); + else + flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >> + rte_log2_u32(rte_align32prevpow2(size))); + return dev_flow; +} + +/** + * Make adjustments for supporting count actions. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 On success else a negative errno value is returned and rte_errno is set. + */ +static int +flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct rte_flow *flow = dev_flow->flow; + + if (!flow->counter) { + flow->counter = flow_tcf_counter_new(); + if (!flow->counter) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "cannot get counter" + " context."); + } + return 0; +} + +/** + * Translate flow for Linux TC flower and construct Netlink message. + * + * @param[in] priv + * Pointer to the priv structure. + * @param[in, out] flow + * Pointer to the sub flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. 
+ * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +static int +flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + union { + const struct rte_flow_item_port_id *port_id; + const struct rte_flow_item_eth *eth; + const struct rte_flow_item_vlan *vlan; + const struct rte_flow_item_ipv4 *ipv4; + const struct rte_flow_item_ipv6 *ipv6; + const struct rte_flow_item_tcp *tcp; + const struct rte_flow_item_udp *udp; + } spec, mask; + union { + const struct rte_flow_action_port_id *port_id; + const struct rte_flow_action_jump *jump; + const struct rte_flow_action_of_push_vlan *of_push_vlan; + const struct rte_flow_action_of_set_vlan_vid * + of_set_vlan_vid; + const struct rte_flow_action_of_set_vlan_pcp * + of_set_vlan_pcp; + } conf; + struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)]; + struct nlmsghdr *nlh = dev_flow->tcf.nlh; + struct tcmsg *tcm = dev_flow->tcf.tcm; + uint32_t na_act_index_cur; + bool eth_type_set = 0; + bool vlan_present = 0; + bool vlan_eth_type_set = 0; + bool ip_proto_set = 0; + struct nlattr *na_flower; + struct nlattr *na_flower_act; + struct nlattr *na_vlan_id = NULL; + struct nlattr *na_vlan_priority = NULL; + uint64_t item_flags = 0; + int ret; + + claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi, + PTOI_TABLE_SZ_MAX(dev))); + nlh = dev_flow->tcf.nlh; + tcm = dev_flow->tcf.tcm; + /* Prepare API must have been called beforehand. */ + assert(nlh != NULL && tcm != NULL); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = ptoi[0].ifindex; + tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS); + /* + * Priority cannot be zero to prevent the kernel from picking one + * automatically. 
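+	 * E.g. rte_flow priority 0 becomes TC priority 1, encoded in the upper 16 bits of tcm_info with ETH_P_ALL in the lower 16 bits.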
+ */ + tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, + RTE_BE16(ETH_P_ALL)); + if (attr->group > 0) + mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group); + mnl_attr_put_strz(nlh, TCA_KIND, "flower"); + na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS); + mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW); + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + unsigned int i; + + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_PORT_ID: + mask.port_id = flow_tcf_item_mask + (items, &rte_flow_item_port_id_mask, + &flow_tcf_mask_supported.port_id, + &flow_tcf_mask_empty.port_id, + sizeof(flow_tcf_mask_supported.port_id), + error); + assert(mask.port_id); + if (mask.port_id == &flow_tcf_mask_empty.port_id) + break; + spec.port_id = items->spec; + if (!mask.port_id->id) + i = 0; + else + for (i = 0; ptoi[i].ifindex; ++i) + if (ptoi[i].port_id == spec.port_id->id) + break; + assert(ptoi[i].ifindex); + tcm->tcm_ifindex = ptoi[i].ifindex; + break; + case RTE_FLOW_ITEM_TYPE_ETH: + item_flags |= MLX5_FLOW_LAYER_OUTER_L2; + mask.eth = flow_tcf_item_mask + (items, &rte_flow_item_eth_mask, + &flow_tcf_mask_supported.eth, + &flow_tcf_mask_empty.eth, + sizeof(flow_tcf_mask_supported.eth), + error); + assert(mask.eth); + if (mask.eth == &flow_tcf_mask_empty.eth) + break; + spec.eth = items->spec; + if (mask.eth->type) { + mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE, + spec.eth->type); + eth_type_set = 1; + } + if (!is_zero_ether_addr(&mask.eth->dst)) { + mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST, + ETHER_ADDR_LEN, + spec.eth->dst.addr_bytes); + mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK, + ETHER_ADDR_LEN, + mask.eth->dst.addr_bytes); + } + if (!is_zero_ether_addr(&mask.eth->src)) { + mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC, + ETHER_ADDR_LEN, + spec.eth->src.addr_bytes); + mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK, + ETHER_ADDR_LEN, + mask.eth->src.addr_bytes); + } + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN; + mask.vlan = flow_tcf_item_mask + (items, &rte_flow_item_vlan_mask, + &flow_tcf_mask_supported.vlan, + &flow_tcf_mask_empty.vlan, + sizeof(flow_tcf_mask_supported.vlan), + error); + assert(mask.vlan); + if (!eth_type_set) + mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE, + RTE_BE16(ETH_P_8021Q)); + eth_type_set = 1; + vlan_present = 1; + if (mask.vlan == &flow_tcf_mask_empty.vlan) + break; + spec.vlan = items->spec; + if (mask.vlan->inner_type) { + mnl_attr_put_u16(nlh, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, + spec.vlan->inner_type); + vlan_eth_type_set = 1; + } + if (mask.vlan->tci & RTE_BE16(0xe000)) + mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO, + (rte_be_to_cpu_16 + (spec.vlan->tci) >> 13) & 0x7); + if (mask.vlan->tci & RTE_BE16(0x0fff)) + mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID, + rte_be_to_cpu_16 + (spec.vlan->tci & + RTE_BE16(0x0fff))); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + mask.ipv4 = flow_tcf_item_mask + (items, &rte_flow_item_ipv4_mask, + &flow_tcf_mask_supported.ipv4, + &flow_tcf_mask_empty.ipv4, + sizeof(flow_tcf_mask_supported.ipv4), + error); + assert(mask.ipv4); + if (!eth_type_set || !vlan_eth_type_set) + mnl_attr_put_u16(nlh, + vlan_present ? 
+ TCA_FLOWER_KEY_VLAN_ETH_TYPE : + TCA_FLOWER_KEY_ETH_TYPE, + RTE_BE16(ETH_P_IP)); + eth_type_set = 1; + vlan_eth_type_set = 1; + if (mask.ipv4 == &flow_tcf_mask_empty.ipv4) + break; + spec.ipv4 = items->spec; + if (mask.ipv4->hdr.next_proto_id) { + mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, + spec.ipv4->hdr.next_proto_id); + ip_proto_set = 1; + } + if (mask.ipv4->hdr.src_addr) { + mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC, + spec.ipv4->hdr.src_addr); + mnl_attr_put_u32(nlh, + TCA_FLOWER_KEY_IPV4_SRC_MASK, + mask.ipv4->hdr.src_addr); + } + if (mask.ipv4->hdr.dst_addr) { + mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST, + spec.ipv4->hdr.dst_addr); + mnl_attr_put_u32(nlh, + TCA_FLOWER_KEY_IPV4_DST_MASK, + mask.ipv4->hdr.dst_addr); + } + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + mask.ipv6 = flow_tcf_item_mask + (items, &rte_flow_item_ipv6_mask, + &flow_tcf_mask_supported.ipv6, + &flow_tcf_mask_empty.ipv6, + sizeof(flow_tcf_mask_supported.ipv6), + error); + assert(mask.ipv6); + if (!eth_type_set || !vlan_eth_type_set) + mnl_attr_put_u16(nlh, + vlan_present ? + TCA_FLOWER_KEY_VLAN_ETH_TYPE : + TCA_FLOWER_KEY_ETH_TYPE, + RTE_BE16(ETH_P_IPV6)); + eth_type_set = 1; + vlan_eth_type_set = 1; + if (mask.ipv6 == &flow_tcf_mask_empty.ipv6) + break; + spec.ipv6 = items->spec; + if (mask.ipv6->hdr.proto) { + mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, + spec.ipv6->hdr.proto); + ip_proto_set = 1; + } + if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) { + mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC, + sizeof(spec.ipv6->hdr.src_addr), + spec.ipv6->hdr.src_addr); + mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(mask.ipv6->hdr.src_addr), + mask.ipv6->hdr.src_addr); + } + if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) { + mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST, + sizeof(spec.ipv6->hdr.dst_addr), + spec.ipv6->hdr.dst_addr); + mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(mask.ipv6->hdr.dst_addr), + mask.ipv6->hdr.dst_addr); + } + break; + case RTE_FLOW_ITEM_TYPE_UDP: + item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; + mask.udp = flow_tcf_item_mask + (items, &rte_flow_item_udp_mask, + &flow_tcf_mask_supported.udp, + &flow_tcf_mask_empty.udp, + sizeof(flow_tcf_mask_supported.udp), + error); + assert(mask.udp); + if (!ip_proto_set) + mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, + IPPROTO_UDP); + if (mask.udp == &flow_tcf_mask_empty.udp) + break; + spec.udp = items->spec; + if (mask.udp->hdr.src_port) { + mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC, + spec.udp->hdr.src_port); + mnl_attr_put_u16(nlh, + TCA_FLOWER_KEY_UDP_SRC_MASK, + mask.udp->hdr.src_port); + } + if (mask.udp->hdr.dst_port) { + mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST, + spec.udp->hdr.dst_port); + mnl_attr_put_u16(nlh, + TCA_FLOWER_KEY_UDP_DST_MASK, + mask.udp->hdr.dst_port); + } + break; + case RTE_FLOW_ITEM_TYPE_TCP: + item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; + mask.tcp = flow_tcf_item_mask + (items, &rte_flow_item_tcp_mask, + &flow_tcf_mask_supported.tcp, + &flow_tcf_mask_empty.tcp, + sizeof(flow_tcf_mask_supported.tcp), + error); + assert(mask.tcp); + if (!ip_proto_set) + mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, + IPPROTO_TCP); + if (mask.tcp == &flow_tcf_mask_empty.tcp) + break; + spec.tcp = items->spec; + if (mask.tcp->hdr.src_port) { + mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC, + spec.tcp->hdr.src_port); + mnl_attr_put_u16(nlh, + TCA_FLOWER_KEY_TCP_SRC_MASK, + mask.tcp->hdr.src_port); + } + if (mask.tcp->hdr.dst_port) { + 
mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST, + spec.tcp->hdr.dst_port); + mnl_attr_put_u16(nlh, + TCA_FLOWER_KEY_TCP_DST_MASK, + mask.tcp->hdr.dst_port); + } + if (mask.tcp->hdr.tcp_flags) { + mnl_attr_put_u16 + (nlh, + TCA_FLOWER_KEY_TCP_FLAGS, + rte_cpu_to_be_16 + (spec.tcp->hdr.tcp_flags)); + mnl_attr_put_u16 + (nlh, + TCA_FLOWER_KEY_TCP_FLAGS_MASK, + rte_cpu_to_be_16 + (mask.tcp->hdr.tcp_flags)); + } + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "item not supported"); + } + } + na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT); + na_act_index_cur = 1; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + struct nlattr *na_act_index; + struct nlattr *na_act; + unsigned int vlan_act; + unsigned int i; + + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_PORT_ID: + conf.port_id = actions->conf; + if (conf.port_id->original) + i = 0; + else + for (i = 0; ptoi[i].ifindex; ++i) + if (ptoi[i].port_id == conf.port_id->id) + break; + assert(ptoi[i].ifindex); + na_act_index = + mnl_attr_nest_start(nlh, na_act_index_cur++); + assert(na_act_index); + mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred"); + na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS); + assert(na_act); + mnl_attr_put(nlh, TCA_MIRRED_PARMS, + sizeof(struct tc_mirred), + &(struct tc_mirred){ + .action = TC_ACT_STOLEN, + .eaction = TCA_EGRESS_REDIR, + .ifindex = ptoi[i].ifindex, + }); + mnl_attr_nest_end(nlh, na_act); + mnl_attr_nest_end(nlh, na_act_index); + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + conf.jump = actions->conf; + na_act_index = + mnl_attr_nest_start(nlh, na_act_index_cur++); + assert(na_act_index); + mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact"); + na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS); + assert(na_act); + mnl_attr_put(nlh, TCA_GACT_PARMS, + sizeof(struct tc_gact), + &(struct tc_gact){ + .action = TC_ACT_GOTO_CHAIN | + conf.jump->group, + }); + mnl_attr_nest_end(nlh, na_act); + mnl_attr_nest_end(nlh, na_act_index); + break; + case RTE_FLOW_ACTION_TYPE_DROP: + na_act_index = + mnl_attr_nest_start(nlh, na_act_index_cur++); + assert(na_act_index); + mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact"); + na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS); + assert(na_act); + mnl_attr_put(nlh, TCA_GACT_PARMS, + sizeof(struct tc_gact), + &(struct tc_gact){ + .action = TC_ACT_SHOT, + }); + mnl_attr_nest_end(nlh, na_act); + mnl_attr_nest_end(nlh, na_act_index); + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + /* + * Driver adds the count action implicitly for + * each rule it creates. 
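+			 * Only the software counter context is allocated here; the statistics themselves are kept per filter by the kernel and read back in flow_tcf_query_count().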
+ */ + ret = flow_tcf_translate_action_count(dev, + dev_flow, error); + if (ret < 0) + return ret; + break; + case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: + conf.of_push_vlan = NULL; + vlan_act = TCA_VLAN_ACT_POP; + goto action_of_vlan; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + conf.of_push_vlan = actions->conf; + vlan_act = TCA_VLAN_ACT_PUSH; + goto action_of_vlan; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: + conf.of_set_vlan_vid = actions->conf; + if (na_vlan_id) + goto override_na_vlan_id; + vlan_act = TCA_VLAN_ACT_MODIFY; + goto action_of_vlan; + case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP: + conf.of_set_vlan_pcp = actions->conf; + if (na_vlan_priority) + goto override_na_vlan_priority; + vlan_act = TCA_VLAN_ACT_MODIFY; + goto action_of_vlan; +action_of_vlan: + na_act_index = + mnl_attr_nest_start(nlh, na_act_index_cur++); + assert(na_act_index); + mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan"); + na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS); + assert(na_act); + mnl_attr_put(nlh, TCA_VLAN_PARMS, + sizeof(struct tc_vlan), + &(struct tc_vlan){ + .action = TC_ACT_PIPE, + .v_action = vlan_act, + }); + if (vlan_act == TCA_VLAN_ACT_POP) { + mnl_attr_nest_end(nlh, na_act); + mnl_attr_nest_end(nlh, na_act_index); + break; + } + if (vlan_act == TCA_VLAN_ACT_PUSH) + mnl_attr_put_u16(nlh, + TCA_VLAN_PUSH_VLAN_PROTOCOL, + conf.of_push_vlan->ethertype); + na_vlan_id = mnl_nlmsg_get_payload_tail(nlh); + mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0); + na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh); + mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0); + mnl_attr_nest_end(nlh, na_act); + mnl_attr_nest_end(nlh, na_act_index); + if (actions->type == + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) { +override_na_vlan_id: + na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID; + *(uint16_t *)mnl_attr_get_payload(na_vlan_id) = + rte_be_to_cpu_16 + (conf.of_set_vlan_vid->vlan_vid); + } else if (actions->type == + RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) { +override_na_vlan_priority: + na_vlan_priority->nla_type = + TCA_VLAN_PUSH_VLAN_PRIORITY; + *(uint8_t *)mnl_attr_get_payload + (na_vlan_priority) = + conf.of_set_vlan_pcp->vlan_pcp; + } + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + case RTE_FLOW_ACTION_TYPE_SET_TTL: + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + na_act_index = + mnl_attr_nest_start(nlh, na_act_index_cur++); + flow_tcf_create_pedit_mnl_msg(nlh, + &actions, item_flags); + mnl_attr_nest_end(nlh, na_act_index); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + assert(na_flower); + assert(na_flower_act); + mnl_attr_nest_end(nlh, na_flower_act); + mnl_attr_nest_end(nlh, na_flower); + return 0; +} + +/** + * Send Netlink message with acknowledgment. + * + * @param ctx + * Flow context to use. + * @param nlh + * Message to send. This function always raises the NLM_F_ACK flag before + * sending. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. 
+ */ +static int +flow_tcf_nl_ack(struct mlx5_flow_tcf_context *ctx, struct nlmsghdr *nlh) +{ + alignas(struct nlmsghdr) + uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) + + nlh->nlmsg_len - sizeof(*nlh)]; + uint32_t seq = ctx->seq++; + struct mnl_socket *nl = ctx->nl; + int ret; + + nlh->nlmsg_flags |= NLM_F_ACK; + nlh->nlmsg_seq = seq; + ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len); + if (ret != -1) + ret = mnl_socket_recvfrom(nl, ans, sizeof(ans)); + if (ret != -1) + ret = mnl_cb_run + (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL); + if (ret > 0) + return 0; + rte_errno = errno; + return -rte_errno; +} + +/** + * Apply flow to E-Switch by sending Netlink message. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to the sub flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +static int +flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_tcf_context *ctx = priv->tcf_context; + struct mlx5_flow *dev_flow; + struct nlmsghdr *nlh; + + dev_flow = LIST_FIRST(&flow->dev_flows); + /* E-Switch flow can't be expanded. */ + assert(!LIST_NEXT(dev_flow, next)); + nlh = dev_flow->tcf.nlh; + nlh->nlmsg_type = RTM_NEWTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + if (!flow_tcf_nl_ack(ctx, nlh)) + return 0; + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to create TC flow rule"); +} + +/** + * Remove flow from E-Switch by sending Netlink message. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to the sub flow. + */ +static void +flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_tcf_context *ctx = priv->tcf_context; + struct mlx5_flow *dev_flow; + struct nlmsghdr *nlh; + + if (!flow) + return; + if (flow->counter) { + if (--flow->counter->ref_cnt == 0) { + rte_free(flow->counter); + flow->counter = NULL; + } + } + dev_flow = LIST_FIRST(&flow->dev_flows); + if (!dev_flow) + return; + /* E-Switch flow can't be expanded. */ + assert(!LIST_NEXT(dev_flow, next)); + nlh = dev_flow->tcf.nlh; + nlh->nlmsg_type = RTM_DELTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST; + flow_tcf_nl_ack(ctx, nlh); +} + +/** + * Remove flow from E-Switch and release resources of the device flow. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to the sub flow. + */ +static void +flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow *dev_flow; + + if (!flow) + return; + flow_tcf_remove(dev, flow); + dev_flow = LIST_FIRST(&flow->dev_flows); + if (!dev_flow) + return; + /* E-Switch flow can't be expanded. */ + assert(!LIST_NEXT(dev_flow, next)); + LIST_REMOVE(dev_flow, next); + rte_free(dev_flow); +} + +/** + * Helper routine for figuring the space size required for a parse buffer. + * + * @param array + * array of values to use. + * @param idx + * Current location in array. + * @param value + * Value to compare with. + * + * @return + * The maximum between the given value and the array value on index. + */ +static uint16_t +flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value) +{ + return idx < 0 ? 
(value) : RTE_MAX((array)[idx], value); +} + +/** + * Parse rtnetlink message attributes filling the attribute table with the info + * retrieved. + * + * @param tb + * Attribute table to be filled. + * @param[out] max + * Maxinum entry in the attribute table. + * @param rte + * The attributes section in the message to be parsed. + * @param len + * The length of the attributes section in the message. + */ +static void +flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max, + struct rtattr *rta, int len) +{ + unsigned short type; + memset(tb, 0, sizeof(struct rtattr *) * (max + 1)); + while (RTA_OK(rta, len)) { + type = rta->rta_type; + if (type <= max && !tb[type]) + tb[type] = rta; + rta = RTA_NEXT(rta, len); + } +} + +/** + * Extract flow counters from flower action. + * + * @param rta + * flower action stats properties in the Netlink message received. + * @param rta_type + * The backward sequence of rta_types, as written in the attribute table, + * we need to traverse in order to get to the requested object. + * @param idx + * Current location in rta_type table. + * @param[out] data + * data holding the count statistics of the rte_flow retrieved from + * the message. + * + * @return + * 0 if data was found and retrieved, -1 otherwise. + */ +static int +flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta, + uint16_t rta_type[], int idx, + struct gnet_stats_basic *data) +{ + int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx, + TCA_STATS_BASIC); + struct rtattr *tbs[tca_stats_max + 1]; + + if (rta == NULL || idx < 0) + return -1; + flow_tcf_nl_parse_rtattr(tbs, tca_stats_max, + RTA_DATA(rta), RTA_PAYLOAD(rta)); + switch (rta_type[idx]) { + case TCA_STATS_BASIC: + if (tbs[TCA_STATS_BASIC]) { + memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]), + RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]), + sizeof(*data))); + return 0; + } + break; + default: + break; + } + return -1; +} + +/** + * Parse flower single action retrieving the requested action attribute, + * if found. + * + * @param arg + * flower action properties in the Netlink message received. + * @param rta_type + * The backward sequence of rta_types, as written in the attribute table, + * we need to traverse in order to get to the requested object. + * @param idx + * Current location in rta_type table. + * @param[out] data + * Count statistics retrieved from the message query. + * + * @return + * 0 if data was found and retrieved, -1 otherwise. + */ +static int +flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg, + uint16_t rta_type[], int idx, void *data) +{ + int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS); + struct rtattr *tb[tca_act_max + 1]; + + if (arg == NULL || idx < 0) + return -1; + flow_tcf_nl_parse_rtattr(tb, tca_act_max, + RTA_DATA(arg), RTA_PAYLOAD(arg)); + if (tb[TCA_ACT_KIND] == NULL) + return -1; + switch (rta_type[idx]) { + case TCA_ACT_STATS: + if (tb[TCA_ACT_STATS]) + return flow_tcf_nl_action_stats_parse_and_get + (tb[TCA_ACT_STATS], + rta_type, --idx, + (struct gnet_stats_basic *)data); + break; + default: + break; + } + return -1; +} + +/** + * Parse flower action section in the message retrieving the requested + * attribute from the first action that provides it. + * + * @param opt + * flower section in the Netlink message received. + * @param rta_type + * The backward sequence of rta_types, as written in the attribute table, + * we need to traverse in order to get to the requested object. + * @param idx + * Current location in rta_type table. 
+ * @param[out] data + * data retrieved from the message query. + * + * @return + * 0 if data was found and retrieved, -1 otherwise. + */ +static int +flow_tcf_nl_action_parse_and_get(struct rtattr *arg, + uint16_t rta_type[], int idx, void *data) +{ + struct rtattr *tb[TCA_ACT_MAX_PRIO + 1]; + int i; + + if (arg == NULL || idx < 0) + return -1; + flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO, + RTA_DATA(arg), RTA_PAYLOAD(arg)); + switch (rta_type[idx]) { + /* + * flow counters are stored in the actions defined by the flow + * and not in the flow itself, therefore we need to traverse the + * flower chain of actions in search for them. + * + * Note that the index is not decremented here. + */ + case TCA_ACT_STATS: + for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) { + if (tb[i] && + !flow_tcf_nl_parse_one_action_and_get(tb[i], + rta_type, + idx, data)) + return 0; + } + break; + default: + break; + } + return -1; +} + +/** + * Parse flower classifier options in the message, retrieving the requested + * attribute if found. + * + * @param opt + * flower section in the Netlink message received. + * @param rta_type + * The backward sequence of rta_types, as written in the attribute table, + * we need to traverse in order to get to the requested object. + * @param idx + * Current location in rta_type table. + * @param[out] data + * data retrieved from the message query. + * + * @return + * 0 if data was found and retrieved, -1 otherwise. + */ +static int +flow_tcf_nl_opts_parse_and_get(struct rtattr *opt, + uint16_t rta_type[], int idx, void *data) +{ + int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx, + TCA_FLOWER_ACT); + struct rtattr *tb[tca_flower_max + 1]; + + if (!opt || idx < 0) + return -1; + flow_tcf_nl_parse_rtattr(tb, tca_flower_max, + RTA_DATA(opt), RTA_PAYLOAD(opt)); + switch (rta_type[idx]) { + case TCA_FLOWER_ACT: + if (tb[TCA_FLOWER_ACT]) + return flow_tcf_nl_action_parse_and_get + (tb[TCA_FLOWER_ACT], + rta_type, --idx, data); + break; + default: + break; + } + return -1; +} + +/** + * Parse Netlink reply on filter query, retrieving the flow counters. + * + * @param nlh + * Message received from Netlink. + * @param rta_type + * The backward sequence of rta_types, as written in the attribute table, + * we need to traverse in order to get to the requested object. + * @param idx + * Current location in rta_type table. + * @param[out] data + * data retrieved from the message query. + * + * @return + * 0 if data was found and retrieved, -1 otherwise. + */ +static int +flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh, + uint16_t rta_type[], int idx, void *data) +{ + struct nlmsghdr *nlh = cnlh; + struct tcmsg *t = NLMSG_DATA(nlh); + int len = nlh->nlmsg_len; + int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS); + struct rtattr *tb[tca_max + 1]; + + if (idx < 0) + return -1; + if (nlh->nlmsg_type != RTM_NEWTFILTER && + nlh->nlmsg_type != RTM_GETTFILTER && + nlh->nlmsg_type != RTM_DELTFILTER) + return -1; + len -= NLMSG_LENGTH(sizeof(*t)); + if (len < 0) + return -1; + flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len); + /* Not a TC flower flow - bail out */ + if (!tb[TCA_KIND] || + strcmp(RTA_DATA(tb[TCA_KIND]), "flower")) + return -1; + switch (rta_type[idx]) { + case TCA_OPTIONS: + if (tb[TCA_OPTIONS]) + return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS], + rta_type, + --idx, data); + break; + default: + break; + } + return -1; +} + +/** + * A callback to parse Netlink reply on TC flower query. + * + * @param nlh + * Message received from Netlink. 
+ * @param[out] data + * Pointer to data area to be filled by the parsing routine. + * assumed to be a pinter to struct flow_tcf_stats_basic. + * + * @return + * MNL_CB_OK value. + */ +static int +flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data) +{ + /* + * The backward sequence of rta_types to pass in order to get + * to the counters. + */ + uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS, + TCA_FLOWER_ACT, TCA_OPTIONS }; + struct flow_tcf_stats_basic *sb_data = data; + union { + const struct nlmsghdr *c; + struct nlmsghdr *nc; + } tnlh = { .c = nlh }; + + if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type, + RTE_DIM(rta_type) - 1, + (void *)&sb_data->counters)) + sb_data->valid = true; + return MNL_CB_OK; +} + +/** + * Query a TC flower rule for its statistics via netlink. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in] flow + * Pointer to the sub flow. + * @param[out] data + * data retrieved by the query. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_tcf_query_count(struct rte_eth_dev *dev, + struct rte_flow *flow, + void *data, + struct rte_flow_error *error) +{ + struct flow_tcf_stats_basic sb_data = { 0 }; + struct rte_flow_query_count *qc = data; + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_tcf_context *ctx = priv->tcf_context; + struct mnl_socket *nl = ctx->nl; + struct mlx5_flow *dev_flow; + struct nlmsghdr *nlh; + uint32_t seq = priv->tcf_context->seq++; + ssize_t ret; + assert(qc); + + dev_flow = LIST_FIRST(&flow->dev_flows); + /* E-Switch flow can't be expanded. */ + assert(!LIST_NEXT(dev_flow, next)); + if (!dev_flow->flow->counter) + goto notsup_exit; + nlh = dev_flow->tcf.nlh; + nlh->nlmsg_type = RTM_GETTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO; + nlh->nlmsg_seq = seq; + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1) + goto error_exit; + do { + ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size); + if (ret <= 0) + break; + ret = mnl_cb_run(ctx->buf, ret, seq, + mnl_socket_get_portid(nl), + flow_tcf_nl_message_get_stats_basic, + (void *)&sb_data); + } while (ret > 0); + /* Return the delta from last reset. */ + if (sb_data.valid) { + /* Return the delta from last reset. */ + qc->hits_set = 1; + qc->bytes_set = 1; + qc->hits = sb_data.counters.packets - flow->counter->hits; + qc->bytes = sb_data.counters.bytes - flow->counter->bytes; + if (qc->reset) { + flow->counter->hits = sb_data.counters.packets; + flow->counter->bytes = sb_data.counters.bytes; + } + return 0; + } + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "flow does not have counter"); +error_exit: + return rte_flow_error_set + (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "netlink: failed to read flow rule counters"); +notsup_exit: + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "counters are not available."); +} + +/** + * Query a flow. 
+ * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_tcf_query(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + int ret = -EINVAL; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_tcf_query_count(dev, flow, data, error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + return ret; +} + +const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = { + .validate = flow_tcf_validate, + .prepare = flow_tcf_prepare, + .translate = flow_tcf_translate, + .apply = flow_tcf_apply, + .remove = flow_tcf_remove, + .destroy = flow_tcf_destroy, + .query = flow_tcf_query, +}; + +/** + * Create and configure a libmnl socket for Netlink flow rules. + * + * @return + * A valid libmnl socket object pointer on success, NULL otherwise and + * rte_errno is set. + */ +static struct mnl_socket * +flow_tcf_mnl_socket_create(void) +{ + struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE); + + if (nl) { + mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 }, + sizeof(int)); + if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID)) + return nl; + } + rte_errno = errno; + if (nl) + mnl_socket_close(nl); + return NULL; +} + +/** + * Destroy a libmnl socket. + * + * @param nl + * Libmnl socket of the @p NETLINK_ROUTE kind. + */ +static void +flow_tcf_mnl_socket_destroy(struct mnl_socket *nl) +{ + if (nl) + mnl_socket_close(nl); +} + +/** + * Initialize ingress qdisc of a given network interface. + * + * @param ctx + * Pointer to tc-flower context to use. + * @param ifindex + * Index of network interface to initialize. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, + unsigned int ifindex, struct rte_flow_error *error) +{ + struct nlmsghdr *nlh; + struct tcmsg *tcm; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)]; + + /* Destroy existing ingress qdisc and everything attached to it. */ + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_DELQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST; + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = ifindex; + tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); + tcm->tcm_parent = TC_H_INGRESS; + /* Ignore errors when qdisc is already absent. */ + if (flow_tcf_nl_ack(ctx, nlh) && + rte_errno != EINVAL && rte_errno != ENOENT) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to remove ingress" + " qdisc"); + /* Create fresh ingress qdisc. 
*/ + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_NEWQDISC; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = ifindex; + tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); + tcm->tcm_parent = TC_H_INGRESS; + mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress"); + if (flow_tcf_nl_ack(ctx, nlh)) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to create ingress" + " qdisc"); + return 0; +} + +/** + * Create libmnl context for Netlink flow rules. + * + * @return + * A valid libmnl socket object pointer on success, NULL otherwise and + * rte_errno is set. + */ +struct mlx5_flow_tcf_context * +mlx5_flow_tcf_context_create(void) +{ + struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__, + sizeof(*ctx), + sizeof(uint32_t)); + if (!ctx) + goto error; + ctx->nl = flow_tcf_mnl_socket_create(); + if (!ctx->nl) + goto error; + ctx->buf_size = MNL_SOCKET_BUFFER_SIZE; + ctx->buf = rte_zmalloc(__func__, + ctx->buf_size, sizeof(uint32_t)); + if (!ctx->buf) + goto error; + ctx->seq = random(); + return ctx; +error: + mlx5_flow_tcf_context_destroy(ctx); + return NULL; +} + +/** + * Destroy a libmnl context. + * + * @param ctx + * Libmnl socket of the @p NETLINK_ROUTE kind. + */ +void +mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx) +{ + if (!ctx) + return; + flow_tcf_mnl_socket_destroy(ctx->nl); + rte_free(ctx->buf); + rte_free(ctx); +} diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c new file mode 100644 index 00000000..81bc39f9 --- /dev/null +++ b/drivers/net/mlx5/mlx5_flow_verbs.c @@ -0,0 +1,1825 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include <netinet/in.h> +#include <sys/queue.h> +#include <stdalign.h> +#include <stdint.h> +#include <string.h> + +/* Verbs header. */ +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ +#ifdef PEDANTIC +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#include <infiniband/verbs.h> +#ifdef PEDANTIC +#pragma GCC diagnostic error "-Wpedantic" +#endif + +#include <rte_common.h> +#include <rte_ether.h> +#include <rte_eth_ctrl.h> +#include <rte_ethdev_driver.h> +#include <rte_flow.h> +#include <rte_flow_driver.h> +#include <rte_malloc.h> +#include <rte_ip.h> + +#include "mlx5.h" +#include "mlx5_defs.h" +#include "mlx5_prm.h" +#include "mlx5_glue.h" +#include "mlx5_flow.h" + +/** + * Create Verbs flow counter with Verbs library. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] counter + * mlx5 flow counter object, contains the counter id, + * handle of created Verbs flow counter is returned + * in cs field (if counters are supported). + * + * @return + * 0 On success else a negative errno value is returned + * and rte_errno is set. 
+ */ +static int +flow_verbs_counter_create(struct rte_eth_dev *dev, + struct mlx5_flow_counter *counter) +{ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct priv *priv = dev->data->dev_private; + struct ibv_counter_set_init_attr init = { + .counter_set_id = counter->id}; + + counter->cs = mlx5_glue->create_counter_set(priv->ctx, &init); + if (!counter->cs) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + return 0; +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + struct priv *priv = dev->data->dev_private; + struct ibv_counters_init_attr init = {0}; + struct ibv_counter_attach_attr attach = {0}; + int ret; + + counter->cs = mlx5_glue->create_counters(priv->ctx, &init); + if (!counter->cs) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + attach.counter_desc = IBV_COUNTER_PACKETS; + attach.index = 0; + ret = mlx5_glue->attach_counters(counter->cs, &attach, NULL); + if (!ret) { + attach.counter_desc = IBV_COUNTER_BYTES; + attach.index = 1; + ret = mlx5_glue->attach_counters + (counter->cs, &attach, NULL); + } + if (ret) { + claim_zero(mlx5_glue->destroy_counters(counter->cs)); + counter->cs = NULL; + rte_errno = ret; + return -ret; + } + return 0; +#else + (void)dev; + (void)counter; + rte_errno = ENOTSUP; + return -ENOTSUP; +#endif +} + +/** + * Get a flow counter. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] shared + * Indicate if this counter is shared with other flows. + * @param[in] id + * Counter identifier. + * + * @return + * A pointer to the counter, NULL otherwise and rte_errno is set. + */ +static struct mlx5_flow_counter * +flow_verbs_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_counter *cnt; + int ret; + + LIST_FOREACH(cnt, &priv->flow_counters, next) { + if (!cnt->shared || cnt->shared != shared) + continue; + if (cnt->id != id) + continue; + cnt->ref_cnt++; + return cnt; + } + cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0); + if (!cnt) { + rte_errno = ENOMEM; + return NULL; + } + cnt->id = id; + cnt->shared = shared; + cnt->ref_cnt = 1; + cnt->hits = 0; + cnt->bytes = 0; + /* Create counter with Verbs. */ + ret = flow_verbs_counter_create(dev, cnt); + if (!ret) { + LIST_INSERT_HEAD(&priv->flow_counters, cnt, next); + return cnt; + } + /* Some error occurred in Verbs library. */ + rte_free(cnt); + rte_errno = -ret; + return NULL; +} + +/** + * Release a flow counter. + * + * @param[in] counter + * Pointer to the counter handler. + */ +static void +flow_verbs_counter_release(struct mlx5_flow_counter *counter) +{ + if (--counter->ref_cnt == 0) { +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + claim_zero(mlx5_glue->destroy_counter_set(counter->cs)); +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + claim_zero(mlx5_glue->destroy_counters(counter->cs)); +#endif + LIST_REMOVE(counter, next); + rte_free(counter); + } +} + +/** + * Query a flow counter via Verbs library call. 
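+ * Reads the packet and byte counters from the Verbs counter object and
+ * returns the delta from the last reset; the stored baseline is updated
+ * when the reset flag is set in the query.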
+ * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_verbs_counter_query(struct rte_eth_dev *dev __rte_unused, + struct rte_flow *flow, void *data, + struct rte_flow_error *error) +{ +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + if (flow->actions & MLX5_FLOW_ACTION_COUNT) { + struct rte_flow_query_count *qc = data; + uint64_t counters[2] = {0, 0}; +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + struct ibv_query_counter_set_attr query_cs_attr = { + .cs = flow->counter->cs, + .query_flags = IBV_COUNTER_SET_FORCE_UPDATE, + }; + struct ibv_counter_set_data query_out = { + .out = counters, + .outlen = 2 * sizeof(uint64_t), + }; + int err = mlx5_glue->query_counter_set(&query_cs_attr, + &query_out); +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + int err = mlx5_glue->query_counters + (flow->counter->cs, counters, + RTE_DIM(counters), + IBV_READ_COUNTERS_ATTR_PREFER_CACHED); +#endif + if (err) + return rte_flow_error_set + (error, err, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "cannot read counter"); + qc->hits_set = 1; + qc->bytes_set = 1; + qc->hits = counters[0] - flow->counter->hits; + qc->bytes = counters[1] - flow->counter->bytes; + if (qc->reset) { + flow->counter->hits = counters[0]; + flow->counter->bytes = counters[1]; + } + return 0; + } + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "flow does not have counter"); +#else + (void)flow; + (void)data; + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "counters are not available"); +#endif +} + +/** + * Add a verbs item specification into @p flow. + * + * @param[in, out] flow + * Pointer to flow structure. + * @param[in] src + * Create specification. + * @param[in] size + * Size in bytes of the specification to copy. + */ +static void +flow_verbs_spec_add(struct mlx5_flow *flow, void *src, unsigned int size) +{ + struct mlx5_flow_verbs *verbs = &flow->verbs; + + if (verbs->specs) { + void *dst; + + dst = (void *)(verbs->specs + verbs->size); + memcpy(dst, src, size); + ++verbs->attr->num_of_specs; + } + verbs->size += size; +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. + * @param[in] item_flags + * Bit field with all detected items. + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + */ +static void +flow_verbs_translate_item_eth(const struct rte_flow_item *item, + uint64_t *item_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_item_eth *spec = item->spec; + const struct rte_flow_item_eth *mask = item->mask; + const int tunnel = !!(*item_flags & MLX5_FLOW_LAYER_TUNNEL); + const unsigned int size = sizeof(struct ibv_flow_spec_eth); + struct ibv_flow_spec_eth eth = { + .type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_eth_mask; + if (spec) { + unsigned int i; + + memcpy(ð.val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN); + memcpy(ð.val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN); + eth.val.ether_type = spec->type; + memcpy(ð.mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN); + memcpy(ð.mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN); + eth.mask.ether_type = mask->type; + /* Remove unwanted bits from values. 
*/ + for (i = 0; i < ETHER_ADDR_LEN; ++i) { + eth.val.dst_mac[i] &= eth.mask.dst_mac[i]; + eth.val.src_mac[i] &= eth.mask.src_mac[i]; + } + eth.val.ether_type &= eth.mask.ether_type; + dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; + } + flow_verbs_spec_add(dev_flow, ð, size); + *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; +} + +/** + * Update the VLAN tag in the Verbs Ethernet specification. + * This function assumes that the input is valid and there is space to add + * the requested item. + * + * @param[in, out] attr + * Pointer to Verbs attributes structure. + * @param[in] eth + * Verbs structure containing the VLAN information to copy. + */ +static void +flow_verbs_item_vlan_update(struct ibv_flow_attr *attr, + struct ibv_flow_spec_eth *eth) +{ + unsigned int i; + const enum ibv_flow_spec_type search = eth->type; + struct ibv_spec_header *hdr = (struct ibv_spec_header *) + ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); + + for (i = 0; i != attr->num_of_specs; ++i) { + if (hdr->type == search) { + struct ibv_flow_spec_eth *e = + (struct ibv_flow_spec_eth *)hdr; + + e->val.vlan_tag = eth->val.vlan_tag; + e->mask.vlan_tag = eth->mask.vlan_tag; + e->val.ether_type = eth->val.ether_type; + e->mask.ether_type = eth->mask.ether_type; + break; + } + hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); + } +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. + * @param[in, out] item_flags + * Bit mask that holds all detected items. + * @param[in, out] dev_flow + * Pointer to dev_flow structure. + */ +static void +flow_verbs_translate_item_vlan(const struct rte_flow_item *item, + uint64_t *item_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_eth); + const int tunnel = !!(*item_flags & MLX5_FLOW_LAYER_TUNNEL); + struct ibv_flow_spec_eth eth = { + .type = IBV_FLOW_SPEC_ETH | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, + }; + const uint32_t l2m = tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + + if (!mask) + mask = &rte_flow_item_vlan_mask; + if (spec) { + eth.val.vlan_tag = spec->tci; + eth.mask.vlan_tag = mask->tci; + eth.val.vlan_tag &= eth.mask.vlan_tag; + eth.val.ether_type = spec->inner_type; + eth.mask.ether_type = mask->inner_type; + eth.val.ether_type &= eth.mask.ether_type; + } + if (!(*item_flags & l2m)) { + dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; + flow_verbs_spec_add(dev_flow, ð, size); + } else { + flow_verbs_item_vlan_update(dev_flow->verbs.attr, ð); + size = 0; /* Only an update is done in eth specification. */ + } + *item_flags |= tunnel ? + (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_VLAN) : + (MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_VLAN); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. + * @param[in, out] item_flags + * Bit mask that marks all detected items. + * @param[in, out] dev_flow + * Pointer to sepacific flow structure. 
+ */ +static void +flow_verbs_translate_item_ipv4(const struct rte_flow_item *item, + uint64_t *item_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 *mask = item->mask; + const int tunnel = !!(*item_flags & MLX5_FLOW_LAYER_TUNNEL); + unsigned int size = sizeof(struct ibv_flow_spec_ipv4_ext); + struct ibv_flow_spec_ipv4_ext ipv4 = { + .type = IBV_FLOW_SPEC_IPV4_EXT | + (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_ipv4_mask; + *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (spec) { + ipv4.val = (struct ibv_flow_ipv4_ext_filter){ + .src_ip = spec->hdr.src_addr, + .dst_ip = spec->hdr.dst_addr, + .proto = spec->hdr.next_proto_id, + .tos = spec->hdr.type_of_service, + }; + ipv4.mask = (struct ibv_flow_ipv4_ext_filter){ + .src_ip = mask->hdr.src_addr, + .dst_ip = mask->hdr.dst_addr, + .proto = mask->hdr.next_proto_id, + .tos = mask->hdr.type_of_service, + }; + /* Remove unwanted bits from values. */ + ipv4.val.src_ip &= ipv4.mask.src_ip; + ipv4.val.dst_ip &= ipv4.mask.dst_ip; + ipv4.val.proto &= ipv4.mask.proto; + ipv4.val.tos &= ipv4.mask.tos; + } + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust(dev_flow, tunnel, + MLX5_IPV4_LAYER_TYPES, + MLX5_IPV4_IBV_RX_HASH); + dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L3; + flow_verbs_spec_add(dev_flow, &ipv4, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. + * @param[in, out] item_flags + * Bit mask that marks all detected items. + * @param[in, out] dev_flow + * Pointer to sepacific flow structure. + */ +static void +flow_verbs_translate_item_ipv6(const struct rte_flow_item *item, + uint64_t *item_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_item_ipv6 *spec = item->spec; + const struct rte_flow_item_ipv6 *mask = item->mask; + const int tunnel = !!(dev_flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int size = sizeof(struct ibv_flow_spec_ipv6); + struct ibv_flow_spec_ipv6 ipv6 = { + .type = IBV_FLOW_SPEC_IPV6 | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_ipv6_mask; + *item_flags |= tunnel ? 
MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (spec) { + unsigned int i; + uint32_t vtc_flow_val; + uint32_t vtc_flow_mask; + + memcpy(&ipv6.val.src_ip, spec->hdr.src_addr, + RTE_DIM(ipv6.val.src_ip)); + memcpy(&ipv6.val.dst_ip, spec->hdr.dst_addr, + RTE_DIM(ipv6.val.dst_ip)); + memcpy(&ipv6.mask.src_ip, mask->hdr.src_addr, + RTE_DIM(ipv6.mask.src_ip)); + memcpy(&ipv6.mask.dst_ip, mask->hdr.dst_addr, + RTE_DIM(ipv6.mask.dst_ip)); + vtc_flow_val = rte_be_to_cpu_32(spec->hdr.vtc_flow); + vtc_flow_mask = rte_be_to_cpu_32(mask->hdr.vtc_flow); + ipv6.val.flow_label = + rte_cpu_to_be_32((vtc_flow_val & IPV6_HDR_FL_MASK) >> + IPV6_HDR_FL_SHIFT); + ipv6.val.traffic_class = (vtc_flow_val & IPV6_HDR_TC_MASK) >> + IPV6_HDR_TC_SHIFT; + ipv6.val.next_hdr = spec->hdr.proto; + ipv6.val.hop_limit = spec->hdr.hop_limits; + ipv6.mask.flow_label = + rte_cpu_to_be_32((vtc_flow_mask & IPV6_HDR_FL_MASK) >> + IPV6_HDR_FL_SHIFT); + ipv6.mask.traffic_class = (vtc_flow_mask & IPV6_HDR_TC_MASK) >> + IPV6_HDR_TC_SHIFT; + ipv6.mask.next_hdr = mask->hdr.proto; + ipv6.mask.hop_limit = mask->hdr.hop_limits; + /* Remove unwanted bits from values. */ + for (i = 0; i < RTE_DIM(ipv6.val.src_ip); ++i) { + ipv6.val.src_ip[i] &= ipv6.mask.src_ip[i]; + ipv6.val.dst_ip[i] &= ipv6.mask.dst_ip[i]; + } + ipv6.val.flow_label &= ipv6.mask.flow_label; + ipv6.val.traffic_class &= ipv6.mask.traffic_class; + ipv6.val.next_hdr &= ipv6.mask.next_hdr; + ipv6.val.hop_limit &= ipv6.mask.hop_limit; + } + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust(dev_flow, tunnel, + MLX5_IPV6_LAYER_TYPES, + MLX5_IPV6_IBV_RX_HASH); + dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L3; + flow_verbs_spec_add(dev_flow, &ipv6, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. + * @param[in, out] item_flags + * Bit mask that marks all detected items. + * @param[in, out] dev_flow + * Pointer to sepacific flow structure. + */ +static void +flow_verbs_translate_item_udp(const struct rte_flow_item *item, + uint64_t *item_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = item->mask; + const int tunnel = !!(*item_flags & MLX5_FLOW_LAYER_TUNNEL); + unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); + struct ibv_flow_spec_tcp_udp udp = { + .type = IBV_FLOW_SPEC_UDP | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_udp_mask; + *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + if (spec) { + udp.val.dst_port = spec->hdr.dst_port; + udp.val.src_port = spec->hdr.src_port; + udp.mask.dst_port = mask->hdr.dst_port; + udp.mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + udp.val.src_port &= udp.mask.src_port; + udp.val.dst_port &= udp.mask.dst_port; + } + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust(dev_flow, tunnel, ETH_RSS_UDP, + (IBV_RX_HASH_SRC_PORT_UDP | + IBV_RX_HASH_DST_PORT_UDP)); + dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L4; + flow_verbs_spec_add(dev_flow, &udp, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. 
+ * @param[in, out] item_flags + * Bit mask that marks all detected items. + * @param[in, out] dev_flow + * Pointer to sepacific flow structure. + */ +static void +flow_verbs_translate_item_tcp(const struct rte_flow_item *item, + uint64_t *item_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_item_tcp *spec = item->spec; + const struct rte_flow_item_tcp *mask = item->mask; + const int tunnel = !!(dev_flow->layers & MLX5_FLOW_LAYER_TUNNEL); + unsigned int size = sizeof(struct ibv_flow_spec_tcp_udp); + struct ibv_flow_spec_tcp_udp tcp = { + .type = IBV_FLOW_SPEC_TCP | (tunnel ? IBV_FLOW_SPEC_INNER : 0), + .size = size, + }; + + if (!mask) + mask = &rte_flow_item_tcp_mask; + *item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + if (spec) { + tcp.val.dst_port = spec->hdr.dst_port; + tcp.val.src_port = spec->hdr.src_port; + tcp.mask.dst_port = mask->hdr.dst_port; + tcp.mask.src_port = mask->hdr.src_port; + /* Remove unwanted bits from values. */ + tcp.val.src_port &= tcp.mask.src_port; + tcp.val.dst_port &= tcp.mask.dst_port; + } + dev_flow->verbs.hash_fields |= + mlx5_flow_hashfields_adjust(dev_flow, tunnel, ETH_RSS_TCP, + (IBV_RX_HASH_SRC_PORT_TCP | + IBV_RX_HASH_DST_PORT_TCP)); + dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L4; + flow_verbs_spec_add(dev_flow, &tcp, size); +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. + * @param[in, out] item_flags + * Bit mask that marks all detected items. + * @param[in, out] dev_flow + * Pointer to sepacific flow structure. + */ +static void +flow_verbs_translate_item_vxlan(const struct rte_flow_item *item, + uint64_t *item_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_item_vxlan *spec = item->spec; + const struct rte_flow_item_vxlan *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel vxlan = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!mask) + mask = &rte_flow_item_vxlan_mask; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + vxlan.val.tunnel_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); + vxlan.mask.tunnel_id = id.vlan_id; + /* Remove unwanted bits from values. */ + vxlan.val.tunnel_id &= vxlan.mask.tunnel_id; + } + flow_verbs_spec_add(dev_flow, &vxlan, size); + dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; + *item_flags |= MLX5_FLOW_LAYER_VXLAN; +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. + * @param[in, out] item_flags + * Bit mask that marks all detected items. + * @param[in, out] dev_flow + * Pointer to sepacific flow structure. 
+ */ +static void +flow_verbs_translate_item_vxlan_gpe(const struct rte_flow_item *item, + uint64_t *item_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_item_vxlan_gpe *spec = item->spec; + const struct rte_flow_item_vxlan_gpe *mask = item->mask; + unsigned int size = sizeof(struct ibv_flow_spec_tunnel); + struct ibv_flow_spec_tunnel vxlan_gpe = { + .type = IBV_FLOW_SPEC_VXLAN_TUNNEL, + .size = size, + }; + union vni { + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; + + if (!mask) + mask = &rte_flow_item_vxlan_gpe_mask; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); + vxlan_gpe.val.tunnel_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); + vxlan_gpe.mask.tunnel_id = id.vlan_id; + /* Remove unwanted bits from values. */ + vxlan_gpe.val.tunnel_id &= vxlan_gpe.mask.tunnel_id; + } + flow_verbs_spec_add(dev_flow, &vxlan_gpe, size); + dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2; + *item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE; +} + +/** + * Update the protocol in Verbs IPv4/IPv6 spec. + * + * @param[in, out] attr + * Pointer to Verbs attributes structure. + * @param[in] search + * Specification type to search in order to update the IP protocol. + * @param[in] protocol + * Protocol value to set if none is present in the specification. + */ +static void +flow_verbs_item_gre_ip_protocol_update(struct ibv_flow_attr *attr, + enum ibv_flow_spec_type search, + uint8_t protocol) +{ + unsigned int i; + struct ibv_spec_header *hdr = (struct ibv_spec_header *) + ((uint8_t *)attr + sizeof(struct ibv_flow_attr)); + + if (!attr) + return; + for (i = 0; i != attr->num_of_specs; ++i) { + if (hdr->type == search) { + union { + struct ibv_flow_spec_ipv4_ext *ipv4; + struct ibv_flow_spec_ipv6 *ipv6; + } ip; + + switch (search) { + case IBV_FLOW_SPEC_IPV4_EXT: + ip.ipv4 = (struct ibv_flow_spec_ipv4_ext *)hdr; + if (!ip.ipv4->val.proto) { + ip.ipv4->val.proto = protocol; + ip.ipv4->mask.proto = 0xff; + } + break; + case IBV_FLOW_SPEC_IPV6: + ip.ipv6 = (struct ibv_flow_spec_ipv6 *)hdr; + if (!ip.ipv6->val.next_hdr) { + ip.ipv6->val.next_hdr = protocol; + ip.ipv6->mask.next_hdr = 0xff; + } + break; + default: + break; + } + break; + } + hdr = (struct ibv_spec_header *)((uint8_t *)hdr + hdr->size); + } +} + +/** + * Convert the @p item into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested item + * into the flow. + * + * @param[in] item + * Item specification. + * @param[in, out] item_flags + * Bit mask that marks all detected items. + * @param[in, out] dev_flow + * Pointer to sepacific flow structure. 
+ */
+static void
+flow_verbs_translate_item_gre(const struct rte_flow_item *item __rte_unused,
+			      uint64_t *item_flags,
+			      struct mlx5_flow *dev_flow)
+{
+	struct mlx5_flow_verbs *verbs = &dev_flow->verbs;
+#ifndef HAVE_IBV_DEVICE_MPLS_SUPPORT
+	unsigned int size = sizeof(struct ibv_flow_spec_tunnel);
+	struct ibv_flow_spec_tunnel tunnel = {
+		.type = IBV_FLOW_SPEC_VXLAN_TUNNEL,
+		.size = size,
+	};
+#else
+	const struct rte_flow_item_gre *spec = item->spec;
+	const struct rte_flow_item_gre *mask = item->mask;
+	unsigned int size = sizeof(struct ibv_flow_spec_gre);
+	struct ibv_flow_spec_gre tunnel = {
+		.type = IBV_FLOW_SPEC_GRE,
+		.size = size,
+	};
+
+	if (!mask)
+		mask = &rte_flow_item_gre_mask;
+	if (spec) {
+		tunnel.val.c_ks_res0_ver = spec->c_rsvd0_ver;
+		tunnel.val.protocol = spec->protocol;
+		tunnel.mask.c_ks_res0_ver = mask->c_rsvd0_ver;
+		tunnel.mask.protocol = mask->protocol;
+		/* Remove unwanted bits from values. */
+		tunnel.val.c_ks_res0_ver &= tunnel.mask.c_ks_res0_ver;
+		tunnel.val.protocol &= tunnel.mask.protocol;
+		tunnel.val.key &= tunnel.mask.key;
+	}
+#endif
+	if (*item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4)
+		flow_verbs_item_gre_ip_protocol_update(verbs->attr,
+						       IBV_FLOW_SPEC_IPV4_EXT,
+						       IPPROTO_GRE);
+	else
+		flow_verbs_item_gre_ip_protocol_update(verbs->attr,
+						       IBV_FLOW_SPEC_IPV6,
+						       IPPROTO_GRE);
+	flow_verbs_spec_add(dev_flow, &tunnel, size);
+	verbs->attr->priority = MLX5_PRIORITY_MAP_L2;
+	*item_flags |= MLX5_FLOW_LAYER_GRE;
+}
+
+/**
+ * Convert the @p item into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested item
+ * into the flow.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in, out] action_flags
+ *   Bit mask that marks all detected items.
+ * @param[in, out] dev_flow
+ *   Pointer to specific flow structure.
+ */
+static void
+flow_verbs_translate_item_mpls(const struct rte_flow_item *item __rte_unused,
+			       uint64_t *action_flags __rte_unused,
+			       struct mlx5_flow *dev_flow __rte_unused)
+{
+#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
+	const struct rte_flow_item_mpls *spec = item->spec;
+	const struct rte_flow_item_mpls *mask = item->mask;
+	unsigned int size = sizeof(struct ibv_flow_spec_mpls);
+	struct ibv_flow_spec_mpls mpls = {
+		.type = IBV_FLOW_SPEC_MPLS,
+		.size = size,
+	};
+
+	if (!mask)
+		mask = &rte_flow_item_mpls_mask;
+	if (spec) {
+		memcpy(&mpls.val.label, spec, sizeof(mpls.val.label));
+		memcpy(&mpls.mask.label, mask, sizeof(mpls.mask.label));
+		/* Remove unwanted bits from values. */
+		mpls.val.label &= mpls.mask.label;
+	}
+	flow_verbs_spec_add(dev_flow, &mpls, size);
+	dev_flow->verbs.attr->priority = MLX5_PRIORITY_MAP_L2;
+	*action_flags |= MLX5_FLOW_LAYER_MPLS;
+#endif
+}
+
+/**
+ * Convert the @p action into a Verbs specification. This function assumes that
+ * the input is valid and that there is space to insert the requested action
+ * into the flow. This function also returns the action that was added.
+ *
+ * @param[in, out] action_flags
+ *   Pointer to the detected actions.
+ * @param[in] dev_flow
+ *   Pointer to mlx5_flow.
+ */ +static void +flow_verbs_translate_action_drop(uint64_t *action_flags, + struct mlx5_flow *dev_flow) +{ + unsigned int size = sizeof(struct ibv_flow_spec_action_drop); + struct ibv_flow_spec_action_drop drop = { + .type = IBV_FLOW_SPEC_ACTION_DROP, + .size = size, + }; + + flow_verbs_spec_add(dev_flow, &drop, size); + *action_flags |= MLX5_FLOW_ACTION_DROP; +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. This function also return the action that was added. + * + * @param[in] action + * Action configuration. + * @param[in, out] action_flags + * Pointer to the detected actions. + * @param[in] dev_flow + * Pointer to mlx5_flow. + */ +static void +flow_verbs_translate_action_queue(const struct rte_flow_action *action, + uint64_t *action_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_action_queue *queue = action->conf; + struct rte_flow *flow = dev_flow->flow; + + if (flow->queue) + (*flow->queue)[0] = queue->index; + flow->rss.queue_num = 1; + *action_flags |= MLX5_FLOW_ACTION_QUEUE; +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. This function also return the action that was added. + * + * @param[in] action + * Action configuration. + * @param[in, out] action_flags + * Pointer to the detected actions. + * @param[in] dev_flow + * Pointer to mlx5_flow. + */ +static void +flow_verbs_translate_action_rss(const struct rte_flow_action *action, + uint64_t *action_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_action_rss *rss = action->conf; + struct rte_flow *flow = dev_flow->flow; + + if (flow->queue) + memcpy((*flow->queue), rss->queue, + rss->queue_num * sizeof(uint16_t)); + flow->rss.queue_num = rss->queue_num; + memcpy(flow->key, rss->key, MLX5_RSS_HASH_KEY_LEN); + flow->rss.types = rss->types; + flow->rss.level = rss->level; + *action_flags |= MLX5_FLOW_ACTION_RSS; +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. This function also return the action that was added. + * + * @param[in] action + * Action configuration. + * @param[in, out] action_flags + * Pointer to the detected actions. + * @param[in] dev_flow + * Pointer to mlx5_flow. + */ +static void +flow_verbs_translate_action_flag + (const struct rte_flow_action *action __rte_unused, + uint64_t *action_flags, + struct mlx5_flow *dev_flow) +{ + unsigned int size = sizeof(struct ibv_flow_spec_action_tag); + struct ibv_flow_spec_action_tag tag = { + .type = IBV_FLOW_SPEC_ACTION_TAG, + .size = size, + .tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT), + }; + *action_flags |= MLX5_FLOW_ACTION_MARK; + flow_verbs_spec_add(dev_flow, &tag, size); +} + +/** + * Update verbs specification to modify the flag to mark. + * + * @param[in, out] verbs + * Pointer to the mlx5_flow_verbs structure. + * @param[in] mark_id + * Mark identifier to replace the flag. + */ +static void +flow_verbs_mark_update(struct mlx5_flow_verbs *verbs, uint32_t mark_id) +{ + struct ibv_spec_header *hdr; + int i; + + if (!verbs) + return; + /* Update Verbs specification. 
*/ + hdr = (struct ibv_spec_header *)verbs->specs; + if (!hdr) + return; + for (i = 0; i != verbs->attr->num_of_specs; ++i) { + if (hdr->type == IBV_FLOW_SPEC_ACTION_TAG) { + struct ibv_flow_spec_action_tag *t = + (struct ibv_flow_spec_action_tag *)hdr; + + t->tag_id = mlx5_flow_mark_set(mark_id); + } + hdr = (struct ibv_spec_header *)((uintptr_t)hdr + hdr->size); + } +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. This function also return the action that was added. + * + * @param[in] action + * Action configuration. + * @param[in, out] action_flags + * Pointer to the detected actions. + * @param[in] dev_flow + * Pointer to mlx5_flow. + */ +static void +flow_verbs_translate_action_mark(const struct rte_flow_action *action, + uint64_t *action_flags, + struct mlx5_flow *dev_flow) +{ + const struct rte_flow_action_mark *mark = action->conf; + unsigned int size = sizeof(struct ibv_flow_spec_action_tag); + struct ibv_flow_spec_action_tag tag = { + .type = IBV_FLOW_SPEC_ACTION_TAG, + .size = size, + }; + struct mlx5_flow_verbs *verbs = &dev_flow->verbs; + + if (*action_flags & MLX5_FLOW_ACTION_FLAG) { + flow_verbs_mark_update(verbs, mark->id); + size = 0; + } else { + tag.tag_id = mlx5_flow_mark_set(mark->id); + flow_verbs_spec_add(dev_flow, &tag, size); + } + *action_flags |= MLX5_FLOW_ACTION_MARK; +} + +/** + * Convert the @p action into a Verbs specification. This function assumes that + * the input is valid and that there is space to insert the requested action + * into the flow. This function also return the action that was added. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] action + * Action configuration. + * @param[in, out] action_flags + * Pointer to the detected actions. + * @param[in] dev_flow + * Pointer to mlx5_flow. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 On success else a negative errno value is returned and rte_errno is set. + */ +static int +flow_verbs_translate_action_count(struct rte_eth_dev *dev, + const struct rte_flow_action *action, + uint64_t *action_flags, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + const struct rte_flow_action_count *count = action->conf; + struct rte_flow *flow = dev_flow->flow; +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \ + defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + unsigned int size = sizeof(struct ibv_flow_spec_counter_action); + struct ibv_flow_spec_counter_action counter = { + .type = IBV_FLOW_SPEC_ACTION_COUNT, + .size = size, + }; +#endif + + if (!flow->counter) { + flow->counter = flow_verbs_counter_new(dev, count->shared, + count->id); + if (!flow->counter) + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_ACTION, + action, + "cannot get counter" + " context."); + } + *action_flags |= MLX5_FLOW_ACTION_COUNT; +#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) + counter.counter_set_handle = flow->counter->cs->handle; + flow_verbs_spec_add(dev_flow, &counter, size); +#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) + counter.counters = flow->counter->cs; + flow_verbs_spec_add(dev_flow, &counter, size); +#endif + return 0; +} + +/** + * Internal validation function. For validating both actions and items. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. 
+ * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_verbs_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + int ret; + uint64_t action_flags = 0; + uint64_t item_flags = 0; + int tunnel = 0; + uint8_t next_protocol = 0xff; + + if (items == NULL) + return -1; + ret = mlx5_flow_validate_attributes(dev, attr, error); + if (ret < 0) + return ret; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int ret = 0; + + tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + ret = mlx5_flow_validate_item_eth(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + ret = mlx5_flow_validate_item_vlan(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : + MLX5_FLOW_LAYER_OUTER_VLAN; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + ret = mlx5_flow_validate_item_ipv4(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv4 *) + items->mask)->hdr.next_proto_id) + next_protocol = + ((const struct rte_flow_item_ipv4 *) + (items->spec))->hdr.next_proto_id; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + ret = mlx5_flow_validate_item_ipv6(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + ret = mlx5_flow_validate_item_udp(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; + break; + case RTE_FLOW_ITEM_TYPE_TCP: + ret = mlx5_flow_validate_item_tcp + (items, item_flags, + next_protocol, + &rte_flow_item_tcp_mask, + error); + if (ret < 0) + return ret; + item_flags |= tunnel ? 
MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + ret = mlx5_flow_validate_item_vxlan(items, item_flags, + error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + ret = mlx5_flow_validate_item_vxlan_gpe(items, + item_flags, + dev, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GRE: + ret = mlx5_flow_validate_item_gre(items, item_flags, + next_protocol, error); + if (ret < 0) + return ret; + item_flags |= MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + ret = mlx5_flow_validate_item_mpls(items, item_flags, + next_protocol, + error); + if (ret < 0) + return ret; + if (next_protocol != 0xff && + next_protocol != IPPROTO_MPLS) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, items, + "protocol filtering not compatible" + " with MPLS layer"); + item_flags |= MLX5_FLOW_LAYER_MPLS; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "item not supported"); + } + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + ret = mlx5_flow_validate_action_flag(action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_FLAG; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + ret = mlx5_flow_validate_action_mark(actions, + action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_MARK; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + ret = mlx5_flow_validate_action_drop(action_flags, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_DROP; + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + ret = mlx5_flow_validate_action_queue(actions, + action_flags, dev, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + break; + case RTE_FLOW_ACTION_TYPE_RSS: + ret = mlx5_flow_validate_action_rss(actions, + action_flags, dev, + attr, + error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_RSS; + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = mlx5_flow_validate_action_count(dev, attr, error); + if (ret < 0) + return ret; + action_flags |= MLX5_FLOW_ACTION_COUNT; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + if (!(action_flags & MLX5_FLOW_FATE_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "no fate action is found"); + return 0; +} + +/** + * Calculate the required bytes that are needed for the action part of the verbs + * flow, in addtion returns bit-fields with all the detected action, in order to + * avoid another interation over the actions. + * + * @param[in] actions + * Pointer to the list of actions. + * @param[out] action_flags + * Pointer to the detected actions. + * + * @return + * The size of the memory needed for all actions. 
+ */
+static int
+flow_verbs_get_actions_and_size(const struct rte_flow_action actions[],
+				uint64_t *action_flags)
+{
+	int size = 0;
+	uint64_t detected_actions = 0;
+
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		switch (actions->type) {
+		case RTE_FLOW_ACTION_TYPE_VOID:
+			break;
+		case RTE_FLOW_ACTION_TYPE_FLAG:
+			size += sizeof(struct ibv_flow_spec_action_tag);
+			detected_actions |= MLX5_FLOW_ACTION_FLAG;
+			break;
+		case RTE_FLOW_ACTION_TYPE_MARK:
+			size += sizeof(struct ibv_flow_spec_action_tag);
+			detected_actions |= MLX5_FLOW_ACTION_MARK;
+			break;
+		case RTE_FLOW_ACTION_TYPE_DROP:
+			size += sizeof(struct ibv_flow_spec_action_drop);
+			detected_actions |= MLX5_FLOW_ACTION_DROP;
+			break;
+		case RTE_FLOW_ACTION_TYPE_QUEUE:
+			detected_actions |= MLX5_FLOW_ACTION_QUEUE;
+			break;
+		case RTE_FLOW_ACTION_TYPE_RSS:
+			detected_actions |= MLX5_FLOW_ACTION_RSS;
+			break;
+		case RTE_FLOW_ACTION_TYPE_COUNT:
+#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) || \
+	defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+			size += sizeof(struct ibv_flow_spec_counter_action);
+#endif
+			detected_actions |= MLX5_FLOW_ACTION_COUNT;
+			break;
+		default:
+			break;
+		}
+	}
+	*action_flags = detected_actions;
+	return size;
+}
+
+/**
+ * Calculate the required bytes that are needed for the item part of the verbs
+ * flow. In addition, return a bit-field with all the detected items, in order
+ * to avoid another iteration over the items.
+ *
+ * @param[in] items
+ *   Pointer to the list of items.
+ * @param[in, out] item_flags
+ *   Pointer to the detected items.
+ *
+ * @return
+ *   The size of the memory needed for all items.
+ */
+static int
+flow_verbs_get_items_and_size(const struct rte_flow_item items[],
+			      uint64_t *item_flags)
+{
+	int size = 0;
+	uint64_t detected_items = 0;
+
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+		int tunnel = !!(detected_items & MLX5_FLOW_LAYER_TUNNEL);
+
+		switch (items->type) {
+		case RTE_FLOW_ITEM_TYPE_VOID:
+			break;
+		case RTE_FLOW_ITEM_TYPE_ETH:
+			size += sizeof(struct ibv_flow_spec_eth);
+			detected_items |= tunnel ? MLX5_FLOW_LAYER_INNER_L2 :
+						   MLX5_FLOW_LAYER_OUTER_L2;
+			break;
+		case RTE_FLOW_ITEM_TYPE_VLAN:
+			size += sizeof(struct ibv_flow_spec_eth);
+			detected_items |= tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
+						   MLX5_FLOW_LAYER_OUTER_VLAN;
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV4:
+			size += sizeof(struct ibv_flow_spec_ipv4_ext);
+			detected_items |= tunnel ?
+					  MLX5_FLOW_LAYER_INNER_L3_IPV4 :
+					  MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV6:
+			size += sizeof(struct ibv_flow_spec_ipv6);
+			detected_items |= tunnel ?
+					  MLX5_FLOW_LAYER_INNER_L3_IPV6 :
+					  MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+			break;
+		case RTE_FLOW_ITEM_TYPE_UDP:
+			size += sizeof(struct ibv_flow_spec_tcp_udp);
+			detected_items |= tunnel ?
+					  MLX5_FLOW_LAYER_INNER_L4_UDP :
+					  MLX5_FLOW_LAYER_OUTER_L4_UDP;
+			break;
+		case RTE_FLOW_ITEM_TYPE_TCP:
+			size += sizeof(struct ibv_flow_spec_tcp_udp);
+			detected_items |= tunnel ?
+ MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + size += sizeof(struct ibv_flow_spec_tunnel); + detected_items |= MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + size += sizeof(struct ibv_flow_spec_tunnel); + detected_items |= MLX5_FLOW_LAYER_VXLAN_GPE; + break; +#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT + case RTE_FLOW_ITEM_TYPE_GRE: + size += sizeof(struct ibv_flow_spec_gre); + detected_items |= MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + size += sizeof(struct ibv_flow_spec_mpls); + detected_items |= MLX5_FLOW_LAYER_MPLS; + break; +#else + case RTE_FLOW_ITEM_TYPE_GRE: + size += sizeof(struct ibv_flow_spec_tunnel); + detected_items |= MLX5_FLOW_LAYER_TUNNEL; + break; +#endif + default: + break; + } + } + *item_flags = detected_items; + return size; +} + +/** + * Internal preparation function. Allocate mlx5_flow with the required size. + * The required size is calculate based on the actions and items. This function + * also returns the detected actions and items for later use. + * + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] item_flags + * Pointer to bit mask of all items detected. + * @param[out] action_flags + * Pointer to bit mask of all actions detected. + * @param[out] error + * Pointer to the error structure. + * + * @return + * Pointer to mlx5_flow object on success, otherwise NULL and rte_errno + * is set. + */ +static struct mlx5_flow * +flow_verbs_prepare(const struct rte_flow_attr *attr __rte_unused, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + uint64_t *item_flags, + uint64_t *action_flags, + struct rte_flow_error *error) +{ + uint32_t size = sizeof(struct mlx5_flow) + sizeof(struct ibv_flow_attr); + struct mlx5_flow *flow; + + size += flow_verbs_get_actions_and_size(actions, action_flags); + size += flow_verbs_get_items_and_size(items, item_flags); + flow = rte_calloc(__func__, 1, size, 0); + if (!flow) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "not enough memory to create flow"); + return NULL; + } + flow->verbs.attr = (void *)(flow + 1); + flow->verbs.specs = + (uint8_t *)(flow + 1) + sizeof(struct ibv_flow_attr); + return flow; +} + +/** + * Fill the flow with verb spec. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] dev_flow + * Pointer to the mlx5 flow. + * @param[in] attr + * Pointer to the flow attributes. + * @param[in] items + * Pointer to the list of items. + * @param[in] actions + * Pointer to the list of actions. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, else a negative errno value otherwise and rte_ernno is set. 
+ */ +static int +flow_verbs_translate(struct rte_eth_dev *dev, + struct mlx5_flow *dev_flow, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + uint64_t action_flags = 0; + uint64_t item_flags = 0; + uint64_t priority = attr->priority; + struct priv *priv = dev->data->dev_private; + + if (priority == MLX5_FLOW_PRIO_RSVD) + priority = priv->config.flow_prio - 1; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + int ret; + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_FLAG: + flow_verbs_translate_action_flag(actions, + &action_flags, + dev_flow); + break; + case RTE_FLOW_ACTION_TYPE_MARK: + flow_verbs_translate_action_mark(actions, + &action_flags, + dev_flow); + break; + case RTE_FLOW_ACTION_TYPE_DROP: + flow_verbs_translate_action_drop(&action_flags, + dev_flow); + break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + flow_verbs_translate_action_queue(actions, + &action_flags, + dev_flow); + break; + case RTE_FLOW_ACTION_TYPE_RSS: + flow_verbs_translate_action_rss(actions, + &action_flags, + dev_flow); + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_verbs_translate_action_count(dev, + actions, + &action_flags, + dev_flow, + error); + if (ret < 0) + return ret; + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + /* Device flow should have action flags by flow_drv_prepare(). */ + assert(dev_flow->flow->actions == action_flags); + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + switch (items->type) { + case RTE_FLOW_ITEM_TYPE_VOID: + break; + case RTE_FLOW_ITEM_TYPE_ETH: + flow_verbs_translate_item_eth(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_VLAN: + flow_verbs_translate_item_vlan(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + flow_verbs_translate_item_ipv4(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + flow_verbs_translate_item_ipv6(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_UDP: + flow_verbs_translate_item_udp(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_TCP: + flow_verbs_translate_item_tcp(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + flow_verbs_translate_item_vxlan(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: + flow_verbs_translate_item_vxlan_gpe(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_GRE: + flow_verbs_translate_item_gre(items, &item_flags, + dev_flow); + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + flow_verbs_translate_item_mpls(items, &item_flags, + dev_flow); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, + "item not supported"); + } + } + dev_flow->verbs.attr->priority = + mlx5_flow_adjust_priority(dev, priority, + dev_flow->verbs.attr->priority); + return 0; +} + +/** + * Remove the flow from the NIC but keeps it in memory. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. 
+ */ +static void +flow_verbs_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow_verbs *verbs; + struct mlx5_flow *dev_flow; + + if (!flow) + return; + LIST_FOREACH(dev_flow, &flow->dev_flows, next) { + verbs = &dev_flow->verbs; + if (verbs->flow) { + claim_zero(mlx5_glue->destroy_flow(verbs->flow)); + verbs->flow = NULL; + } + if (verbs->hrxq) { + if (flow->actions & MLX5_FLOW_ACTION_DROP) + mlx5_hrxq_drop_release(dev); + else + mlx5_hrxq_release(dev, verbs->hrxq); + verbs->hrxq = NULL; + } + } + if (flow->counter) { + flow_verbs_counter_release(flow->counter); + flow->counter = NULL; + } +} + +/** + * Remove the flow from the NIC and the memory. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + */ +static void +flow_verbs_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow *dev_flow; + + if (!flow) + return; + flow_verbs_remove(dev, flow); + while (!LIST_EMPTY(&flow->dev_flows)) { + dev_flow = LIST_FIRST(&flow->dev_flows); + LIST_REMOVE(dev_flow, next); + rte_free(dev_flow); + } +} + +/** + * Apply the flow to the NIC. + * + * @param[in] dev + * Pointer to the Ethernet device structure. + * @param[in, out] flow + * Pointer to flow structure. + * @param[out] error + * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct mlx5_flow_verbs *verbs; + struct mlx5_flow *dev_flow; + int err; + + LIST_FOREACH(dev_flow, &flow->dev_flows, next) { + verbs = &dev_flow->verbs; + if (flow->actions & MLX5_FLOW_ACTION_DROP) { + verbs->hrxq = mlx5_hrxq_drop_new(dev); + if (!verbs->hrxq) { + rte_flow_error_set + (error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get drop hash queue"); + goto error; + } + } else { + struct mlx5_hrxq *hrxq; + + hrxq = mlx5_hrxq_get(dev, flow->key, + MLX5_RSS_HASH_KEY_LEN, + verbs->hash_fields, + (*flow->queue), + flow->rss.queue_num); + if (!hrxq) + hrxq = mlx5_hrxq_new(dev, flow->key, + MLX5_RSS_HASH_KEY_LEN, + verbs->hash_fields, + (*flow->queue), + flow->rss.queue_num, + !!(dev_flow->layers & + MLX5_FLOW_LAYER_TUNNEL)); + if (!hrxq) { + rte_flow_error_set + (error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot get hash queue"); + goto error; + } + verbs->hrxq = hrxq; + } + verbs->flow = mlx5_glue->create_flow(verbs->hrxq->qp, + verbs->attr); + if (!verbs->flow) { + rte_flow_error_set(error, errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "hardware refuses to create flow"); + goto error; + } + } + return 0; +error: + err = rte_errno; /* Save rte_errno before cleanup. */ + LIST_FOREACH(dev_flow, &flow->dev_flows, next) { + verbs = &dev_flow->verbs; + if (verbs->hrxq) { + if (flow->actions & MLX5_FLOW_ACTION_DROP) + mlx5_hrxq_drop_release(dev); + else + mlx5_hrxq_release(dev, verbs->hrxq); + verbs->hrxq = NULL; + } + } + rte_errno = err; /* Restore rte_errno. */ + return -rte_errno; +} + +/** + * Query a flow. 
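+ * Only the COUNT action is supported; the query is forwarded to
+ * flow_verbs_counter_query().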
+ * + * @see rte_flow_query() + * @see rte_flow_ops + */ +static int +flow_verbs_query(struct rte_eth_dev *dev, + struct rte_flow *flow, + const struct rte_flow_action *actions, + void *data, + struct rte_flow_error *error) +{ + int ret = -EINVAL; + + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_VOID: + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + ret = flow_verbs_counter_query(dev, flow, data, error); + break; + default: + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, + actions, + "action not supported"); + } + } + return ret; +} + +const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops = { + .validate = flow_verbs_validate, + .prepare = flow_verbs_prepare, + .translate = flow_verbs_translate, + .apply = flow_verbs_apply, + .remove = flow_verbs_remove, + .destroy = flow_verbs_destroy, + .query = flow_verbs_query, +}; diff --git a/drivers/net/mlx5/mlx5_glue.c b/drivers/net/mlx5/mlx5_glue.c index 84f9492a..1afb114f 100644 --- a/drivers/net/mlx5/mlx5_glue.c +++ b/drivers/net/mlx5/mlx5_glue.c @@ -215,7 +215,7 @@ static struct ibv_counter_set * mlx5_glue_create_counter_set(struct ibv_context *context, struct ibv_counter_set_init_attr *init_attr) { -#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V42 (void)context; (void)init_attr; return NULL; @@ -227,7 +227,7 @@ mlx5_glue_create_counter_set(struct ibv_context *context, static int mlx5_glue_destroy_counter_set(struct ibv_counter_set *cs) { -#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V42 (void)cs; return ENOTSUP; #else @@ -240,7 +240,7 @@ mlx5_glue_describe_counter_set(struct ibv_context *context, uint16_t counter_set_id, struct ibv_counter_set_description *cs_desc) { -#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V42 (void)context; (void)counter_set_id; (void)cs_desc; @@ -254,7 +254,7 @@ static int mlx5_glue_query_counter_set(struct ibv_query_counter_set_attr *query_attr, struct ibv_counter_set_data *cs_data) { -#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V42 (void)query_attr; (void)cs_data; return ENOTSUP; @@ -263,6 +263,62 @@ mlx5_glue_query_counter_set(struct ibv_query_counter_set_attr *query_attr, #endif } +static struct ibv_counters * +mlx5_glue_create_counters(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr) +{ +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V45 + (void)context; + (void)init_attr; + return NULL; +#else + return ibv_create_counters(context, init_attr); +#endif +} + +static int +mlx5_glue_destroy_counters(struct ibv_counters *counters) +{ +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V45 + (void)counters; + return ENOTSUP; +#else + return ibv_destroy_counters(counters); +#endif +} + +static int +mlx5_glue_attach_counters(struct ibv_counters *counters, + struct ibv_counter_attach_attr *attr, + struct ibv_flow *flow) +{ +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V45 + (void)counters; + (void)attr; + (void)flow; + return ENOTSUP; +#else + return ibv_attach_counters_point_flow(counters, attr, flow); +#endif +} + +static int +mlx5_glue_query_counters(struct ibv_counters *counters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags) +{ +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V45 + (void)counters; + (void)counters_value; + (void)ncounters; + (void)flags; + return ENOTSUP; +#else + return ibv_read_counters(counters, counters_value, ncounters, flags); +#endif +} + 
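+/*
+ * Sketch of the typical usage of the counters glue API above, assuming
+ * HAVE_IBV_DEVICE_COUNTERS_SET_V45 is defined: create_counters(), then
+ * attach_counters() once for IBV_COUNTER_PACKETS and once for
+ * IBV_COUNTER_BYTES, then query_counters() into a two-entry uint64_t
+ * array, as done by flow_verbs_counter_create() and
+ * flow_verbs_counter_query() in mlx5_flow_verbs.c.
+ */
+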
static void mlx5_glue_ack_async_event(struct ibv_async_event *event) { @@ -346,6 +402,48 @@ mlx5_glue_dv_create_qp(struct ibv_context *context, #endif } +static struct mlx5dv_flow_matcher * +mlx5_glue_dv_create_flow_matcher(struct ibv_context *context, + struct mlx5dv_flow_matcher_attr *matcher_attr) +{ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + return mlx5dv_create_flow_matcher(context, matcher_attr); +#else + (void)context; + (void)matcher_attr; + return NULL; +#endif +} + +static struct ibv_flow * +mlx5_glue_dv_create_flow(struct mlx5dv_flow_matcher *matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr *actions_attr) +{ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + return mlx5dv_create_flow(matcher, match_value, + num_actions, actions_attr); +#else + (void)matcher; + (void)match_value; + (void)num_actions; + (void)actions_attr; + return NULL; +#endif +} + +static int +mlx5_glue_dv_destroy_flow_matcher(struct mlx5dv_flow_matcher *matcher) +{ +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + return mlx5dv_destroy_flow_matcher(matcher); +#else + (void)matcher; + return 0; +#endif +} + alignas(RTE_CACHE_LINE_SIZE) const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .version = MLX5_GLUE_VERSION, @@ -382,6 +480,10 @@ const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .destroy_counter_set = mlx5_glue_destroy_counter_set, .describe_counter_set = mlx5_glue_describe_counter_set, .query_counter_set = mlx5_glue_query_counter_set, + .create_counters = mlx5_glue_create_counters, + .destroy_counters = mlx5_glue_destroy_counters, + .attach_counters = mlx5_glue_attach_counters, + .query_counters = mlx5_glue_query_counters, .ack_async_event = mlx5_glue_ack_async_event, .get_async_event = mlx5_glue_get_async_event, .port_state_str = mlx5_glue_port_state_str, @@ -392,4 +494,7 @@ const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){ .dv_set_context_attr = mlx5_glue_dv_set_context_attr, .dv_init_obj = mlx5_glue_dv_init_obj, .dv_create_qp = mlx5_glue_dv_create_qp, + .dv_create_flow_matcher = mlx5_glue_dv_create_flow_matcher, + .dv_destroy_flow_matcher = mlx5_glue_dv_destroy_flow_matcher, + .dv_create_flow = mlx5_glue_dv_create_flow, }; diff --git a/drivers/net/mlx5/mlx5_glue.h b/drivers/net/mlx5/mlx5_glue.h index e584d367..44bfefed 100644 --- a/drivers/net/mlx5/mlx5_glue.h +++ b/drivers/net/mlx5/mlx5_glue.h @@ -23,7 +23,7 @@ #define MLX5_GLUE_VERSION "" #endif -#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V42 struct ibv_counter_set; struct ibv_counter_set_data; struct ibv_counter_set_description; @@ -31,6 +31,12 @@ struct ibv_counter_set_init_attr; struct ibv_query_counter_set_attr; #endif +#ifndef HAVE_IBV_DEVICE_COUNTERS_SET_V45 +struct ibv_counters; +struct ibv_counters_init_attr; +struct ibv_counter_attach_attr; +#endif + #ifndef HAVE_IBV_DEVICE_TUNNEL_SUPPORT struct mlx5dv_qp_init_attr; #endif @@ -39,6 +45,13 @@ struct mlx5dv_qp_init_attr; struct mlx5dv_wq_init_attr; #endif +#ifndef HAVE_IBV_FLOW_DV_SUPPORT +struct mlx5dv_flow_matcher; +struct mlx5dv_flow_matcher_attr; +struct mlx5dv_flow_action_attr; +struct mlx5dv_flow_match_parameters; +#endif + /* LIB_GLUE_VERSION must be updated every time this structure is modified. 
*/ struct mlx5_glue { const char *version; @@ -99,6 +112,17 @@ struct mlx5_glue { struct ibv_counter_set_description *cs_desc); int (*query_counter_set)(struct ibv_query_counter_set_attr *query_attr, struct ibv_counter_set_data *cs_data); + struct ibv_counters *(*create_counters) + (struct ibv_context *context, + struct ibv_counters_init_attr *init_attr); + int (*destroy_counters)(struct ibv_counters *counters); + int (*attach_counters)(struct ibv_counters *counters, + struct ibv_counter_attach_attr *attr, + struct ibv_flow *flow); + int (*query_counters)(struct ibv_counters *counters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags); void (*ack_async_event)(struct ibv_async_event *event); int (*get_async_event)(struct ibv_context *context, struct ibv_async_event *event); @@ -122,6 +146,14 @@ struct mlx5_glue { (struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex, struct mlx5dv_qp_init_attr *dv_qp_init_attr); + struct mlx5dv_flow_matcher *(*dv_create_flow_matcher) + (struct ibv_context *context, + struct mlx5dv_flow_matcher_attr *matcher_attr); + int (*dv_destroy_flow_matcher)(struct mlx5dv_flow_matcher *matcher); + struct ibv_flow *(*dv_create_flow)(struct mlx5dv_flow_matcher *matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr *actions_attr); }; const struct mlx5_glue *mlx5_glue; diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index 12ee37f5..672a4761 100644 --- a/drivers/net/mlx5/mlx5_mac.c +++ b/drivers/net/mlx5/mlx5_mac.c @@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN]) struct ifreq request; int ret; - ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0); + ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request); if (ret) return ret; memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index 1d1bcb5f..f4b15d3f 100644 --- a/drivers/net/mlx5/mlx5_mr.c +++ b/drivers/net/mlx5/mlx5_mr.c @@ -277,6 +277,23 @@ mr_find_next_chunk(struct mlx5_mr *mr, struct mlx5_mr_cache *entry, uintptr_t end = 0; uint32_t idx = 0; + /* MR for external memory doesn't have memseg list. */ + if (mr->msl == NULL) { + struct ibv_mr *ibv_mr = mr->ibv_mr; + + assert(mr->ms_bmp_n == 1); + assert(mr->ms_n == 1); + assert(base_idx == 0); + /* + * Can't search it from memseg list but get it directly from + * verbs MR as there's only one chunk. + */ + entry->start = (uintptr_t)ibv_mr->addr; + entry->end = (uintptr_t)ibv_mr->addr + mr->ibv_mr->length; + entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey); + /* Returning 1 ends iteration. */ + return 1; + } for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) { if (rte_bitmap_get(mr->ms_bmp, idx)) { const struct rte_memseg_list *msl; @@ -811,6 +828,7 @@ mlx5_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len) mr = mr_lookup_dev_list(dev, &entry, start); if (mr == NULL) continue; + assert(mr->msl); /* Can't be external memory. */ ms = rte_mem_virt2memseg((void *)start, msl); assert(ms != NULL); assert(msl->page_sz == ms->hugepage_sz); @@ -1061,6 +1079,139 @@ mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl) (void *)mr_ctrl, mr_ctrl->cur_gen); } +/** + * Called during rte_mempool_mem_iter() by mlx5_mr_update_ext_mp(). + * + * Externally allocated chunk is registered and a MR is created for the chunk. + * The MR object is added to the global list. 
If memseg list of a MR object + * (mr->msl) is null, the MR object can be regarded as externally allocated + * memory. + * + * Once external memory is registered, it should be static. If the memory is + * freed and the virtual address range has different physical memory mapped + * again, it may cause crash on device due to the wrong translation entry. PMD + * can't track the free event of the external memory for now. + */ +static void +mlx5_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque, + struct rte_mempool_memhdr *memhdr, + unsigned mem_idx __rte_unused) +{ + struct mr_update_mp_data *data = opaque; + struct rte_eth_dev *dev = data->dev; + struct priv *priv = dev->data->dev_private; + struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl; + struct mlx5_mr *mr = NULL; + uintptr_t addr = (uintptr_t)memhdr->addr; + size_t len = memhdr->len; + struct mlx5_mr_cache entry; + uint32_t lkey; + + /* If already registered, it should return. */ + rte_rwlock_read_lock(&priv->mr.rwlock); + lkey = mr_lookup_dev(dev, &entry, addr); + rte_rwlock_read_unlock(&priv->mr.rwlock); + if (lkey != UINT32_MAX) + return; + mr = rte_zmalloc_socket(NULL, + RTE_ALIGN_CEIL(sizeof(*mr), + RTE_CACHE_LINE_SIZE), + RTE_CACHE_LINE_SIZE, mp->socket_id); + if (mr == NULL) { + DRV_LOG(WARNING, + "port %u unable to allocate memory for a new MR of" + " mempool (%s).", + dev->data->port_id, mp->name); + data->ret = -1; + return; + } + DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)", + dev->data->port_id, mem_idx, mp->name); + mr->ibv_mr = mlx5_glue->reg_mr(priv->pd, (void *)addr, len, + IBV_ACCESS_LOCAL_WRITE); + if (mr->ibv_mr == NULL) { + DRV_LOG(WARNING, + "port %u fail to create a verbs MR for address (%p)", + dev->data->port_id, (void *)addr); + rte_free(mr); + data->ret = -1; + return; + } + mr->msl = NULL; /* Mark it is external memory. */ + mr->ms_bmp = NULL; + mr->ms_n = 1; + mr->ms_bmp_n = 1; + rte_rwlock_write_lock(&priv->mr.rwlock); + LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); + DRV_LOG(DEBUG, + "port %u MR CREATED (%p) for external memory %p:\n" + " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," + " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", + dev->data->port_id, (void *)mr, (void *)addr, + addr, addr + len, rte_cpu_to_be_32(mr->ibv_mr->lkey), + mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); + /* Insert to the global cache table. */ + mr_insert_dev_cache(dev, mr); + rte_rwlock_write_unlock(&priv->mr.rwlock); + /* Insert to the local cache table */ + mlx5_mr_addr2mr_bh(dev, mr_ctrl, addr); +} + +/** + * Register MR for entire memory chunks in a Mempool having externally allocated + * memory and fill in local cache. + * + * @param dev + * Pointer to Ethernet device. + * @param mr_ctrl + * Pointer to per-queue MR control structure. + * @param mp + * Pointer to registering Mempool. + * + * @return + * 0 on success, -1 on failure. + */ +static uint32_t +mlx5_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, + struct rte_mempool *mp) +{ + struct mr_update_mp_data data = { + .dev = dev, + .mr_ctrl = mr_ctrl, + .ret = 0, + }; + + rte_mempool_mem_iter(mp, mlx5_mr_update_ext_mp_cb, &data); + return data.ret; +} + +/** + * Register MR entire memory chunks in a Mempool having externally allocated + * memory and search LKey of the address to return. + * + * @param dev + * Pointer to Ethernet device. + * @param addr + * Search key. + * @param mp + * Pointer to registering Mempool where addr belongs. + * + * @return + * LKey for address on success, UINT32_MAX on failure. 
+ */ +uint32_t +mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr, + struct rte_mempool *mp) +{ + struct mlx5_txq_ctrl *txq_ctrl = + container_of(txq, struct mlx5_txq_ctrl, txq); + struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl; + struct priv *priv = txq_ctrl->priv; + + mlx5_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp); + return mlx5_tx_addr2mr_bh(txq, addr); +} + /* Called during rte_mempool_mem_iter() by mlx5_mr_update_mp(). */ static void mlx5_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque, @@ -1104,6 +1255,10 @@ mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl, }; rte_mempool_mem_iter(mp, mlx5_mr_update_mp_cb, &data); + if (data.ret < 0 && rte_errno == ENXIO) { + /* Mempool may have externally allocated memory. */ + return mlx5_mr_update_ext_mp(dev, mr_ctrl, mp); + } return data.ret; } diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c deleted file mode 100644 index a1c8c340..00000000 --- a/drivers/net/mlx5/mlx5_nl_flow.c +++ /dev/null @@ -1,1248 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2018 6WIND S.A. - * Copyright 2018 Mellanox Technologies, Ltd - */ - -#include <assert.h> -#include <errno.h> -#include <libmnl/libmnl.h> -#include <linux/if_ether.h> -#include <linux/netlink.h> -#include <linux/pkt_cls.h> -#include <linux/pkt_sched.h> -#include <linux/rtnetlink.h> -#include <linux/tc_act/tc_gact.h> -#include <linux/tc_act/tc_mirred.h> -#include <netinet/in.h> -#include <stdalign.h> -#include <stdbool.h> -#include <stddef.h> -#include <stdint.h> -#include <stdlib.h> -#include <sys/socket.h> - -#include <rte_byteorder.h> -#include <rte_errno.h> -#include <rte_ether.h> -#include <rte_flow.h> - -#include "mlx5.h" -#include "mlx5_autoconf.h" - -#ifdef HAVE_TC_ACT_VLAN - -#include <linux/tc_act/tc_vlan.h> - -#else /* HAVE_TC_ACT_VLAN */ - -#define TCA_VLAN_ACT_POP 1 -#define TCA_VLAN_ACT_PUSH 2 -#define TCA_VLAN_ACT_MODIFY 3 -#define TCA_VLAN_PARMS 2 -#define TCA_VLAN_PUSH_VLAN_ID 3 -#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4 -#define TCA_VLAN_PAD 5 -#define TCA_VLAN_PUSH_VLAN_PRIORITY 6 - -struct tc_vlan { - tc_gen; - int v_action; -}; - -#endif /* HAVE_TC_ACT_VLAN */ - -/* Normally found in linux/netlink.h. */ -#ifndef NETLINK_CAP_ACK -#define NETLINK_CAP_ACK 10 -#endif - -/* Normally found in linux/pkt_sched.h. */ -#ifndef TC_H_MIN_INGRESS -#define TC_H_MIN_INGRESS 0xfff2u -#endif - -/* Normally found in linux/pkt_cls.h. 
*/ -#ifndef TCA_CLS_FLAGS_SKIP_SW -#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) -#endif -#ifndef HAVE_TCA_FLOWER_ACT -#define TCA_FLOWER_ACT 3 -#endif -#ifndef HAVE_TCA_FLOWER_FLAGS -#define TCA_FLOWER_FLAGS 22 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE -#define TCA_FLOWER_KEY_ETH_TYPE 8 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST -#define TCA_FLOWER_KEY_ETH_DST 4 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK -#define TCA_FLOWER_KEY_ETH_DST_MASK 5 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC -#define TCA_FLOWER_KEY_ETH_SRC 6 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK -#define TCA_FLOWER_KEY_ETH_SRC_MASK 7 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO -#define TCA_FLOWER_KEY_IP_PROTO 9 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC -#define TCA_FLOWER_KEY_IPV4_SRC 10 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK -#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST -#define TCA_FLOWER_KEY_IPV4_DST 12 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK -#define TCA_FLOWER_KEY_IPV4_DST_MASK 13 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC -#define TCA_FLOWER_KEY_IPV6_SRC 14 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK -#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST -#define TCA_FLOWER_KEY_IPV6_DST 16 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK -#define TCA_FLOWER_KEY_IPV6_DST_MASK 17 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC -#define TCA_FLOWER_KEY_TCP_SRC 18 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK -#define TCA_FLOWER_KEY_TCP_SRC_MASK 35 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST -#define TCA_FLOWER_KEY_TCP_DST 19 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK -#define TCA_FLOWER_KEY_TCP_DST_MASK 36 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC -#define TCA_FLOWER_KEY_UDP_SRC 20 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK -#define TCA_FLOWER_KEY_UDP_SRC_MASK 37 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST -#define TCA_FLOWER_KEY_UDP_DST 21 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK -#define TCA_FLOWER_KEY_UDP_DST_MASK 38 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID -#define TCA_FLOWER_KEY_VLAN_ID 23 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO -#define TCA_FLOWER_KEY_VLAN_PRIO 24 -#endif -#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE -#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25 -#endif - -/** Parser state definitions for mlx5_nl_flow_trans[]. */ -enum mlx5_nl_flow_trans { - INVALID, - BACK, - ATTR, - PATTERN, - ITEM_VOID, - ITEM_PORT_ID, - ITEM_ETH, - ITEM_VLAN, - ITEM_IPV4, - ITEM_IPV6, - ITEM_TCP, - ITEM_UDP, - ACTIONS, - ACTION_VOID, - ACTION_PORT_ID, - ACTION_DROP, - ACTION_OF_POP_VLAN, - ACTION_OF_PUSH_VLAN, - ACTION_OF_SET_VLAN_VID, - ACTION_OF_SET_VLAN_PCP, - END, -}; - -#define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, } - -#define PATTERN_COMMON \ - ITEM_VOID, ITEM_PORT_ID, ACTIONS -#define ACTIONS_COMMON \ - ACTION_VOID, ACTION_OF_POP_VLAN, ACTION_OF_PUSH_VLAN, \ - ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP -#define ACTIONS_FATE \ - ACTION_PORT_ID, ACTION_DROP - -/** Parser state transitions used by mlx5_nl_flow_transpose(). 
*/ -static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = { - [INVALID] = NULL, - [BACK] = NULL, - [ATTR] = TRANS(PATTERN), - [PATTERN] = TRANS(ITEM_ETH, PATTERN_COMMON), - [ITEM_VOID] = TRANS(BACK), - [ITEM_PORT_ID] = TRANS(BACK), - [ITEM_ETH] = TRANS(ITEM_IPV4, ITEM_IPV6, ITEM_VLAN, PATTERN_COMMON), - [ITEM_VLAN] = TRANS(ITEM_IPV4, ITEM_IPV6, PATTERN_COMMON), - [ITEM_IPV4] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON), - [ITEM_IPV6] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON), - [ITEM_TCP] = TRANS(PATTERN_COMMON), - [ITEM_UDP] = TRANS(PATTERN_COMMON), - [ACTIONS] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), - [ACTION_VOID] = TRANS(BACK), - [ACTION_PORT_ID] = TRANS(ACTION_VOID, END), - [ACTION_DROP] = TRANS(ACTION_VOID, END), - [ACTION_OF_POP_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), - [ACTION_OF_PUSH_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), - [ACTION_OF_SET_VLAN_VID] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), - [ACTION_OF_SET_VLAN_PCP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON), - [END] = NULL, -}; - -/** Empty masks for known item types. */ -static const union { - struct rte_flow_item_port_id port_id; - struct rte_flow_item_eth eth; - struct rte_flow_item_vlan vlan; - struct rte_flow_item_ipv4 ipv4; - struct rte_flow_item_ipv6 ipv6; - struct rte_flow_item_tcp tcp; - struct rte_flow_item_udp udp; -} mlx5_nl_flow_mask_empty; - -/** Supported masks for known item types. */ -static const struct { - struct rte_flow_item_port_id port_id; - struct rte_flow_item_eth eth; - struct rte_flow_item_vlan vlan; - struct rte_flow_item_ipv4 ipv4; - struct rte_flow_item_ipv6 ipv6; - struct rte_flow_item_tcp tcp; - struct rte_flow_item_udp udp; -} mlx5_nl_flow_mask_supported = { - .port_id = { - .id = 0xffffffff, - }, - .eth = { - .type = RTE_BE16(0xffff), - .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", - .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", - }, - .vlan = { - /* PCP and VID only, no DEI. */ - .tci = RTE_BE16(0xefff), - .inner_type = RTE_BE16(0xffff), - }, - .ipv4.hdr = { - .next_proto_id = 0xff, - .src_addr = RTE_BE32(0xffffffff), - .dst_addr = RTE_BE32(0xffffffff), - }, - .ipv6.hdr = { - .proto = 0xff, - .src_addr = - "\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff", - .dst_addr = - "\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff", - }, - .tcp.hdr = { - .src_port = RTE_BE16(0xffff), - .dst_port = RTE_BE16(0xffff), - }, - .udp.hdr = { - .src_port = RTE_BE16(0xffff), - .dst_port = RTE_BE16(0xffff), - }, -}; - -/** - * Retrieve mask for pattern item. - * - * This function does basic sanity checks on a pattern item in order to - * return the most appropriate mask for it. - * - * @param[in] item - * Item specification. - * @param[in] mask_default - * Default mask for pattern item as specified by the flow API. - * @param[in] mask_supported - * Mask fields supported by the implementation. - * @param[in] mask_empty - * Empty mask to return when there is no specification. - * @param[out] error - * Perform verbose error reporting if not NULL. - * - * @return - * Either @p item->mask or one of the mask parameters on success, NULL - * otherwise and rte_errno is set. - */ -static const void * -mlx5_nl_flow_item_mask(const struct rte_flow_item *item, - const void *mask_default, - const void *mask_supported, - const void *mask_empty, - size_t mask_size, - struct rte_flow_error *error) -{ - const uint8_t *mask; - size_t i; - - /* item->last and item->mask cannot exist without item->spec. 
*/ - if (!item->spec && (item->mask || item->last)) { - rte_flow_error_set - (error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item, - "\"mask\" or \"last\" field provided without a" - " corresponding \"spec\""); - return NULL; - } - /* No spec, no mask, no problem. */ - if (!item->spec) - return mask_empty; - mask = item->mask ? item->mask : mask_default; - assert(mask); - /* - * Single-pass check to make sure that: - * - Mask is supported, no bits are set outside mask_supported. - * - Both item->spec and item->last are included in mask. - */ - for (i = 0; i != mask_size; ++i) { - if (!mask[i]) - continue; - if ((mask[i] | ((const uint8_t *)mask_supported)[i]) != - ((const uint8_t *)mask_supported)[i]) { - rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, - mask, "unsupported field found in \"mask\""); - return NULL; - } - if (item->last && - (((const uint8_t *)item->spec)[i] & mask[i]) != - (((const uint8_t *)item->last)[i] & mask[i])) { - rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_LAST, - item->last, - "range between \"spec\" and \"last\" not" - " comprised in \"mask\""); - return NULL; - } - } - return mask; -} - -/** - * Transpose flow rule description to rtnetlink message. - * - * This function transposes a flow rule description to a traffic control - * (TC) filter creation message ready to be sent over Netlink. - * - * Target interface is specified as the first entry of the @p ptoi table. - * Subsequent entries enable this function to resolve other DPDK port IDs - * found in the flow rule. - * - * @param[out] buf - * Output message buffer. May be NULL when @p size is 0. - * @param size - * Size of @p buf. Message may be truncated if not large enough. - * @param[in] ptoi - * DPDK port ID to network interface index translation table. This table - * is terminated by an entry with a zero ifindex value. - * @param[in] attr - * Flow rule attributes. - * @param[in] pattern - * Pattern specification. - * @param[in] actions - * Associated actions. - * @param[out] error - * Perform verbose error reporting if not NULL. - * - * @return - * A positive value representing the exact size of the message in bytes - * regardless of the @p size parameter on success, a negative errno value - * otherwise and rte_errno is set. 
- */ -int -mlx5_nl_flow_transpose(void *buf, - size_t size, - const struct mlx5_nl_flow_ptoi *ptoi, - const struct rte_flow_attr *attr, - const struct rte_flow_item *pattern, - const struct rte_flow_action *actions, - struct rte_flow_error *error) -{ - alignas(struct nlmsghdr) - uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)]; - const struct rte_flow_item *item; - const struct rte_flow_action *action; - unsigned int n; - uint32_t act_index_cur; - bool in_port_id_set; - bool eth_type_set; - bool vlan_present; - bool vlan_eth_type_set; - bool ip_proto_set; - struct nlattr *na_flower; - struct nlattr *na_flower_act; - struct nlattr *na_vlan_id; - struct nlattr *na_vlan_priority; - const enum mlx5_nl_flow_trans *trans; - const enum mlx5_nl_flow_trans *back; - - if (!size) - goto error_nobufs; -init: - item = pattern; - action = actions; - n = 0; - act_index_cur = 0; - in_port_id_set = false; - eth_type_set = false; - vlan_present = false; - vlan_eth_type_set = false; - ip_proto_set = false; - na_flower = NULL; - na_flower_act = NULL; - na_vlan_id = NULL; - na_vlan_priority = NULL; - trans = TRANS(ATTR); - back = trans; -trans: - switch (trans[n++]) { - union { - const struct rte_flow_item_port_id *port_id; - const struct rte_flow_item_eth *eth; - const struct rte_flow_item_vlan *vlan; - const struct rte_flow_item_ipv4 *ipv4; - const struct rte_flow_item_ipv6 *ipv6; - const struct rte_flow_item_tcp *tcp; - const struct rte_flow_item_udp *udp; - } spec, mask; - union { - const struct rte_flow_action_port_id *port_id; - const struct rte_flow_action_of_push_vlan *of_push_vlan; - const struct rte_flow_action_of_set_vlan_vid * - of_set_vlan_vid; - const struct rte_flow_action_of_set_vlan_pcp * - of_set_vlan_pcp; - } conf; - struct nlmsghdr *nlh; - struct tcmsg *tcm; - struct nlattr *act_index; - struct nlattr *act; - unsigned int i; - - case INVALID: - if (item->type) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, - item, "unsupported pattern item combination"); - else if (action->type) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, - action, "unsupported action combination"); - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "flow rule lacks some kind of fate action"); - case BACK: - trans = back; - n = 0; - goto trans; - case ATTR: - /* - * Supported attributes: no groups, some priorities and - * ingress only. Don't care about transfer as it is the - * caller's problem. - */ - if (attr->group) - return rte_flow_error_set - (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_GROUP, - attr, "groups are not supported"); - if (attr->priority > 0xfffe) - return rte_flow_error_set - (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, - attr, "lowest priority level is 0xfffe"); - if (!attr->ingress) - return rte_flow_error_set - (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, - attr, "only ingress is supported"); - if (attr->egress) - return rte_flow_error_set - (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, - attr, "egress is not supported"); - if (size < mnl_nlmsg_size(sizeof(*tcm))) - goto error_nobufs; - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = 0; - nlh->nlmsg_flags = 0; - nlh->nlmsg_seq = 0; - tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); - tcm->tcm_family = AF_UNSPEC; - tcm->tcm_ifindex = ptoi[0].ifindex; - /* - * Let kernel pick a handle by default. A predictable handle - * can be set by the caller on the resulting buffer through - * mlx5_nl_flow_brand(). 
- */ - tcm->tcm_handle = 0; - tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS); - /* - * Priority cannot be zero to prevent the kernel from - * picking one automatically. - */ - tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, - RTE_BE16(ETH_P_ALL)); - break; - case PATTERN: - if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower")) - goto error_nobufs; - na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS); - if (!na_flower) - goto error_nobufs; - if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS, - TCA_CLS_FLAGS_SKIP_SW)) - goto error_nobufs; - break; - case ITEM_VOID: - if (item->type != RTE_FLOW_ITEM_TYPE_VOID) - goto trans; - ++item; - break; - case ITEM_PORT_ID: - if (item->type != RTE_FLOW_ITEM_TYPE_PORT_ID) - goto trans; - mask.port_id = mlx5_nl_flow_item_mask - (item, &rte_flow_item_port_id_mask, - &mlx5_nl_flow_mask_supported.port_id, - &mlx5_nl_flow_mask_empty.port_id, - sizeof(mlx5_nl_flow_mask_supported.port_id), error); - if (!mask.port_id) - return -rte_errno; - if (mask.port_id == &mlx5_nl_flow_mask_empty.port_id) { - in_port_id_set = 1; - ++item; - break; - } - spec.port_id = item->spec; - if (mask.port_id->id && mask.port_id->id != 0xffffffff) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, - mask.port_id, - "no support for partial mask on" - " \"id\" field"); - if (!mask.port_id->id) - i = 0; - else - for (i = 0; ptoi[i].ifindex; ++i) - if (ptoi[i].port_id == spec.port_id->id) - break; - if (!ptoi[i].ifindex) - return rte_flow_error_set - (error, ENODEV, RTE_FLOW_ERROR_TYPE_ITEM_SPEC, - spec.port_id, - "missing data to convert port ID to ifindex"); - tcm = mnl_nlmsg_get_payload(buf); - if (in_port_id_set && - ptoi[i].ifindex != (unsigned int)tcm->tcm_ifindex) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_SPEC, - spec.port_id, - "cannot match traffic for several port IDs" - " through a single flow rule"); - tcm->tcm_ifindex = ptoi[i].ifindex; - in_port_id_set = 1; - ++item; - break; - case ITEM_ETH: - if (item->type != RTE_FLOW_ITEM_TYPE_ETH) - goto trans; - mask.eth = mlx5_nl_flow_item_mask - (item, &rte_flow_item_eth_mask, - &mlx5_nl_flow_mask_supported.eth, - &mlx5_nl_flow_mask_empty.eth, - sizeof(mlx5_nl_flow_mask_supported.eth), error); - if (!mask.eth) - return -rte_errno; - if (mask.eth == &mlx5_nl_flow_mask_empty.eth) { - ++item; - break; - } - spec.eth = item->spec; - if (mask.eth->type && mask.eth->type != RTE_BE16(0xffff)) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, - mask.eth, - "no support for partial mask on" - " \"type\" field"); - if (mask.eth->type) { - if (!mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_ETH_TYPE, - spec.eth->type)) - goto error_nobufs; - eth_type_set = 1; - } - if ((!is_zero_ether_addr(&mask.eth->dst) && - (!mnl_attr_put_check(buf, size, - TCA_FLOWER_KEY_ETH_DST, - ETHER_ADDR_LEN, - spec.eth->dst.addr_bytes) || - !mnl_attr_put_check(buf, size, - TCA_FLOWER_KEY_ETH_DST_MASK, - ETHER_ADDR_LEN, - mask.eth->dst.addr_bytes))) || - (!is_zero_ether_addr(&mask.eth->src) && - (!mnl_attr_put_check(buf, size, - TCA_FLOWER_KEY_ETH_SRC, - ETHER_ADDR_LEN, - spec.eth->src.addr_bytes) || - !mnl_attr_put_check(buf, size, - TCA_FLOWER_KEY_ETH_SRC_MASK, - ETHER_ADDR_LEN, - mask.eth->src.addr_bytes)))) - goto error_nobufs; - ++item; - break; - case ITEM_VLAN: - if (item->type != RTE_FLOW_ITEM_TYPE_VLAN) - goto trans; - mask.vlan = mlx5_nl_flow_item_mask - (item, &rte_flow_item_vlan_mask, - &mlx5_nl_flow_mask_supported.vlan, - 
&mlx5_nl_flow_mask_empty.vlan, - sizeof(mlx5_nl_flow_mask_supported.vlan), error); - if (!mask.vlan) - return -rte_errno; - if (!eth_type_set && - !mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_ETH_TYPE, - RTE_BE16(ETH_P_8021Q))) - goto error_nobufs; - eth_type_set = 1; - vlan_present = 1; - if (mask.vlan == &mlx5_nl_flow_mask_empty.vlan) { - ++item; - break; - } - spec.vlan = item->spec; - if ((mask.vlan->tci & RTE_BE16(0xe000) && - (mask.vlan->tci & RTE_BE16(0xe000)) != RTE_BE16(0xe000)) || - (mask.vlan->tci & RTE_BE16(0x0fff) && - (mask.vlan->tci & RTE_BE16(0x0fff)) != RTE_BE16(0x0fff)) || - (mask.vlan->inner_type && - mask.vlan->inner_type != RTE_BE16(0xffff))) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, - mask.vlan, - "no support for partial masks on" - " \"tci\" (PCP and VID parts) and" - " \"inner_type\" fields"); - if (mask.vlan->inner_type) { - if (!mnl_attr_put_u16_check - (buf, size, TCA_FLOWER_KEY_VLAN_ETH_TYPE, - spec.vlan->inner_type)) - goto error_nobufs; - vlan_eth_type_set = 1; - } - if ((mask.vlan->tci & RTE_BE16(0xe000) && - !mnl_attr_put_u8_check - (buf, size, TCA_FLOWER_KEY_VLAN_PRIO, - (rte_be_to_cpu_16(spec.vlan->tci) >> 13) & 0x7)) || - (mask.vlan->tci & RTE_BE16(0x0fff) && - !mnl_attr_put_u16_check - (buf, size, TCA_FLOWER_KEY_VLAN_ID, - rte_be_to_cpu_16(spec.vlan->tci & RTE_BE16(0x0fff))))) - goto error_nobufs; - ++item; - break; - case ITEM_IPV4: - if (item->type != RTE_FLOW_ITEM_TYPE_IPV4) - goto trans; - mask.ipv4 = mlx5_nl_flow_item_mask - (item, &rte_flow_item_ipv4_mask, - &mlx5_nl_flow_mask_supported.ipv4, - &mlx5_nl_flow_mask_empty.ipv4, - sizeof(mlx5_nl_flow_mask_supported.ipv4), error); - if (!mask.ipv4) - return -rte_errno; - if ((!eth_type_set || !vlan_eth_type_set) && - !mnl_attr_put_u16_check(buf, size, - vlan_present ? - TCA_FLOWER_KEY_VLAN_ETH_TYPE : - TCA_FLOWER_KEY_ETH_TYPE, - RTE_BE16(ETH_P_IP))) - goto error_nobufs; - eth_type_set = 1; - vlan_eth_type_set = 1; - if (mask.ipv4 == &mlx5_nl_flow_mask_empty.ipv4) { - ++item; - break; - } - spec.ipv4 = item->spec; - if (mask.ipv4->hdr.next_proto_id && - mask.ipv4->hdr.next_proto_id != 0xff) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, - mask.ipv4, - "no support for partial mask on" - " \"hdr.next_proto_id\" field"); - if (mask.ipv4->hdr.next_proto_id) { - if (!mnl_attr_put_u8_check - (buf, size, TCA_FLOWER_KEY_IP_PROTO, - spec.ipv4->hdr.next_proto_id)) - goto error_nobufs; - ip_proto_set = 1; - } - if ((mask.ipv4->hdr.src_addr && - (!mnl_attr_put_u32_check(buf, size, - TCA_FLOWER_KEY_IPV4_SRC, - spec.ipv4->hdr.src_addr) || - !mnl_attr_put_u32_check(buf, size, - TCA_FLOWER_KEY_IPV4_SRC_MASK, - mask.ipv4->hdr.src_addr))) || - (mask.ipv4->hdr.dst_addr && - (!mnl_attr_put_u32_check(buf, size, - TCA_FLOWER_KEY_IPV4_DST, - spec.ipv4->hdr.dst_addr) || - !mnl_attr_put_u32_check(buf, size, - TCA_FLOWER_KEY_IPV4_DST_MASK, - mask.ipv4->hdr.dst_addr)))) - goto error_nobufs; - ++item; - break; - case ITEM_IPV6: - if (item->type != RTE_FLOW_ITEM_TYPE_IPV6) - goto trans; - mask.ipv6 = mlx5_nl_flow_item_mask - (item, &rte_flow_item_ipv6_mask, - &mlx5_nl_flow_mask_supported.ipv6, - &mlx5_nl_flow_mask_empty.ipv6, - sizeof(mlx5_nl_flow_mask_supported.ipv6), error); - if (!mask.ipv6) - return -rte_errno; - if ((!eth_type_set || !vlan_eth_type_set) && - !mnl_attr_put_u16_check(buf, size, - vlan_present ? 
- TCA_FLOWER_KEY_VLAN_ETH_TYPE : - TCA_FLOWER_KEY_ETH_TYPE, - RTE_BE16(ETH_P_IPV6))) - goto error_nobufs; - eth_type_set = 1; - vlan_eth_type_set = 1; - if (mask.ipv6 == &mlx5_nl_flow_mask_empty.ipv6) { - ++item; - break; - } - spec.ipv6 = item->spec; - if (mask.ipv6->hdr.proto && mask.ipv6->hdr.proto != 0xff) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, - mask.ipv6, - "no support for partial mask on" - " \"hdr.proto\" field"); - if (mask.ipv6->hdr.proto) { - if (!mnl_attr_put_u8_check - (buf, size, TCA_FLOWER_KEY_IP_PROTO, - spec.ipv6->hdr.proto)) - goto error_nobufs; - ip_proto_set = 1; - } - if ((!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr) && - (!mnl_attr_put_check(buf, size, - TCA_FLOWER_KEY_IPV6_SRC, - sizeof(spec.ipv6->hdr.src_addr), - spec.ipv6->hdr.src_addr) || - !mnl_attr_put_check(buf, size, - TCA_FLOWER_KEY_IPV6_SRC_MASK, - sizeof(mask.ipv6->hdr.src_addr), - mask.ipv6->hdr.src_addr))) || - (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr) && - (!mnl_attr_put_check(buf, size, - TCA_FLOWER_KEY_IPV6_DST, - sizeof(spec.ipv6->hdr.dst_addr), - spec.ipv6->hdr.dst_addr) || - !mnl_attr_put_check(buf, size, - TCA_FLOWER_KEY_IPV6_DST_MASK, - sizeof(mask.ipv6->hdr.dst_addr), - mask.ipv6->hdr.dst_addr)))) - goto error_nobufs; - ++item; - break; - case ITEM_TCP: - if (item->type != RTE_FLOW_ITEM_TYPE_TCP) - goto trans; - mask.tcp = mlx5_nl_flow_item_mask - (item, &rte_flow_item_tcp_mask, - &mlx5_nl_flow_mask_supported.tcp, - &mlx5_nl_flow_mask_empty.tcp, - sizeof(mlx5_nl_flow_mask_supported.tcp), error); - if (!mask.tcp) - return -rte_errno; - if (!ip_proto_set && - !mnl_attr_put_u8_check(buf, size, - TCA_FLOWER_KEY_IP_PROTO, - IPPROTO_TCP)) - goto error_nobufs; - if (mask.tcp == &mlx5_nl_flow_mask_empty.tcp) { - ++item; - break; - } - spec.tcp = item->spec; - if ((mask.tcp->hdr.src_port && - mask.tcp->hdr.src_port != RTE_BE16(0xffff)) || - (mask.tcp->hdr.dst_port && - mask.tcp->hdr.dst_port != RTE_BE16(0xffff))) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, - mask.tcp, - "no support for partial masks on" - " \"hdr.src_port\" and \"hdr.dst_port\"" - " fields"); - if ((mask.tcp->hdr.src_port && - (!mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_TCP_SRC, - spec.tcp->hdr.src_port) || - !mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_TCP_SRC_MASK, - mask.tcp->hdr.src_port))) || - (mask.tcp->hdr.dst_port && - (!mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_TCP_DST, - spec.tcp->hdr.dst_port) || - !mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_TCP_DST_MASK, - mask.tcp->hdr.dst_port)))) - goto error_nobufs; - ++item; - break; - case ITEM_UDP: - if (item->type != RTE_FLOW_ITEM_TYPE_UDP) - goto trans; - mask.udp = mlx5_nl_flow_item_mask - (item, &rte_flow_item_udp_mask, - &mlx5_nl_flow_mask_supported.udp, - &mlx5_nl_flow_mask_empty.udp, - sizeof(mlx5_nl_flow_mask_supported.udp), error); - if (!mask.udp) - return -rte_errno; - if (!ip_proto_set && - !mnl_attr_put_u8_check(buf, size, - TCA_FLOWER_KEY_IP_PROTO, - IPPROTO_UDP)) - goto error_nobufs; - if (mask.udp == &mlx5_nl_flow_mask_empty.udp) { - ++item; - break; - } - spec.udp = item->spec; - if ((mask.udp->hdr.src_port && - mask.udp->hdr.src_port != RTE_BE16(0xffff)) || - (mask.udp->hdr.dst_port && - mask.udp->hdr.dst_port != RTE_BE16(0xffff))) - return rte_flow_error_set - (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK, - mask.udp, - "no support for partial masks on" - " \"hdr.src_port\" and \"hdr.dst_port\"" - " fields"); - if ((mask.udp->hdr.src_port && - 
(!mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_UDP_SRC, - spec.udp->hdr.src_port) || - !mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_UDP_SRC_MASK, - mask.udp->hdr.src_port))) || - (mask.udp->hdr.dst_port && - (!mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_UDP_DST, - spec.udp->hdr.dst_port) || - !mnl_attr_put_u16_check(buf, size, - TCA_FLOWER_KEY_UDP_DST_MASK, - mask.udp->hdr.dst_port)))) - goto error_nobufs; - ++item; - break; - case ACTIONS: - if (item->type != RTE_FLOW_ITEM_TYPE_END) - goto trans; - assert(na_flower); - assert(!na_flower_act); - na_flower_act = - mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT); - if (!na_flower_act) - goto error_nobufs; - act_index_cur = 1; - break; - case ACTION_VOID: - if (action->type != RTE_FLOW_ACTION_TYPE_VOID) - goto trans; - ++action; - break; - case ACTION_PORT_ID: - if (action->type != RTE_FLOW_ACTION_TYPE_PORT_ID) - goto trans; - conf.port_id = action->conf; - if (conf.port_id->original) - i = 0; - else - for (i = 0; ptoi[i].ifindex; ++i) - if (ptoi[i].port_id == conf.port_id->id) - break; - if (!ptoi[i].ifindex) - return rte_flow_error_set - (error, ENODEV, RTE_FLOW_ERROR_TYPE_ACTION_CONF, - conf.port_id, - "missing data to convert port ID to ifindex"); - act_index = - mnl_attr_nest_start_check(buf, size, act_index_cur++); - if (!act_index || - !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "mirred")) - goto error_nobufs; - act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS); - if (!act) - goto error_nobufs; - if (!mnl_attr_put_check(buf, size, TCA_MIRRED_PARMS, - sizeof(struct tc_mirred), - &(struct tc_mirred){ - .action = TC_ACT_STOLEN, - .eaction = TCA_EGRESS_REDIR, - .ifindex = ptoi[i].ifindex, - })) - goto error_nobufs; - mnl_attr_nest_end(buf, act); - mnl_attr_nest_end(buf, act_index); - ++action; - break; - case ACTION_DROP: - if (action->type != RTE_FLOW_ACTION_TYPE_DROP) - goto trans; - act_index = - mnl_attr_nest_start_check(buf, size, act_index_cur++); - if (!act_index || - !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "gact")) - goto error_nobufs; - act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS); - if (!act) - goto error_nobufs; - if (!mnl_attr_put_check(buf, size, TCA_GACT_PARMS, - sizeof(struct tc_gact), - &(struct tc_gact){ - .action = TC_ACT_SHOT, - })) - goto error_nobufs; - mnl_attr_nest_end(buf, act); - mnl_attr_nest_end(buf, act_index); - ++action; - break; - case ACTION_OF_POP_VLAN: - if (action->type != RTE_FLOW_ACTION_TYPE_OF_POP_VLAN) - goto trans; - conf.of_push_vlan = NULL; - i = TCA_VLAN_ACT_POP; - goto action_of_vlan; - case ACTION_OF_PUSH_VLAN: - if (action->type != RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN) - goto trans; - conf.of_push_vlan = action->conf; - i = TCA_VLAN_ACT_PUSH; - goto action_of_vlan; - case ACTION_OF_SET_VLAN_VID: - if (action->type != RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) - goto trans; - conf.of_set_vlan_vid = action->conf; - if (na_vlan_id) - goto override_na_vlan_id; - i = TCA_VLAN_ACT_MODIFY; - goto action_of_vlan; - case ACTION_OF_SET_VLAN_PCP: - if (action->type != RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) - goto trans; - conf.of_set_vlan_pcp = action->conf; - if (na_vlan_priority) - goto override_na_vlan_priority; - i = TCA_VLAN_ACT_MODIFY; - goto action_of_vlan; -action_of_vlan: - act_index = - mnl_attr_nest_start_check(buf, size, act_index_cur++); - if (!act_index || - !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "vlan")) - goto error_nobufs; - act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS); - if (!act) - goto error_nobufs; - if 
(!mnl_attr_put_check(buf, size, TCA_VLAN_PARMS, - sizeof(struct tc_vlan), - &(struct tc_vlan){ - .action = TC_ACT_PIPE, - .v_action = i, - })) - goto error_nobufs; - if (i == TCA_VLAN_ACT_POP) { - mnl_attr_nest_end(buf, act); - mnl_attr_nest_end(buf, act_index); - ++action; - break; - } - if (i == TCA_VLAN_ACT_PUSH && - !mnl_attr_put_u16_check(buf, size, - TCA_VLAN_PUSH_VLAN_PROTOCOL, - conf.of_push_vlan->ethertype)) - goto error_nobufs; - na_vlan_id = mnl_nlmsg_get_payload_tail(buf); - if (!mnl_attr_put_u16_check(buf, size, TCA_VLAN_PAD, 0)) - goto error_nobufs; - na_vlan_priority = mnl_nlmsg_get_payload_tail(buf); - if (!mnl_attr_put_u8_check(buf, size, TCA_VLAN_PAD, 0)) - goto error_nobufs; - mnl_attr_nest_end(buf, act); - mnl_attr_nest_end(buf, act_index); - if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) { -override_na_vlan_id: - na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID; - *(uint16_t *)mnl_attr_get_payload(na_vlan_id) = - rte_be_to_cpu_16 - (conf.of_set_vlan_vid->vlan_vid); - } else if (action->type == - RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) { -override_na_vlan_priority: - na_vlan_priority->nla_type = - TCA_VLAN_PUSH_VLAN_PRIORITY; - *(uint8_t *)mnl_attr_get_payload(na_vlan_priority) = - conf.of_set_vlan_pcp->vlan_pcp; - } - ++action; - break; - case END: - if (item->type != RTE_FLOW_ITEM_TYPE_END || - action->type != RTE_FLOW_ACTION_TYPE_END) - goto trans; - if (na_flower_act) - mnl_attr_nest_end(buf, na_flower_act); - if (na_flower) - mnl_attr_nest_end(buf, na_flower); - nlh = buf; - return nlh->nlmsg_len; - } - back = trans; - trans = mlx5_nl_flow_trans[trans[n - 1]]; - n = 0; - goto trans; -error_nobufs: - if (buf != buf_tmp) { - buf = buf_tmp; - size = sizeof(buf_tmp); - goto init; - } - return rte_flow_error_set - (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "generated TC message is too large"); -} - -/** - * Brand rtnetlink buffer with unique handle. - * - * This handle should be unique for a given network interface to avoid - * collisions. - * - * @param buf - * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). - * @param handle - * Unique 32-bit handle to use. - */ -void -mlx5_nl_flow_brand(void *buf, uint32_t handle) -{ - struct tcmsg *tcm = mnl_nlmsg_get_payload(buf); - - tcm->tcm_handle = handle; -} - -/** - * Send Netlink message with acknowledgment. - * - * @param nl - * Libmnl socket to use. - * @param nlh - * Message to send. This function always raises the NLM_F_ACK flag before - * sending. - * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. - */ -static int -mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh) -{ - alignas(struct nlmsghdr) - uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) + - nlh->nlmsg_len - sizeof(*nlh)]; - uint32_t seq = random(); - int ret; - - nlh->nlmsg_flags |= NLM_F_ACK; - nlh->nlmsg_seq = seq; - ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len); - if (ret != -1) - ret = mnl_socket_recvfrom(nl, ans, sizeof(ans)); - if (ret != -1) - ret = mnl_cb_run - (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL); - if (!ret) - return 0; - rte_errno = errno; - return -rte_errno; -} - -/** - * Create a Netlink flow rule. - * - * @param nl - * Libmnl socket to use. - * @param buf - * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). - * @param[out] error - * Perform verbose error reporting if not NULL. - * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. 
- */ -int -mlx5_nl_flow_create(struct mnl_socket *nl, void *buf, - struct rte_flow_error *error) -{ - struct nlmsghdr *nlh = buf; - - nlh->nlmsg_type = RTM_NEWTFILTER; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; - if (!mlx5_nl_flow_nl_ack(nl, nlh)) - return 0; - return rte_flow_error_set - (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "netlink: failed to create TC flow rule"); -} - -/** - * Destroy a Netlink flow rule. - * - * @param nl - * Libmnl socket to use. - * @param buf - * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). - * @param[out] error - * Perform verbose error reporting if not NULL. - * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. - */ -int -mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf, - struct rte_flow_error *error) -{ - struct nlmsghdr *nlh = buf; - - nlh->nlmsg_type = RTM_DELTFILTER; - nlh->nlmsg_flags = NLM_F_REQUEST; - if (!mlx5_nl_flow_nl_ack(nl, nlh)) - return 0; - return rte_flow_error_set - (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "netlink: failed to destroy TC flow rule"); -} - -/** - * Initialize ingress qdisc of a given network interface. - * - * @param nl - * Libmnl socket of the @p NETLINK_ROUTE kind. - * @param ifindex - * Index of network interface to initialize. - * @param[out] error - * Perform verbose error reporting if not NULL. - * - * @return - * 0 on success, a negative errno value otherwise and rte_errno is set. - */ -int -mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex, - struct rte_flow_error *error) -{ - struct nlmsghdr *nlh; - struct tcmsg *tcm; - alignas(struct nlmsghdr) - uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)]; - - /* Destroy existing ingress qdisc and everything attached to it. */ - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = RTM_DELQDISC; - nlh->nlmsg_flags = NLM_F_REQUEST; - tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); - tcm->tcm_family = AF_UNSPEC; - tcm->tcm_ifindex = ifindex; - tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); - tcm->tcm_parent = TC_H_INGRESS; - /* Ignore errors when qdisc is already absent. */ - if (mlx5_nl_flow_nl_ack(nl, nlh) && - rte_errno != EINVAL && rte_errno != ENOENT) - return rte_flow_error_set - (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "netlink: failed to remove ingress qdisc"); - /* Create fresh ingress qdisc. */ - nlh = mnl_nlmsg_put_header(buf); - nlh->nlmsg_type = RTM_NEWQDISC; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; - tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); - tcm->tcm_family = AF_UNSPEC; - tcm->tcm_ifindex = ifindex; - tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); - tcm->tcm_parent = TC_H_INGRESS; - mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress"); - if (mlx5_nl_flow_nl_ack(nl, nlh)) - return rte_flow_error_set - (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "netlink: failed to create ingress qdisc"); - return 0; -} - -/** - * Create and configure a libmnl socket for Netlink flow rules. - * - * @return - * A valid libmnl socket object pointer on success, NULL otherwise and - * rte_errno is set. 
- */ -struct mnl_socket * -mlx5_nl_flow_socket_create(void) -{ - struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE); - - if (nl) { - mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 }, - sizeof(int)); - if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID)) - return nl; - } - rte_errno = errno; - if (nl) - mnl_socket_close(nl); - return NULL; -} - -/** - * Destroy a libmnl socket. - */ -void -mlx5_nl_flow_socket_destroy(struct mnl_socket *nl) -{ - mnl_socket_close(nl); -} diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h index 0870d32f..29742b13 100644 --- a/drivers/net/mlx5/mlx5_prm.h +++ b/drivers/net/mlx5/mlx5_prm.h @@ -159,7 +159,7 @@ struct mlx5_wqe_eth_seg_small { uint8_t cs_flags; uint8_t rsvd1; uint16_t mss; - uint32_t rsvd2; + uint32_t flow_table_metadata; uint16_t inline_hdr_sz; uint8_t inline_hdr[2]; } __rte_aligned(MLX5_WQE_DWORD_SIZE); @@ -280,6 +280,226 @@ struct mlx5_cqe { /* CQE format value. */ #define MLX5_COMPRESSED 0x3 +/* The field of packet to be modified. */ +enum mlx5_modificaiton_field { + MLX5_MODI_OUT_SMAC_47_16 = 1, + MLX5_MODI_OUT_SMAC_15_0, + MLX5_MODI_OUT_ETHERTYPE, + MLX5_MODI_OUT_DMAC_47_16, + MLX5_MODI_OUT_DMAC_15_0, + MLX5_MODI_OUT_IP_DSCP, + MLX5_MODI_OUT_TCP_FLAGS, + MLX5_MODI_OUT_TCP_SPORT, + MLX5_MODI_OUT_TCP_DPORT, + MLX5_MODI_OUT_IPV4_TTL, + MLX5_MODI_OUT_UDP_SPORT, + MLX5_MODI_OUT_UDP_DPORT, + MLX5_MODI_OUT_SIPV6_127_96, + MLX5_MODI_OUT_SIPV6_95_64, + MLX5_MODI_OUT_SIPV6_63_32, + MLX5_MODI_OUT_SIPV6_31_0, + MLX5_MODI_OUT_DIPV6_127_96, + MLX5_MODI_OUT_DIPV6_95_64, + MLX5_MODI_OUT_DIPV6_63_32, + MLX5_MODI_OUT_DIPV6_31_0, + MLX5_MODI_OUT_SIPV4, + MLX5_MODI_OUT_DIPV4, + MLX5_MODI_IN_SMAC_47_16 = 0x31, + MLX5_MODI_IN_SMAC_15_0, + MLX5_MODI_IN_ETHERTYPE, + MLX5_MODI_IN_DMAC_47_16, + MLX5_MODI_IN_DMAC_15_0, + MLX5_MODI_IN_IP_DSCP, + MLX5_MODI_IN_TCP_FLAGS, + MLX5_MODI_IN_TCP_SPORT, + MLX5_MODI_IN_TCP_DPORT, + MLX5_MODI_IN_IPV4_TTL, + MLX5_MODI_IN_UDP_SPORT, + MLX5_MODI_IN_UDP_DPORT, + MLX5_MODI_IN_SIPV6_127_96, + MLX5_MODI_IN_SIPV6_95_64, + MLX5_MODI_IN_SIPV6_63_32, + MLX5_MODI_IN_SIPV6_31_0, + MLX5_MODI_IN_DIPV6_127_96, + MLX5_MODI_IN_DIPV6_95_64, + MLX5_MODI_IN_DIPV6_63_32, + MLX5_MODI_IN_DIPV6_31_0, + MLX5_MODI_IN_SIPV4, + MLX5_MODI_IN_DIPV4, + MLX5_MODI_OUT_IPV6_HOPLIMIT, + MLX5_MODI_IN_IPV6_HOPLIMIT, + MLX5_MODI_META_DATA_REG_A, + MLX5_MODI_META_DATA_REG_B = 0x50, +}; + +/* Modification sub command. */ +struct mlx5_modification_cmd { + union { + uint32_t data0; + struct { + unsigned int bits:5; + unsigned int rsvd0:3; + unsigned int src_offset:5; /* Start bit offset. 
*/ + unsigned int rsvd1:3; + unsigned int src_field:12; + unsigned int type:4; + }; + }; + union { + uint32_t data1; + uint8_t data[4]; + struct { + unsigned int rsvd2:8; + unsigned int dst_offset:8; + unsigned int dst_field:12; + unsigned int rsvd3:4; + }; + }; +}; + +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; + +#define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) +#define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) +#define __mlx5_bit_off(typ, fld) ((unsigned int)(unsigned long) \ + (&(__mlx5_nullp(typ)->fld))) +#define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - \ + (__mlx5_bit_off(typ, fld) & 0x1f)) +#define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) +#define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << \ + __mlx5_dw_bit_off(typ, fld)) +#define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_16_off(typ, fld) (__mlx5_bit_off(typ, fld) / 16) +#define __mlx5_16_bit_off(typ, fld) (16 - __mlx5_bit_sz(typ, fld) - \ + (__mlx5_bit_off(typ, fld) & 0xf)) +#define __mlx5_mask16(typ, fld) ((u16)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32) +#define MLX5_ST_SZ_DB(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8) +#define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8) +#define MLX5_ADDR_OF(typ, p, fld) ((char *)(p) + MLX5_BYTE_OFF(typ, fld)) + +/* insert a value to a struct */ +#define MLX5_SET(typ, p, fld, v) \ + do { \ + u32 _v = v; \ + *((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \ + rte_cpu_to_be_32((rte_be_to_cpu_32(*((u32 *)(p) + \ + __mlx5_dw_off(typ, fld))) & \ + (~__mlx5_dw_mask(typ, fld))) | \ + (((_v) & __mlx5_mask(typ, fld)) << \ + __mlx5_dw_bit_off(typ, fld))); \ + } while (0) +#define MLX5_GET16(typ, p, fld) \ + ((rte_be_to_cpu_16(*((__be16 *)(p) + \ + __mlx5_16_off(typ, fld))) >> __mlx5_16_bit_off(typ, fld)) & \ + __mlx5_mask16(typ, fld)) +#define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) + +struct mlx5_ifc_fte_match_set_misc_bits { + u8 reserved_at_0[0x8]; + u8 source_sqn[0x18]; + u8 reserved_at_20[0x10]; + u8 source_port[0x10]; + u8 outer_second_prio[0x3]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0xc]; + u8 inner_second_prio[0x3]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0xc]; + u8 outer_second_cvlan_tag[0x1]; + u8 inner_second_cvlan_tag[0x1]; + u8 outer_second_svlan_tag[0x1]; + u8 inner_second_svlan_tag[0x1]; + u8 reserved_at_64[0xc]; + u8 gre_protocol[0x10]; + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + u8 vxlan_vni[0x18]; + u8 reserved_at_b8[0x8]; + u8 reserved_at_c0[0x20]; + u8 reserved_at_e0[0xc]; + u8 outer_ipv6_flow_label[0x14]; + u8 reserved_at_100[0xc]; + u8 inner_ipv6_flow_label[0x14]; + u8 reserved_at_120[0xe0]; +}; + +struct mlx5_ifc_ipv4_layout_bits { + u8 reserved_at_0[0x60]; + u8 ipv4[0x20]; +}; + +struct mlx5_ifc_ipv6_layout_bits { + u8 ipv6[16][0x8]; +}; + +union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_layout_bits ipv6_layout; + struct mlx5_ifc_ipv4_layout_bits ipv4_layout; + u8 reserved_at_0[0x80]; +}; + +struct mlx5_ifc_fte_match_set_lyr_2_4_bits { + u8 smac_47_16[0x20]; + u8 smac_15_0[0x10]; + u8 ethertype[0x10]; + u8 dmac_47_16[0x20]; + u8 dmac_15_0[0x10]; + u8 first_prio[0x3]; + u8 first_cfi[0x1]; + u8 first_vid[0xc]; + u8 ip_protocol[0x8]; + u8 ip_dscp[0x6]; + u8 ip_ecn[0x2]; + u8 cvlan_tag[0x1]; + u8 svlan_tag[0x1]; + u8 frag[0x1]; + u8 ip_version[0x4]; + u8 tcp_flags[0x9]; + u8 tcp_sport[0x10]; + u8 
tcp_dport[0x10]; + u8 reserved_at_c0[0x20]; + u8 udp_sport[0x10]; + u8 udp_dport[0x10]; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6; +}; + +struct mlx5_ifc_fte_match_mpls_bits { + u8 mpls_label[0x14]; + u8 mpls_exp[0x3]; + u8 mpls_s_bos[0x1]; + u8 mpls_ttl[0x8]; +}; + +struct mlx5_ifc_fte_match_set_misc2_bits { + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls; + struct mlx5_ifc_fte_match_mpls_bits inner_first_mpls; + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls_over_gre; + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls_over_udp; + u8 reserved_at_80[0x100]; + u8 metadata_reg_a[0x20]; + u8 reserved_at_1a0[0x60]; +}; + +/* Flow matcher. */ +struct mlx5_ifc_fte_match_param_bits { + struct mlx5_ifc_fte_match_set_lyr_2_4_bits outer_headers; + struct mlx5_ifc_fte_match_set_misc_bits misc_parameters; + struct mlx5_ifc_fte_match_set_lyr_2_4_bits inner_headers; + struct mlx5_ifc_fte_match_set_misc2_bits misc_parameters_2; + u8 reserved_at_800[0x800]; +}; + +enum { + MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT, + MLX5_MATCH_CRITERIA_ENABLE_MISC_BIT, + MLX5_MATCH_CRITERIA_ENABLE_INNER_BIT, + MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT +}; + /* CQE format mask. */ #define MLX5E_CQE_FORMAT_MASK 0xc diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index 1f7bfd44..ed993ea6 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -388,7 +388,6 @@ mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev) DEV_RX_OFFLOAD_TIMESTAMP | DEV_RX_OFFLOAD_JUMBO_FRAME); - offloads |= DEV_RX_OFFLOAD_CRC_STRIP; if (config->hw_fcs_strip) offloads |= DEV_RX_OFFLOAD_KEEP_CRC; @@ -1438,7 +1437,7 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP); /* By default, FCS (CRC) is stripped by hardware. */ tmpl->rxq.crc_present = 0; - if (rte_eth_dev_must_keep_crc(offloads)) { + if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) { if (config->hw_fcs_strip) { tmpl->rxq.crc_present = 1; } else { diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 2d14f8a6..24a054d5 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -523,6 +523,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG); uint32_t swp_offsets = 0; uint8_t swp_types = 0; + rte_be32_t metadata; uint16_t tso_segsz = 0; #ifdef MLX5_PMD_SOFT_COUNTERS uint32_t total_length = 0; @@ -566,6 +567,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) cs_flags = txq_ol_cksum_to_cs(buf); txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types); raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE; + /* Copy metadata from mbuf if valid */ + metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata : + 0; /* Replace the Ethernet type by the VLAN if necessary. 
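Stepping back to the MLX5_SET()/MLX5_ADDR_OF() helpers introduced in mlx5_prm.h above: they address a field of an mlx5_ifc_*_bits layout by its bit offset inside big-endian dwords, which is how the new Direct Verbs flow path is expected to fill matcher buffers. A minimal illustration follows (not part of the patch; the values are arbitrary):

/* Illustrative sketch, not part of this commit. */
uint32_t match_buf[MLX5_ST_SZ_DW(fte_match_param)] = { 0 };
void *headers = MLX5_ADDR_OF(fte_match_param, match_buf, outer_headers);

/* ip_version is a 4-bit field and ip_protocol an 8-bit field of the
 * fte_match_set_lyr_2_4 layout; MLX5_SET() masks and shifts the value
 * into the right big-endian dword. */
MLX5_SET(fte_match_set_lyr_2_4, headers, ip_version, 4);
MLX5_SET(fte_match_set_lyr_2_4, headers, ip_protocol, 6 /* TCP */);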
*/ if (buf->ol_flags & PKT_TX_VLAN_PKT) { uint32_t vlan = rte_cpu_to_be_32(0x81000000 | @@ -781,7 +785,7 @@ next_pkt: swp_offsets, cs_flags | (swp_types << 8) | (rte_cpu_to_be_16(tso_segsz) << 16), - 0, + metadata, (ehdr << 16) | rte_cpu_to_be_16(tso_header_sz), }; } else { @@ -795,7 +799,7 @@ next_pkt: wqe->eseg = (rte_v128u32_t){ swp_offsets, cs_flags | (swp_types << 8), - 0, + metadata, (ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz), }; } @@ -861,7 +865,7 @@ mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length) mpw->wqe->eseg.inline_hdr_sz = 0; mpw->wqe->eseg.rsvd0 = 0; mpw->wqe->eseg.rsvd1 = 0; - mpw->wqe->eseg.rsvd2 = 0; + mpw->wqe->eseg.flow_table_metadata = 0; mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) | (txq->wqe_ci << 8) | MLX5_OPCODE_TSO); @@ -948,6 +952,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) uint32_t length; unsigned int segs_n = buf->nb_segs; uint32_t cs_flags; + rte_be32_t metadata; /* * Make sure there is enough room to store this packet and @@ -964,6 +969,9 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) max_elts -= segs_n; --pkts_n; cs_flags = txq_ol_cksum_to_cs(buf); + /* Copy metadata from mbuf if valid */ + metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata : + 0; /* Retrieve packet information. */ length = PKT_LEN(buf); assert(length); @@ -971,6 +979,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if ((mpw.state == MLX5_MPW_STATE_OPENED) && ((mpw.len != length) || (segs_n != 1) || + (mpw.wqe->eseg.flow_table_metadata != metadata) || (mpw.wqe->eseg.cs_flags != cs_flags))) mlx5_mpw_close(txq, &mpw); if (mpw.state == MLX5_MPW_STATE_CLOSED) { @@ -984,6 +993,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) max_wqe -= 2; mlx5_mpw_new(txq, &mpw, length); mpw.wqe->eseg.cs_flags = cs_flags; + mpw.wqe->eseg.flow_table_metadata = metadata; } /* Multi-segment packets must be alone in their MPW. */ assert((segs_n == 1) || (mpw.pkts_n == 0)); @@ -1082,7 +1092,7 @@ mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, mpw->wqe->eseg.cs_flags = 0; mpw->wqe->eseg.rsvd0 = 0; mpw->wqe->eseg.rsvd1 = 0; - mpw->wqe->eseg.rsvd2 = 0; + mpw->wqe->eseg.flow_table_metadata = 0; inl = (struct mlx5_wqe_inl_small *) (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE); mpw->data.raw = (uint8_t *)&inl->raw; @@ -1172,6 +1182,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint32_t length; unsigned int segs_n = buf->nb_segs; uint8_t cs_flags; + rte_be32_t metadata; /* * Make sure there is enough room to store this packet and @@ -1193,18 +1204,23 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, */ max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); cs_flags = txq_ol_cksum_to_cs(buf); + /* Copy metadata from mbuf if valid */ + metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata : + 0; /* Retrieve packet information. */ length = PKT_LEN(buf); /* Start new session if packet differs. 
*/ if (mpw.state == MLX5_MPW_STATE_OPENED) { if ((mpw.len != length) || (segs_n != 1) || + (mpw.wqe->eseg.flow_table_metadata != metadata) || (mpw.wqe->eseg.cs_flags != cs_flags)) mlx5_mpw_close(txq, &mpw); } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) { if ((mpw.len != length) || (segs_n != 1) || (length > inline_room) || + (mpw.wqe->eseg.flow_table_metadata != metadata) || (mpw.wqe->eseg.cs_flags != cs_flags)) { mlx5_mpw_inline_close(txq, &mpw); inline_room = @@ -1224,12 +1240,14 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, max_wqe -= 2; mlx5_mpw_new(txq, &mpw, length); mpw.wqe->eseg.cs_flags = cs_flags; + mpw.wqe->eseg.flow_table_metadata = metadata; } else { if (unlikely(max_wqe < wqe_inl_n)) break; max_wqe -= wqe_inl_n; mlx5_mpw_inline_new(txq, &mpw, length); mpw.wqe->eseg.cs_flags = cs_flags; + mpw.wqe->eseg.flow_table_metadata = metadata; } } /* Multi-segment packets must be alone in their MPW. */ @@ -1461,6 +1479,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, unsigned int do_inline = 0; /* Whether inline is possible. */ uint32_t length; uint8_t cs_flags; + rte_be32_t metadata; /* Multi-segmented packet is handled in slow-path outside. */ assert(NB_SEGS(buf) == 1); @@ -1468,6 +1487,9 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, if (max_elts - j == 0) break; cs_flags = txq_ol_cksum_to_cs(buf); + /* Copy metadata from mbuf if valid */ + metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata : + 0; /* Retrieve packet information. */ length = PKT_LEN(buf); /* Start new session if: @@ -1482,6 +1504,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, (length <= txq->inline_max_packet_sz && inl_pad + sizeof(inl_hdr) + length > mpw_room) || + (mpw.wqe->eseg.flow_table_metadata != metadata) || (mpw.wqe->eseg.cs_flags != cs_flags)) max_wqe -= mlx5_empw_close(txq, &mpw); } @@ -1505,6 +1528,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, sizeof(inl_hdr) + length <= mpw_room && !txq->mpw_hdr_dseg; mpw.wqe->eseg.cs_flags = cs_flags; + mpw.wqe->eseg.flow_table_metadata = metadata; } else { /* Evaluate whether the next packet can be inlined. * Inlininig is possible when: @@ -2097,7 +2121,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) const unsigned int wq_mask = (1 << rxq->elts_n) - 1; volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; unsigned int i = 0; - uint16_t rq_ci = rxq->rq_ci; + uint32_t rq_ci = rxq->rq_ci; uint16_t consumed_strd = rxq->consumed_strd; struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; @@ -2324,7 +2348,7 @@ removed_rx_burst(void *dpdk_txq __rte_unused, * (e.g. mlx5_rxtx_vec_sse.c for x86). 
*/ -uint16_t __attribute__((weak)) +__rte_weak uint16_t mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused, struct rte_mbuf **pkts __rte_unused, uint16_t pkts_n __rte_unused) @@ -2332,7 +2356,7 @@ mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused, return 0; } -uint16_t __attribute__((weak)) +__rte_weak uint16_t mlx5_tx_burst_vec(void *dpdk_txq __rte_unused, struct rte_mbuf **pkts __rte_unused, uint16_t pkts_n __rte_unused) @@ -2340,7 +2364,7 @@ mlx5_tx_burst_vec(void *dpdk_txq __rte_unused, return 0; } -uint16_t __attribute__((weak)) +__rte_weak uint16_t mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, struct rte_mbuf **pkts __rte_unused, uint16_t pkts_n __rte_unused) @@ -2348,25 +2372,25 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, return 0; } -int __attribute__((weak)) +__rte_weak int mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused) { return -ENOTSUP; } -int __attribute__((weak)) +__rte_weak int mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused) { return -ENOTSUP; } -int __attribute__((weak)) +__rte_weak int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) { return -ENOTSUP; } -int __attribute__((weak)) +__rte_weak int mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) { return -ENOTSUP; diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index 48ed2b20..1db468c3 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -97,10 +97,10 @@ struct mlx5_rxq_data { volatile uint32_t *rq_db; volatile uint32_t *cq_db; uint16_t port_id; - uint16_t rq_ci; + uint32_t rq_ci; uint16_t consumed_strd; /* Number of consumed strides in WQE. */ - uint16_t rq_pi; - uint16_t cq_ci; + uint32_t rq_pi; + uint32_t cq_ci; struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */ uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */ volatile void *wqes; @@ -363,6 +363,8 @@ uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl); uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr); uint32_t mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr); +uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr, + struct rte_mempool *mp); /** * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and @@ -607,6 +609,24 @@ mlx5_tx_complete(struct mlx5_txq_data *txq) } /** + * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which the + * cloned mbuf is allocated is returned instead. + * + * @param buf + * Pointer to mbuf. + * + * @return + * Memory pool where data is located for given mbuf. + */ +static struct rte_mempool * +mlx5_mb2mp(struct rte_mbuf *buf) +{ + if (unlikely(RTE_MBUF_INDIRECT(buf))) + return rte_mbuf_from_indirect(buf)->pool; + return buf->pool; +} + +/** * Query LKey from a packet buffer for Rx. No need to flush local caches for Rx * as mempool is pre-configured and static. * @@ -664,7 +684,20 @@ mlx5_tx_addr2mr(struct mlx5_txq_data *txq, uintptr_t addr) return mlx5_tx_addr2mr_bh(txq, addr); } -#define mlx5_tx_mb2mr(rxq, mb) mlx5_tx_addr2mr(rxq, (uintptr_t)((mb)->buf_addr)) +static __rte_always_inline uint32_t +mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb) +{ + uintptr_t addr = (uintptr_t)mb->buf_addr; + uint32_t lkey = mlx5_tx_addr2mr(txq, addr); + + if (likely(lkey != UINT32_MAX)) + return lkey; + if (rte_errno == ENXIO) { + /* Mempool may have externally allocated memory. 
*/ + lkey = mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb)); + } + return lkey; +} /** * Ring TX queue doorbell and flush the update if requested. diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c index 0a4aed8f..1453f4ff 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.c +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -40,7 +40,8 @@ #endif /** - * Count the number of packets having same ol_flags and calculate cs_flags. + * Count the number of packets having same ol_flags and same metadata (if + * PKT_TX_METADATA is set in ol_flags), and calculate cs_flags. * * @param pkts * Pointer to array of packets. @@ -48,26 +49,45 @@ * Number of packets. * @param cs_flags * Pointer of flags to be returned. + * @param metadata + * Pointer of metadata to be returned. + * @param txq_offloads + * Offloads enabled on Tx queue * * @return - * Number of packets having same ol_flags. + * Number of packets having same ol_flags and metadata, if relevant. */ static inline unsigned int -txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags) +txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags, + rte_be32_t *metadata, const uint64_t txq_offloads) { unsigned int pos; - const uint64_t ol_mask = + const uint64_t cksum_ol_mask = PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE | PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM; + rte_be32_t p0_metadata, pn_metadata; if (!pkts_n) return 0; - /* Count the number of packets having same ol_flags. */ - for (pos = 1; pos < pkts_n; ++pos) - if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask) + p0_metadata = pkts[0]->ol_flags & PKT_TX_METADATA ? + pkts[0]->tx_metadata : 0; + /* Count the number of packets having same offload parameters. */ + for (pos = 1; pos < pkts_n; ++pos) { + /* Check if packet has same checksum flags. */ + if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) && + ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & cksum_ol_mask)) break; + /* Check if packet has same metadata. */ + if (txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) { + pn_metadata = pkts[pos]->ol_flags & PKT_TX_METADATA ? + pkts[pos]->tx_metadata : 0; + if (pn_metadata != p0_metadata) + break; + } + } *cs_flags = txq_ol_cksum_to_cs(pkts[0]); + *metadata = p0_metadata; return pos; } @@ -96,7 +116,7 @@ mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t ret; n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST); - ret = txq_burst_v(txq, &pkts[nb_tx], n, 0); + ret = txq_burst_v(txq, &pkts[nb_tx], n, 0, 0); nb_tx += ret; if (!ret) break; @@ -127,6 +147,7 @@ mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) uint8_t cs_flags = 0; uint16_t n; uint16_t ret; + rte_be32_t metadata = 0; /* Transmit multi-seg packets in the head of pkts list. 
*/ if ((txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) && @@ -137,9 +158,12 @@ mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST); if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) n = txq_count_contig_single_seg(&pkts[nb_tx], n); - if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) - n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags); - ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags); + if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP | + DEV_TX_OFFLOAD_MATCH_METADATA)) + n = txq_calc_offload(&pkts[nb_tx], n, + &cs_flags, &metadata, + txq->offloads); + ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata); nb_tx += ret; if (!ret) break; diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h index fb884f92..fda7004e 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h @@ -22,6 +22,7 @@ /* HW offload capabilities of vectorized Tx. */ #define MLX5_VEC_TX_OFFLOAD_CAP \ (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP | \ + DEV_TX_OFFLOAD_MATCH_METADATA | \ DEV_TX_OFFLOAD_MULTI_SEGS) /* diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h index b37b7381..0b729f18 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -201,13 +201,15 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, * Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST). * @param cs_flags * Checksum offload flags to be written in the descriptor. + * @param metadata + * Metadata value to be written in the descriptor. * * @return * Number of packets successfully transmitted (<= pkts_n). */ static inline uint16_t txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, - uint8_t cs_flags) + uint8_t cs_flags, rte_be32_t metadata) { struct rte_mbuf **elts; uint16_t elts_head = txq->elts_head; @@ -293,11 +295,8 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m); vst1q_u8((void *)t_wqe, ctrl); /* Fill ESEG in the header. */ - vst1q_u8((void *)(t_wqe + 1), - ((uint8x16_t) { 0, 0, 0, 0, - cs_flags, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, 0 })); + vst1q_u32((void *)(t_wqe + 1), + ((uint32x4_t) { 0, cs_flags, metadata, 0 })); #ifdef MLX5_PMD_SOFT_COUNTERS txq->stats.opackets += pkts_n; #endif diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h index 54b3783c..e0f95f92 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h @@ -202,13 +202,15 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, * Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST). * @param cs_flags * Checksum offload flags to be written in the descriptor. + * @param metadata + * Metadata value to be written in the descriptor. * * @return * Number of packets successfully transmitted (<= pkts_n). */ static inline uint16_t txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, - uint8_t cs_flags) + uint8_t cs_flags, rte_be32_t metadata) { struct rte_mbuf **elts; uint16_t elts_head = txq->elts_head; @@ -292,11 +294,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl); _mm_store_si128(t_wqe, ctrl); /* Fill ESEG in the header. 
*/ - _mm_store_si128(t_wqe + 1, - _mm_set_epi8(0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0, cs_flags, - 0, 0, 0, 0)); + _mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0)); #ifdef MLX5_PMD_SOFT_COUNTERS txq->stats.opackets += pkts_n; #endif diff --git a/drivers/net/mlx5/mlx5_socket.c b/drivers/net/mlx5/mlx5_socket.c index a3a52291..00106171 100644 --- a/drivers/net/mlx5/mlx5_socket.c +++ b/drivers/net/mlx5/mlx5_socket.c @@ -3,8 +3,6 @@ * Copyright 2016 Mellanox Technologies, Ltd */ -#define _GNU_SOURCE - #include <sys/types.h> #include <sys/socket.h> #include <sys/un.h> diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index 91f3d474..a14d1e49 100644 --- a/drivers/net/mlx5/mlx5_stats.c +++ b/drivers/net/mlx5/mlx5_stats.c @@ -17,14 +17,6 @@ #include "mlx5_rxtx.h" #include "mlx5_defs.h" -struct mlx5_counter_ctrl { - /* Name of the counter. */ - char dpdk_name[RTE_ETH_XSTATS_NAME_SIZE]; - /* Name of the counter on the device table. */ - char ctr_name[RTE_ETH_XSTATS_NAME_SIZE]; - uint32_t ib:1; /**< Nonzero for IB counters. */ -}; - static const struct mlx5_counter_ctrl mlx5_counters_init[] = { { .dpdk_name = "rx_port_unicast_bytes", @@ -115,6 +107,23 @@ static const struct mlx5_counter_ctrl mlx5_counters_init[] = { .dpdk_name = "rx_bytes_phy", .ctr_name = "rx_bytes_phy", }, + /* Representor only */ + { + .dpdk_name = "rx_packets", + .ctr_name = "vport_rx_packets", + }, + { + .dpdk_name = "rx_bytes", + .ctr_name = "vport_rx_bytes", + }, + { + .dpdk_name = "tx_packets", + .ctr_name = "vport_tx_packets", + }, + { + .dpdk_name = "tx_bytes", + .ctr_name = "vport_tx_bytes", + }, }; static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init); @@ -146,19 +155,19 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats) et_stats->cmd = ETHTOOL_GSTATS; et_stats->n_stats = xstats_ctrl->stats_n; ifr.ifr_data = (caddr_t)et_stats; - ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); if (ret) { DRV_LOG(WARNING, "port %u unable to read statistic values from device", dev->data->port_id); return ret; } - for (i = 0; i != xstats_n; ++i) { - if (mlx5_counters_init[i].ib) { + for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) { + if (xstats_ctrl->info[i].ib) { FILE *file; MKSTR(path, "%s/ports/1/hw_counters/%s", priv->ibdev_path, - mlx5_counters_init[i].ctr_name); + xstats_ctrl->info[i].ctr_name); file = fopen(path, "rb"); if (file) { @@ -194,7 +203,7 @@ mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) { drvinfo.cmd = ETHTOOL_GDRVINFO; ifr.ifr_data = (caddr_t)&drvinfo; - ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); if (ret) { DRV_LOG(WARNING, "port %u unable to query number of statistics", dev->data->port_id); @@ -222,6 +231,8 @@ mlx5_xstats_init(struct rte_eth_dev *dev) unsigned int str_sz; int ret; + /* So that it won't aggregate for each init. */ + xstats_ctrl->mlx5_stats_n = 0; ret = mlx5_ethtool_get_stats_n(dev); if (ret < 0) { DRV_LOG(WARNING, "port %u no extended statistics available", @@ -229,7 +240,6 @@ mlx5_xstats_init(struct rte_eth_dev *dev) return; } dev_stats_n = ret; - xstats_ctrl->stats_n = dev_stats_n; /* Allocate memory to grab stat names and values. 
*/ str_sz = dev_stats_n * ETH_GSTRING_LEN; strings = (struct ethtool_gstrings *) @@ -244,14 +254,12 @@ mlx5_xstats_init(struct rte_eth_dev *dev) strings->string_set = ETH_SS_STATS; strings->len = dev_stats_n; ifr.ifr_data = (caddr_t)strings; - ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1); + ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr); if (ret) { DRV_LOG(WARNING, "port %u unable to get statistic names", dev->data->port_id); goto free; } - for (j = 0; j != xstats_n; ++j) - xstats_ctrl->dev_table_idx[j] = dev_stats_n; for (i = 0; i != dev_stats_n; ++i) { const char *curr_string = (const char *) &strings->data[i * ETH_GSTRING_LEN]; @@ -259,24 +267,25 @@ mlx5_xstats_init(struct rte_eth_dev *dev) for (j = 0; j != xstats_n; ++j) { if (!strcmp(mlx5_counters_init[j].ctr_name, curr_string)) { - xstats_ctrl->dev_table_idx[j] = i; + unsigned int idx = xstats_ctrl->mlx5_stats_n++; + + xstats_ctrl->dev_table_idx[idx] = i; + xstats_ctrl->info[idx] = mlx5_counters_init[j]; break; } } } - for (j = 0; j != xstats_n; ++j) { - if (mlx5_counters_init[j].ib) - continue; - if (xstats_ctrl->dev_table_idx[j] >= dev_stats_n) { - DRV_LOG(WARNING, - "port %u counter \"%s\" is not recognized", - dev->data->port_id, - mlx5_counters_init[j].dpdk_name); - goto free; + /* Add IB counters. */ + for (i = 0; i != xstats_n; ++i) { + if (mlx5_counters_init[i].ib) { + unsigned int idx = xstats_ctrl->mlx5_stats_n++; + + xstats_ctrl->info[idx] = mlx5_counters_init[i]; } } + assert(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS); + xstats_ctrl->stats_n = dev_stats_n; /* Copy to base at first time. */ - assert(xstats_n <= MLX5_MAX_XSTATS); ret = mlx5_read_dev_counters(dev, xstats_ctrl->base); if (ret) DRV_LOG(ERR, "port %u cannot read device counters: %s", @@ -306,9 +315,10 @@ mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, struct priv *priv = dev->data->dev_private; unsigned int i; uint64_t counters[n]; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + uint16_t mlx5_stats_n = xstats_ctrl->mlx5_stats_n; - if (n >= xstats_n && stats) { - struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + if (n >= mlx5_stats_n && stats) { int stats_n; int ret; @@ -320,12 +330,12 @@ mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, ret = mlx5_read_dev_counters(dev, counters); if (ret) return ret; - for (i = 0; i != xstats_n; ++i) { + for (i = 0; i != mlx5_stats_n; ++i) { stats[i].id = i; stats[i].value = (counters[i] - xstats_ctrl->base[i]); } } - return xstats_n; + return mlx5_stats_n; } /** @@ -441,7 +451,7 @@ mlx5_xstats_reset(struct rte_eth_dev *dev) struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; int stats_n; unsigned int i; - unsigned int n = xstats_n; + unsigned int n = xstats_ctrl->mlx5_stats_n; uint64_t counters[n]; int ret; @@ -481,14 +491,17 @@ mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused, struct rte_eth_xstat_name *xstats_names, unsigned int n) { unsigned int i; + struct priv *priv = dev->data->dev_private; + struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl; + unsigned int mlx5_xstats_n = xstats_ctrl->mlx5_stats_n; - if (n >= xstats_n && xstats_names) { - for (i = 0; i != xstats_n; ++i) { + if (n >= mlx5_xstats_n && xstats_names) { + for (i = 0; i != mlx5_xstats_n; ++i) { strncpy(xstats_names[i].name, - mlx5_counters_init[i].dpdk_name, + xstats_ctrl->info[i].dpdk_name, RTE_ETH_XSTATS_NAME_SIZE); xstats_names[i].name[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0; } } - return xstats_n; + return mlx5_xstats_n; } diff --git a/drivers/net/mlx5/mlx5_txq.c 
b/drivers/net/mlx5/mlx5_txq.c index f9bc4739..b01bd675 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -120,7 +120,6 @@ mlx5_get_tx_port_offloads(struct rte_eth_dev *dev) offloads |= (DEV_TX_OFFLOAD_IP_TNL_TSO | DEV_TX_OFFLOAD_UDP_TNL_TSO); } - if (config->tunnel_en) { if (config->hw_csum) offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM; @@ -128,6 +127,10 @@ mlx5_get_tx_port_offloads(struct rte_eth_dev *dev) offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO | DEV_TX_OFFLOAD_GRE_TNL_TSO); } +#ifdef HAVE_IBV_FLOW_DV_SUPPORT + if (config->dv_flow_en) + offloads |= DEV_TX_OFFLOAD_MATCH_METADATA; +#endif return offloads; }
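
The Tx-side changes above carry a per-packet metadata word (PKT_TX_METADATA / tx_metadata) from the mbuf into the flow_table_metadata field of the WQE Ethernet segment, and mlx5_get_tx_port_offloads() now advertises DEV_TX_OFFLOAD_MATCH_METADATA when DV flow support is compiled in and dv_flow_en is set. A minimal application-side sketch, assuming the offload has already been enabled in rte_eth_conf.txmode.offloads; the port/queue ids, the function name and the 0xcafe tag are made up for illustration:

#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_byteorder.h>

/* Tag one outgoing packet so an egress flow rule (rte_flow META item) can
 * match it later. */
static void
send_tagged(uint16_t port_id, uint16_t queue_id, struct rte_mbuf *m)
{
	/* The PMD copies this word verbatim into the descriptor, so it is
	 * supplied in network order here to line up with the big-endian
	 * META item data (assumption, verify against the mbuf docs). */
	m->tx_metadata = rte_cpu_to_be_32(0xcafe);
	m->ol_flags |= PKT_TX_METADATA;
	rte_eth_tx_burst(port_id, queue_id, &m, 1);
}

Note that packets carrying different metadata values cannot share an MPW/eMPW session, which is why the burst routines above compare eseg.flow_table_metadata before appending to an open session, and why the vectorized path only groups packets with identical metadata in txq_calc_offload().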
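
The datapath stubs in mlx5_rxtx.c switch from an open-coded __attribute__((weak)) to the __rte_weak macro; the link-time behaviour is unchanged. A small sketch of the pattern, with an illustrative symbol name that is not part of the mlx5 PMD:

#include <errno.h>
#include <rte_common.h>	/* assumed to provide __rte_weak */

/* Generic object: the weak stub keeps the driver linking on targets without
 * a vectorized datapath and simply reports "not supported". */
__rte_weak int
example_check_vec_support(void)
{
	return -ENOTSUP;
}

/* An architecture-specific object (e.g. an SSE or NEON implementation built
 * only on x86/ARM) defines a strong symbol with the same name, and the
 * linker prefers it over the weak stub. */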
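
The xstats rework still gathers device counters through the ETHTOOL_GDRVINFO / ETHTOOL_GSTRINGS / ETHTOOL_GSTATS ioctls issued via mlx5_ifreq(); it only changes how the results are mapped into the per-port counter table. For reference, a self-contained sketch of that kernel interface, independent of the PMD; the function name is illustrative and allocation checks are omitted:

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

/* Dump all ethtool statistics of a network interface. */
static int
dump_ethtool_stats(const char *ifname)
{
	struct ethtool_drvinfo drvinfo = { .cmd = ETHTOOL_GDRVINFO };
	struct ethtool_gstrings *strings;
	struct ethtool_stats *stats;
	struct ifreq ifr;
	unsigned int i, n;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name) - 1);
	/* 1. Number of counters. */
	ifr.ifr_data = (void *)&drvinfo;
	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		goto error;
	n = drvinfo.n_stats;
	/* 2. Counter names. */
	strings = calloc(1, sizeof(*strings) + n * ETH_GSTRING_LEN);
	strings->cmd = ETHTOOL_GSTRINGS;
	strings->string_set = ETH_SS_STATS;
	strings->len = n;
	ifr.ifr_data = (void *)strings;
	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		goto error;
	/* 3. Counter values. */
	stats = calloc(1, sizeof(*stats) + n * sizeof(uint64_t));
	stats->cmd = ETHTOOL_GSTATS;
	stats->n_stats = n;
	ifr.ifr_data = (void *)stats;
	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		goto error;
	for (i = 0; i < n; ++i)
		printf("%.*s: %" PRIu64 "\n", ETH_GSTRING_LEN,
		       (const char *)&strings->data[i * ETH_GSTRING_LEN],
		       (uint64_t)stats->data[i]);
	free(strings);
	free(stats);
	close(fd);
	return 0;
error:
	close(fd);
	return -1;
}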