diff options
author | C.J. Collier <cjcollier@linuxfoundation.org> | 2016-06-14 07:50:17 -0700 |
---|---|---|
committer | C.J. Collier <cjcollier@linuxfoundation.org> | 2016-06-14 12:17:54 -0700 |
commit | 97f17497d162afdb82c8704bf097f0fee3724b2e (patch) | |
tree | 1c6269614c0c15ffef8451c58ae8f8b30a1bc804 /lib/librte_acl | |
parent | e04be89c2409570e0055b2cda60bd11395bb93b0 (diff) |
Imported Upstream version 16.04
Change-Id: I77eadcd8538a9122e4773cbe55b24033dc451757
Signed-off-by: C.J. Collier <cjcollier@linuxfoundation.org>
Diffstat (limited to 'lib/librte_acl')
-rw-r--r-- | lib/librte_acl/Makefile | 96 | ||||
-rw-r--r-- | lib/librte_acl/acl.h | 241 | ||||
-rw-r--r-- | lib/librte_acl/acl_bld.c | 1598 | ||||
-rw-r--r-- | lib/librte_acl/acl_gen.c | 561 | ||||
-rw-r--r-- | lib/librte_acl/acl_run.h | 263 | ||||
-rw-r--r-- | lib/librte_acl/acl_run_avx2.c | 54 | ||||
-rw-r--r-- | lib/librte_acl/acl_run_avx2.h | 284 | ||||
-rw-r--r-- | lib/librte_acl/acl_run_neon.c | 46 | ||||
-rw-r--r-- | lib/librte_acl/acl_run_neon.h | 289 | ||||
-rw-r--r-- | lib/librte_acl/acl_run_scalar.c | 192 | ||||
-rw-r--r-- | lib/librte_acl/acl_run_sse.c | 47 | ||||
-rw-r--r-- | lib/librte_acl/acl_run_sse.h | 357 | ||||
-rw-r--r-- | lib/librte_acl/acl_vect.h | 116 | ||||
-rw-r--r-- | lib/librte_acl/rte_acl.c | 393 | ||||
-rw-r--r-- | lib/librte_acl/rte_acl.h | 388 | ||||
-rw-r--r-- | lib/librte_acl/rte_acl_osdep.h | 80 | ||||
-rw-r--r-- | lib/librte_acl/rte_acl_version.map | 19 | ||||
-rw-r--r-- | lib/librte_acl/tb_mem.c | 108 | ||||
-rw-r--r-- | lib/librte_acl/tb_mem.h | 76 |
19 files changed, 5208 insertions, 0 deletions
diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile new file mode 100644 index 00000000..2e394c97 --- /dev/null +++ b/lib/librte_acl/Makefile @@ -0,0 +1,96 @@ +# BSD LICENSE +# +# Copyright(c) 2010-2014 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_acl.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) + +EXPORT_MAP := rte_acl_version.map + +LIBABIVER := 2 + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += tb_mem.c + +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c + +ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),) +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c +CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized +else +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c +#check if flag for SSE4.1 is already on, if not set it up manually + ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSE4_1,$(CFLAGS)),) + CFLAGS_acl_run_sse.o += -msse4.1 + endif +endif + +# +# If the compiler supports AVX2 instructions, +# then add support for AVX2 classify method. +# + +#check if flag for AVX2 is already on, if not set it up manually +ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2) + CC_AVX2_SUPPORT=1 +else + CC_AVX2_SUPPORT=\ + $(shell $(CC) -march=core-avx2 -dM -E - </dev/null 2>&1 | \ + grep -q AVX2 && echo 1) + ifeq ($(CC_AVX2_SUPPORT), 1) + ifeq ($(CC), icc) + CFLAGS_acl_run_avx2.o += -march=core-avx2 + else + CFLAGS_acl_run_avx2.o += -mavx2 + endif + endif +endif + +ifeq ($(CC_AVX2_SUPPORT), 1) + SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_avx2.c + CFLAGS_rte_acl.o += -DCC_AVX2_SUPPORT +endif + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include := rte_acl_osdep.h +SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include += rte_acl.h + +# this lib needs eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_ACL) += lib/librte_eal + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h new file mode 100644 index 00000000..09d67841 --- /dev/null +++ b/lib/librte_acl/acl.h @@ -0,0 +1,241 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ACL_H_ +#define _ACL_H_ + +#ifdef __cplusplus +extern"C" { +#endif /* __cplusplus */ + +#define RTE_ACL_QUAD_MAX 5 +#define RTE_ACL_QUAD_SIZE 4 +#define RTE_ACL_QUAD_SINGLE UINT64_C(0x7f7f7f7f00000000) + +#define RTE_ACL_SINGLE_TRIE_SIZE 2000 + +#define RTE_ACL_DFA_MAX UINT8_MAX +#define RTE_ACL_DFA_SIZE (UINT8_MAX + 1) + +#define RTE_ACL_DFA_GR64_SIZE 64 +#define RTE_ACL_DFA_GR64_NUM (RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE) +#define RTE_ACL_DFA_GR64_BIT \ + (CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM) + +typedef int bits_t; + +#define RTE_ACL_BIT_SET_SIZE ((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT)) + +struct rte_acl_bitset { + bits_t bits[RTE_ACL_BIT_SET_SIZE]; +}; + +#define RTE_ACL_NODE_DFA (0 << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_SINGLE (1U << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_QRANGE (3U << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_MATCH (4U << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_TYPE (7U << RTE_ACL_TYPE_SHIFT) +#define RTE_ACL_NODE_UNDEFINED UINT32_MAX + +/* + * ACL RT structure is a set of multibit tries (with stride == 8) + * represented by an array of transitions. The next position is calculated + * based on the current position and the input byte. + * Each transition is 64 bit value with the following format: + * | node_type_specific : 32 | node_type : 3 | node_addr : 29 | + * For all node types except RTE_ACL_NODE_MATCH, node_addr is an index + * to the start of the node in the transtions array. + * Few different node types are used: + * RTE_ACL_NODE_MATCH: + * node_addr value is and index into an array that contains the return value + * and its priority for each category. + * Upper 32 bits of the transition value are not used for that node type. + * RTE_ACL_NODE_QRANGE: + * that node consist of up to 5 transitions. + * Upper 32 bits are interpreted as 4 signed character values which + * are ordered from smallest(INT8_MIN) to largest (INT8_MAX). + * These values define 5 ranges: + * INT8_MIN <= range[0] <= ((int8_t *)&transition)[4] + * ((int8_t *)&transition)[4] < range[1] <= ((int8_t *)&transition)[5] + * ((int8_t *)&transition)[5] < range[2] <= ((int8_t *)&transition)[6] + * ((int8_t *)&transition)[6] < range[3] <= ((int8_t *)&transition)[7] + * ((int8_t *)&transition)[7] < range[4] <= INT8_MAX + * So for input byte value within range[i] i-th transition within that node + * will be used. + * RTE_ACL_NODE_SINGLE: + * always transitions to the same node regardless of the input value. + * RTE_ACL_NODE_DFA: + * that node consits of up to 256 transitions. + * In attempt to conserve space all transitions are divided into 4 consecutive + * groups, by 64 transitions per group: + * group64[i] contains transitions[i * 64, .. i * 64 + 63]. + * Upper 32 bits are interpreted as 4 unsigned character values one per group, + * which contain index to the start of the given group within the node. + * So to calculate transition index within the node for given input byte value: + * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64]. + */ + +/* + * Structure of a node is a set of ptrs and each ptr has a bit map + * of values associated with this transition. + */ +struct rte_acl_ptr_set { + struct rte_acl_bitset values; /* input values associated with ptr */ + struct rte_acl_node *ptr; /* transition to next node */ +}; + +struct rte_acl_classifier_results { + int results[RTE_ACL_MAX_CATEGORIES]; +}; + +struct rte_acl_match_results { + uint32_t results[RTE_ACL_MAX_CATEGORIES]; + int32_t priority[RTE_ACL_MAX_CATEGORIES]; +}; + +struct rte_acl_node { + uint64_t node_index; /* index for this node */ + uint32_t level; /* level 0-n in the trie */ + uint32_t ref_count; /* ref count for this node */ + struct rte_acl_bitset values; + /* set of all values that map to another node + * (union of bits in each transition. + */ + uint32_t num_ptrs; /* number of ptr_set in use */ + uint32_t max_ptrs; /* number of allocated ptr_set */ + uint32_t min_add; /* number of ptr_set per allocation */ + struct rte_acl_ptr_set *ptrs; /* transitions array for this node */ + int32_t match_flag; + int32_t match_index; /* index to match data */ + uint32_t node_type; + int32_t fanout; + /* number of ranges (transitions w/ consecutive bits) */ + int32_t id; + struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */ + union { + char transitions[RTE_ACL_QUAD_SIZE]; + /* boundaries for ranged node */ + uint8_t dfa_gr64[RTE_ACL_DFA_GR64_NUM]; + }; + struct rte_acl_node *next; + /* free list link or pointer to duplicate node during merge */ + struct rte_acl_node *prev; + /* points to node from which this node was duplicated */ +}; + +/* + * Types of tries used to generate runtime structure(s) + */ +enum { + RTE_ACL_FULL_TRIE = 0, + RTE_ACL_NOSRC_TRIE = 1, + RTE_ACL_NODST_TRIE = 2, + RTE_ACL_NOPORTS_TRIE = 4, + RTE_ACL_NOVLAN_TRIE = 8, + RTE_ACL_UNUSED_TRIE = 0x80000000 +}; + + +/** MAX number of tries per one ACL context.*/ +#define RTE_ACL_MAX_TRIES 8 + +/** Max number of characters in PM name.*/ +#define RTE_ACL_NAMESIZE 32 + + +struct rte_acl_trie { + uint32_t type; + uint32_t count; + uint32_t root_index; + const uint32_t *data_index; + uint32_t num_data_indexes; +}; + +struct rte_acl_bld_trie { + struct rte_acl_node *trie; +}; + +struct rte_acl_ctx { + char name[RTE_ACL_NAMESIZE]; + /** Name of the ACL context. */ + int32_t socket_id; + /** Socket ID to allocate memory from. */ + enum rte_acl_classify_alg alg; + void *rules; + uint32_t max_rules; + uint32_t rule_sz; + uint32_t num_rules; + uint32_t num_categories; + uint32_t num_tries; + uint32_t match_index; + uint64_t no_match; + uint64_t idle; + uint64_t *trans_table; + uint32_t *data_indexes; + struct rte_acl_trie trie[RTE_ACL_MAX_TRIES]; + void *mem; + size_t mem_sz; + struct rte_acl_config config; /* copy of build config. */ +}; + +int rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie, + struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries, + uint32_t num_categories, uint32_t data_index_sz, size_t max_size); + +typedef int (*rte_acl_classify_t) +(const struct rte_acl_ctx *, const uint8_t **, uint32_t *, uint32_t, uint32_t); + +/* + * Different implementations of ACL classify. + */ +int +rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +int +rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +int +rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +int +rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* _ACL_H_ */ diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c new file mode 100644 index 00000000..9e5ad1c6 --- /dev/null +++ b/lib/librte_acl/acl_bld.c @@ -0,0 +1,1598 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <rte_acl.h> +#include "tb_mem.h" +#include "acl.h" + +#define ACL_POOL_ALIGN 8 +#define ACL_POOL_ALLOC_MIN 0x800000 + +/* number of pointers per alloc */ +#define ACL_PTR_ALLOC 32 + +/* macros for dividing rule sets heuristics */ +#define NODE_MAX 0x4000 +#define NODE_MIN 0x800 + +/* TALLY are statistics per field */ +enum { + TALLY_0 = 0, /* number of rules that are 0% or more wild. */ + TALLY_25, /* number of rules that are 25% or more wild. */ + TALLY_50, + TALLY_75, + TALLY_100, + TALLY_DEACTIVATED, /* deactivated fields (100% wild in all rules). */ + TALLY_DEPTH, + /* number of rules that are 100% wild for this field and higher. */ + TALLY_NUM +}; + +static const uint32_t wild_limits[TALLY_DEACTIVATED] = {0, 25, 50, 75, 100}; + +enum { + ACL_INTERSECT_NONE = 0, + ACL_INTERSECT_A = 1, /* set A is a superset of A and B intersect */ + ACL_INTERSECT_B = 2, /* set B is a superset of A and B intersect */ + ACL_INTERSECT = 4, /* sets A and B intersect */ +}; + +enum { + ACL_PRIORITY_EQUAL = 0, + ACL_PRIORITY_NODE_A = 1, + ACL_PRIORITY_NODE_B = 2, + ACL_PRIORITY_MIXED = 3 +}; + + +struct acl_mem_block { + uint32_t block_size; + void *mem_ptr; +}; + +#define MEM_BLOCK_NUM 16 + +/* Single ACL rule, build representation.*/ +struct rte_acl_build_rule { + struct rte_acl_build_rule *next; + struct rte_acl_config *config; + /**< configuration for each field in the rule. */ + const struct rte_acl_rule *f; + uint32_t *wildness; +}; + +/* Context for build phase */ +struct acl_build_context { + const struct rte_acl_ctx *acx; + struct rte_acl_build_rule *build_rules; + struct rte_acl_config cfg; + int32_t node_max; + int32_t cur_node_max; + uint32_t node; + uint32_t num_nodes; + uint32_t category_mask; + uint32_t num_rules; + uint32_t node_id; + uint32_t src_mask; + uint32_t num_build_rules; + uint32_t num_tries; + struct tb_mem_pool pool; + struct rte_acl_trie tries[RTE_ACL_MAX_TRIES]; + struct rte_acl_bld_trie bld_tries[RTE_ACL_MAX_TRIES]; + uint32_t data_indexes[RTE_ACL_MAX_TRIES][RTE_ACL_MAX_FIELDS]; + + /* memory free lists for nodes and blocks used for node ptrs */ + struct acl_mem_block blocks[MEM_BLOCK_NUM]; + struct rte_acl_node *node_free_list; +}; + +static int acl_merge_trie(struct acl_build_context *context, + struct rte_acl_node *node_a, struct rte_acl_node *node_b, + uint32_t level, struct rte_acl_node **node_c); + +static void +acl_deref_ptr(struct acl_build_context *context, + struct rte_acl_node *node, int index); + +static void * +acl_build_alloc(struct acl_build_context *context, size_t n, size_t s) +{ + uint32_t m; + void *p; + size_t alloc_size = n * s; + + /* + * look for memory in free lists + */ + for (m = 0; m < RTE_DIM(context->blocks); m++) { + if (context->blocks[m].block_size == + alloc_size && context->blocks[m].mem_ptr != NULL) { + p = context->blocks[m].mem_ptr; + context->blocks[m].mem_ptr = *((void **)p); + memset(p, 0, alloc_size); + return p; + } + } + + /* + * return allocation from memory pool + */ + p = tb_alloc(&context->pool, alloc_size); + return p; +} + +/* + * Free memory blocks (kept in context for reuse). + */ +static void +acl_build_free(struct acl_build_context *context, size_t s, void *p) +{ + uint32_t n; + + for (n = 0; n < RTE_DIM(context->blocks); n++) { + if (context->blocks[n].block_size == s) { + *((void **)p) = context->blocks[n].mem_ptr; + context->blocks[n].mem_ptr = p; + return; + } + } + for (n = 0; n < RTE_DIM(context->blocks); n++) { + if (context->blocks[n].block_size == 0) { + context->blocks[n].block_size = s; + *((void **)p) = NULL; + context->blocks[n].mem_ptr = p; + return; + } + } +} + +/* + * Allocate and initialize a new node. + */ +static struct rte_acl_node * +acl_alloc_node(struct acl_build_context *context, int level) +{ + struct rte_acl_node *node; + + if (context->node_free_list != NULL) { + node = context->node_free_list; + context->node_free_list = node->next; + memset(node, 0, sizeof(struct rte_acl_node)); + } else { + node = acl_build_alloc(context, sizeof(struct rte_acl_node), 1); + } + + if (node != NULL) { + node->num_ptrs = 0; + node->level = level; + node->node_type = RTE_ACL_NODE_UNDEFINED; + node->node_index = RTE_ACL_NODE_UNDEFINED; + context->num_nodes++; + node->id = context->node_id++; + } + return node; +} + +/* + * Dereference all nodes to which this node points + */ +static void +acl_free_node(struct acl_build_context *context, + struct rte_acl_node *node) +{ + uint32_t n; + + if (node->prev != NULL) + node->prev->next = NULL; + for (n = 0; n < node->num_ptrs; n++) + acl_deref_ptr(context, node, n); + + /* free mrt if this is a match node */ + if (node->mrt != NULL) { + acl_build_free(context, sizeof(struct rte_acl_match_results), + node->mrt); + node->mrt = NULL; + } + + /* free transitions to other nodes */ + if (node->ptrs != NULL) { + acl_build_free(context, + node->max_ptrs * sizeof(struct rte_acl_ptr_set), + node->ptrs); + node->ptrs = NULL; + } + + /* put it on the free list */ + context->num_nodes--; + node->next = context->node_free_list; + context->node_free_list = node; +} + + +/* + * Include src bitset in dst bitset + */ +static void +acl_include(struct rte_acl_bitset *dst, struct rte_acl_bitset *src, bits_t mask) +{ + uint32_t n; + + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) + dst->bits[n] = (dst->bits[n] & mask) | src->bits[n]; +} + +/* + * Set dst to bits of src1 that are not in src2 + */ +static int +acl_exclude(struct rte_acl_bitset *dst, + struct rte_acl_bitset *src1, + struct rte_acl_bitset *src2) +{ + uint32_t n; + bits_t all_bits = 0; + + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) { + dst->bits[n] = src1->bits[n] & ~src2->bits[n]; + all_bits |= dst->bits[n]; + } + return all_bits != 0; +} + +/* + * Add a pointer (ptr) to a node. + */ +static int +acl_add_ptr(struct acl_build_context *context, + struct rte_acl_node *node, + struct rte_acl_node *ptr, + struct rte_acl_bitset *bits) +{ + uint32_t n, num_ptrs; + struct rte_acl_ptr_set *ptrs = NULL; + + /* + * If there's already a pointer to the same node, just add to the bitset + */ + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) { + if (node->ptrs[n].ptr == ptr) { + acl_include(&node->ptrs[n].values, bits, -1); + acl_include(&node->values, bits, -1); + return 0; + } + } + } + + /* if there's no room for another pointer, make room */ + if (node->num_ptrs >= node->max_ptrs) { + /* add room for more pointers */ + num_ptrs = node->max_ptrs + ACL_PTR_ALLOC; + ptrs = acl_build_alloc(context, num_ptrs, sizeof(*ptrs)); + + /* copy current points to new memory allocation */ + if (node->ptrs != NULL) { + memcpy(ptrs, node->ptrs, + node->num_ptrs * sizeof(*ptrs)); + acl_build_free(context, node->max_ptrs * sizeof(*ptrs), + node->ptrs); + } + node->ptrs = ptrs; + node->max_ptrs = num_ptrs; + } + + /* Find available ptr and add a new pointer to this node */ + for (n = node->min_add; n < node->max_ptrs; n++) { + if (node->ptrs[n].ptr == NULL) { + node->ptrs[n].ptr = ptr; + acl_include(&node->ptrs[n].values, bits, 0); + acl_include(&node->values, bits, -1); + if (ptr != NULL) + ptr->ref_count++; + if (node->num_ptrs <= n) + node->num_ptrs = n + 1; + return 0; + } + } + + return 0; +} + +/* + * Add a pointer for a range of values + */ +static int +acl_add_ptr_range(struct acl_build_context *context, + struct rte_acl_node *root, + struct rte_acl_node *node, + uint8_t low, + uint8_t high) +{ + uint32_t n; + struct rte_acl_bitset bitset; + + /* clear the bitset values */ + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) + bitset.bits[n] = 0; + + /* for each bit in range, add bit to set */ + for (n = 0; n < UINT8_MAX + 1; n++) + if (n >= low && n <= high) + bitset.bits[n / (sizeof(bits_t) * 8)] |= + 1 << (n % (sizeof(bits_t) * 8)); + + return acl_add_ptr(context, root, node, &bitset); +} + +/* + * Generate a bitset from a byte value and mask. + */ +static int +acl_gen_mask(struct rte_acl_bitset *bitset, uint32_t value, uint32_t mask) +{ + int range = 0; + uint32_t n; + + /* clear the bitset values */ + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) + bitset->bits[n] = 0; + + /* for each bit in value/mask, add bit to set */ + for (n = 0; n < UINT8_MAX + 1; n++) { + if ((n & mask) == value) { + range++; + bitset->bits[n / (sizeof(bits_t) * 8)] |= + 1 << (n % (sizeof(bits_t) * 8)); + } + } + return range; +} + +/* + * Determine how A and B intersect. + * Determine if A and/or B are supersets of the intersection. + */ +static int +acl_intersect_type(const struct rte_acl_bitset *a_bits, + const struct rte_acl_bitset *b_bits, + struct rte_acl_bitset *intersect) +{ + uint32_t n; + bits_t intersect_bits = 0; + bits_t a_superset = 0; + bits_t b_superset = 0; + + /* + * calculate and store intersection and check if A and/or B have + * bits outside the intersection (superset) + */ + for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) { + intersect->bits[n] = a_bits->bits[n] & b_bits->bits[n]; + a_superset |= a_bits->bits[n] ^ intersect->bits[n]; + b_superset |= b_bits->bits[n] ^ intersect->bits[n]; + intersect_bits |= intersect->bits[n]; + } + + n = (intersect_bits == 0 ? ACL_INTERSECT_NONE : ACL_INTERSECT) | + (b_superset == 0 ? 0 : ACL_INTERSECT_B) | + (a_superset == 0 ? 0 : ACL_INTERSECT_A); + + return n; +} + +/* + * Duplicate a node + */ +static struct rte_acl_node * +acl_dup_node(struct acl_build_context *context, struct rte_acl_node *node) +{ + uint32_t n; + struct rte_acl_node *next; + + next = acl_alloc_node(context, node->level); + + /* allocate the pointers */ + if (node->num_ptrs > 0) { + next->ptrs = acl_build_alloc(context, + node->max_ptrs, + sizeof(struct rte_acl_ptr_set)); + next->max_ptrs = node->max_ptrs; + } + + /* copy over the pointers */ + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) { + next->ptrs[n].ptr = node->ptrs[n].ptr; + next->ptrs[n].ptr->ref_count++; + acl_include(&next->ptrs[n].values, + &node->ptrs[n].values, -1); + } + } + + next->num_ptrs = node->num_ptrs; + + /* copy over node's match results */ + if (node->match_flag == 0) + next->match_flag = 0; + else { + next->match_flag = -1; + next->mrt = acl_build_alloc(context, 1, sizeof(*next->mrt)); + memcpy(next->mrt, node->mrt, sizeof(*next->mrt)); + } + + /* copy over node's bitset */ + acl_include(&next->values, &node->values, -1); + + node->next = next; + next->prev = node; + + return next; +} + +/* + * Dereference a pointer from a node + */ +static void +acl_deref_ptr(struct acl_build_context *context, + struct rte_acl_node *node, int index) +{ + struct rte_acl_node *ref_node; + + /* De-reference the node at the specified pointer */ + if (node != NULL && node->ptrs[index].ptr != NULL) { + ref_node = node->ptrs[index].ptr; + ref_node->ref_count--; + if (ref_node->ref_count == 0) + acl_free_node(context, ref_node); + } +} + +/* + * acl_exclude rte_acl_bitset from src and copy remaining pointer to dst + */ +static int +acl_copy_ptr(struct acl_build_context *context, + struct rte_acl_node *dst, + struct rte_acl_node *src, + int index, + struct rte_acl_bitset *b_bits) +{ + int rc; + struct rte_acl_bitset bits; + + if (b_bits != NULL) + if (!acl_exclude(&bits, &src->ptrs[index].values, b_bits)) + return 0; + + rc = acl_add_ptr(context, dst, src->ptrs[index].ptr, &bits); + if (rc < 0) + return rc; + return 1; +} + +/* + * Fill in gaps in ptrs list with the ptr at the end of the list + */ +static void +acl_compact_node_ptrs(struct rte_acl_node *node_a) +{ + uint32_t n; + int min_add = node_a->min_add; + + while (node_a->num_ptrs > 0 && + node_a->ptrs[node_a->num_ptrs - 1].ptr == NULL) + node_a->num_ptrs--; + + for (n = min_add; n + 1 < node_a->num_ptrs; n++) { + + /* if this entry is empty */ + if (node_a->ptrs[n].ptr == NULL) { + + /* move the last pointer to this entry */ + acl_include(&node_a->ptrs[n].values, + &node_a->ptrs[node_a->num_ptrs - 1].values, + 0); + node_a->ptrs[n].ptr = + node_a->ptrs[node_a->num_ptrs - 1].ptr; + + /* + * mark the end as empty and adjust the number + * of used pointer enum_tries + */ + node_a->ptrs[node_a->num_ptrs - 1].ptr = NULL; + while (node_a->num_ptrs > 0 && + node_a->ptrs[node_a->num_ptrs - 1].ptr == NULL) + node_a->num_ptrs--; + } + } +} + +static int +acl_resolve_leaf(struct acl_build_context *context, + struct rte_acl_node *node_a, + struct rte_acl_node *node_b, + struct rte_acl_node **node_c) +{ + uint32_t n; + int combined_priority = ACL_PRIORITY_EQUAL; + + for (n = 0; n < context->cfg.num_categories; n++) { + if (node_a->mrt->priority[n] != node_b->mrt->priority[n]) { + combined_priority |= (node_a->mrt->priority[n] > + node_b->mrt->priority[n]) ? + ACL_PRIORITY_NODE_A : ACL_PRIORITY_NODE_B; + } + } + + /* + * if node a is higher or equal priority for all categories, + * then return node_a. + */ + if (combined_priority == ACL_PRIORITY_NODE_A || + combined_priority == ACL_PRIORITY_EQUAL) { + *node_c = node_a; + return 0; + } + + /* + * if node b is higher or equal priority for all categories, + * then return node_b. + */ + if (combined_priority == ACL_PRIORITY_NODE_B) { + *node_c = node_b; + return 0; + } + + /* + * mixed priorities - create a new node with the highest priority + * for each category. + */ + + /* force new duplication. */ + node_a->next = NULL; + + *node_c = acl_dup_node(context, node_a); + for (n = 0; n < context->cfg.num_categories; n++) { + if ((*node_c)->mrt->priority[n] < node_b->mrt->priority[n]) { + (*node_c)->mrt->priority[n] = node_b->mrt->priority[n]; + (*node_c)->mrt->results[n] = node_b->mrt->results[n]; + } + } + return 0; +} + +/* + * Merge nodes A and B together, + * returns a node that is the path for the intersection + * + * If match node (leaf on trie) + * For each category + * return node = highest priority result + * + * Create C as a duplicate of A to point to child intersections + * If any pointers in C intersect with any in B + * For each intersection + * merge children + * remove intersection from C pointer + * add a pointer from C to child intersection node + * Compact the pointers in A and B + * Copy any B pointers that are outside of the intersection to C + * If C has no references to the B trie + * free C and return A + * Else If C has no references to the A trie + * free C and return B + * Else + * return C + */ +static int +acl_merge_trie(struct acl_build_context *context, + struct rte_acl_node *node_a, struct rte_acl_node *node_b, + uint32_t level, struct rte_acl_node **return_c) +{ + uint32_t n, m, ptrs_c, ptrs_b; + uint32_t min_add_c, min_add_b; + int node_intersect_type; + struct rte_acl_bitset node_intersect; + struct rte_acl_node *node_c; + struct rte_acl_node *node_a_next; + int node_b_refs; + int node_a_refs; + + node_c = node_a; + node_a_next = node_a->next; + min_add_c = 0; + min_add_b = 0; + node_a_refs = node_a->num_ptrs; + node_b_refs = 0; + node_intersect_type = 0; + + /* Resolve leaf nodes (matches) */ + if (node_a->match_flag != 0) { + acl_resolve_leaf(context, node_a, node_b, return_c); + return 0; + } + + /* + * Create node C as a copy of node A, and do: C = merge(A,B); + * If node A can be used instead (A==C), then later we'll + * destroy C and return A. + */ + if (level > 0) + node_c = acl_dup_node(context, node_a); + + /* + * If the two node transitions intersect then merge the transitions. + * Check intersection for entire node (all pointers) + */ + node_intersect_type = acl_intersect_type(&node_c->values, + &node_b->values, + &node_intersect); + + if (node_intersect_type & ACL_INTERSECT) { + + min_add_b = node_b->min_add; + node_b->min_add = node_b->num_ptrs; + ptrs_b = node_b->num_ptrs; + + min_add_c = node_c->min_add; + node_c->min_add = node_c->num_ptrs; + ptrs_c = node_c->num_ptrs; + + for (n = 0; n < ptrs_c; n++) { + if (node_c->ptrs[n].ptr == NULL) { + node_a_refs--; + continue; + } + node_c->ptrs[n].ptr->next = NULL; + for (m = 0; m < ptrs_b; m++) { + + struct rte_acl_bitset child_intersect; + int child_intersect_type; + struct rte_acl_node *child_node_c = NULL; + + if (node_b->ptrs[m].ptr == NULL || + node_c->ptrs[n].ptr == + node_b->ptrs[m].ptr) + continue; + + child_intersect_type = acl_intersect_type( + &node_c->ptrs[n].values, + &node_b->ptrs[m].values, + &child_intersect); + + if ((child_intersect_type & ACL_INTERSECT) != + 0) { + if (acl_merge_trie(context, + node_c->ptrs[n].ptr, + node_b->ptrs[m].ptr, + level + 1, + &child_node_c)) + return 1; + + if (child_node_c != NULL && + child_node_c != + node_c->ptrs[n].ptr) { + + node_b_refs++; + + /* + * Added link from C to + * child_C for all transitions + * in the intersection. + */ + acl_add_ptr(context, node_c, + child_node_c, + &child_intersect); + + /* + * inc refs if pointer is not + * to node b. + */ + node_a_refs += (child_node_c != + node_b->ptrs[m].ptr); + + /* + * Remove intersection from C + * pointer. + */ + if (!acl_exclude( + &node_c->ptrs[n].values, + &node_c->ptrs[n].values, + &child_intersect)) { + acl_deref_ptr(context, + node_c, n); + node_c->ptrs[n].ptr = + NULL; + node_a_refs--; + } + } + } + } + } + + /* Compact pointers */ + node_c->min_add = min_add_c; + acl_compact_node_ptrs(node_c); + node_b->min_add = min_add_b; + acl_compact_node_ptrs(node_b); + } + + /* + * Copy pointers outside of the intersection from B to C + */ + if ((node_intersect_type & ACL_INTERSECT_B) != 0) { + node_b_refs++; + for (m = 0; m < node_b->num_ptrs; m++) + if (node_b->ptrs[m].ptr != NULL) + acl_copy_ptr(context, node_c, + node_b, m, &node_intersect); + } + + /* + * Free node C if top of trie is contained in A or B + * if node C is a duplicate of node A && + * node C was not an existing duplicate + */ + if (node_c != node_a && node_c != node_a_next) { + + /* + * if the intersection has no references to the + * B side, then it is contained in A + */ + if (node_b_refs == 0) { + acl_free_node(context, node_c); + node_c = node_a; + } else { + /* + * if the intersection has no references to the + * A side, then it is contained in B. + */ + if (node_a_refs == 0) { + acl_free_node(context, node_c); + node_c = node_b; + } + } + } + + if (return_c != NULL) + *return_c = node_c; + + if (level == 0) + acl_free_node(context, node_b); + + return 0; +} + +/* + * Reset current runtime fields before next build: + * - free allocated RT memory. + * - reset all RT related fields to zero. + */ +static void +acl_build_reset(struct rte_acl_ctx *ctx) +{ + rte_free(ctx->mem); + memset(&ctx->num_categories, 0, + sizeof(*ctx) - offsetof(struct rte_acl_ctx, num_categories)); +} + +static void +acl_gen_range(struct acl_build_context *context, + const uint8_t *hi, const uint8_t *lo, int size, int level, + struct rte_acl_node *root, struct rte_acl_node *end) +{ + struct rte_acl_node *node, *prev; + uint32_t n; + + prev = root; + for (n = size - 1; n > 0; n--) { + node = acl_alloc_node(context, level++); + acl_add_ptr_range(context, prev, node, lo[n], hi[n]); + prev = node; + } + acl_add_ptr_range(context, prev, end, lo[0], hi[0]); +} + +static struct rte_acl_node * +acl_gen_range_trie(struct acl_build_context *context, + const void *min, const void *max, + int size, int level, struct rte_acl_node **pend) +{ + int32_t n; + struct rte_acl_node *root; + const uint8_t *lo = (const uint8_t *)min; + const uint8_t *hi = (const uint8_t *)max; + + *pend = acl_alloc_node(context, level+size); + root = acl_alloc_node(context, level++); + + if (lo[size - 1] == hi[size - 1]) { + acl_gen_range(context, hi, lo, size, level, root, *pend); + } else { + uint8_t limit_lo[64]; + uint8_t limit_hi[64]; + uint8_t hi_ff = UINT8_MAX; + uint8_t lo_00 = 0; + + memset(limit_lo, 0, RTE_DIM(limit_lo)); + memset(limit_hi, UINT8_MAX, RTE_DIM(limit_hi)); + + for (n = size - 2; n >= 0; n--) { + hi_ff = (uint8_t)(hi_ff & hi[n]); + lo_00 = (uint8_t)(lo_00 | lo[n]); + } + + if (hi_ff != UINT8_MAX) { + limit_lo[size - 1] = hi[size - 1]; + acl_gen_range(context, hi, limit_lo, size, level, + root, *pend); + } + + if (lo_00 != 0) { + limit_hi[size - 1] = lo[size - 1]; + acl_gen_range(context, limit_hi, lo, size, level, + root, *pend); + } + + if (hi[size - 1] - lo[size - 1] > 1 || + lo_00 == 0 || + hi_ff == UINT8_MAX) { + limit_lo[size-1] = (uint8_t)(lo[size-1] + (lo_00 != 0)); + limit_hi[size-1] = (uint8_t)(hi[size-1] - + (hi_ff != UINT8_MAX)); + acl_gen_range(context, limit_hi, limit_lo, size, + level, root, *pend); + } + } + return root; +} + +static struct rte_acl_node * +acl_gen_mask_trie(struct acl_build_context *context, + const void *value, const void *mask, + int size, int level, struct rte_acl_node **pend) +{ + int32_t n; + struct rte_acl_node *root; + struct rte_acl_node *node, *prev; + struct rte_acl_bitset bits; + const uint8_t *val = (const uint8_t *)value; + const uint8_t *msk = (const uint8_t *)mask; + + root = acl_alloc_node(context, level++); + prev = root; + + for (n = size - 1; n >= 0; n--) { + node = acl_alloc_node(context, level++); + acl_gen_mask(&bits, val[n] & msk[n], msk[n]); + acl_add_ptr(context, prev, node, &bits); + prev = node; + } + + *pend = prev; + return root; +} + +static struct rte_acl_node * +build_trie(struct acl_build_context *context, struct rte_acl_build_rule *head, + struct rte_acl_build_rule **last, uint32_t *count) +{ + uint32_t n, m; + int field_index, node_count; + struct rte_acl_node *trie; + struct rte_acl_build_rule *prev, *rule; + struct rte_acl_node *end, *merge, *root, *end_prev; + const struct rte_acl_field *fld; + + prev = head; + rule = head; + *last = prev; + + trie = acl_alloc_node(context, 0); + + while (rule != NULL) { + + root = acl_alloc_node(context, 0); + + root->ref_count = 1; + end = root; + + for (n = 0; n < rule->config->num_fields; n++) { + + field_index = rule->config->defs[n].field_index; + fld = rule->f->field + field_index; + end_prev = end; + + /* build a mini-trie for this field */ + switch (rule->config->defs[n].type) { + + case RTE_ACL_FIELD_TYPE_BITMASK: + merge = acl_gen_mask_trie(context, + &fld->value, + &fld->mask_range, + rule->config->defs[n].size, + end->level + 1, + &end); + break; + + case RTE_ACL_FIELD_TYPE_MASK: + { + /* + * set msb for the size of the field and + * all higher bits. + */ + uint64_t mask; + mask = RTE_ACL_MASKLEN_TO_BITMASK( + fld->mask_range.u32, + rule->config->defs[n].size); + + /* gen a mini-trie for this field */ + merge = acl_gen_mask_trie(context, + &fld->value, + (char *)&mask, + rule->config->defs[n].size, + end->level + 1, + &end); + } + break; + + case RTE_ACL_FIELD_TYPE_RANGE: + merge = acl_gen_range_trie(context, + &rule->f->field[field_index].value, + &rule->f->field[field_index].mask_range, + rule->config->defs[n].size, + end->level + 1, + &end); + break; + + default: + RTE_LOG(ERR, ACL, + "Error in rule[%u] type - %hhu\n", + rule->f->data.userdata, + rule->config->defs[n].type); + return NULL; + } + + /* merge this field on to the end of the rule */ + if (acl_merge_trie(context, end_prev, merge, 0, + NULL) != 0) { + return NULL; + } + } + + end->match_flag = ++context->num_build_rules; + + /* + * Setup the results for this rule. + * The result and priority of each category. + */ + if (end->mrt == NULL) + end->mrt = acl_build_alloc(context, 1, + sizeof(*end->mrt)); + + for (m = context->cfg.num_categories; 0 != m--; ) { + if (rule->f->data.category_mask & (1 << m)) { + end->mrt->results[m] = rule->f->data.userdata; + end->mrt->priority[m] = rule->f->data.priority; + } else { + end->mrt->results[m] = 0; + end->mrt->priority[m] = 0; + } + } + + node_count = context->num_nodes; + (*count)++; + + /* merge this rule into the trie */ + if (acl_merge_trie(context, trie, root, 0, NULL)) + return NULL; + + node_count = context->num_nodes - node_count; + if (node_count > context->cur_node_max) { + *last = prev; + return trie; + } + + prev = rule; + rule = rule->next; + } + + *last = NULL; + return trie; +} + +static void +acl_calc_wildness(struct rte_acl_build_rule *head, + const struct rte_acl_config *config) +{ + uint32_t n; + struct rte_acl_build_rule *rule; + + for (rule = head; rule != NULL; rule = rule->next) { + + for (n = 0; n < config->num_fields; n++) { + + double wild = 0; + uint32_t bit_len = CHAR_BIT * config->defs[n].size; + uint64_t msk_val = RTE_LEN2MASK(bit_len, + typeof(msk_val)); + double size = bit_len; + int field_index = config->defs[n].field_index; + const struct rte_acl_field *fld = rule->f->field + + field_index; + + switch (rule->config->defs[n].type) { + case RTE_ACL_FIELD_TYPE_BITMASK: + wild = (size - __builtin_popcountll( + fld->mask_range.u64 & msk_val)) / + size; + break; + + case RTE_ACL_FIELD_TYPE_MASK: + wild = (size - fld->mask_range.u32) / size; + break; + + case RTE_ACL_FIELD_TYPE_RANGE: + wild = (fld->mask_range.u64 & msk_val) - + (fld->value.u64 & msk_val); + wild = wild / msk_val; + break; + } + + rule->wildness[field_index] = (uint32_t)(wild * 100); + } + } +} + +static void +acl_rule_stats(struct rte_acl_build_rule *head, struct rte_acl_config *config) +{ + struct rte_acl_build_rule *rule; + uint32_t n, m, fields_deactivated = 0; + uint32_t start = 0, deactivate = 0; + int tally[RTE_ACL_MAX_LEVELS][TALLY_NUM]; + + memset(tally, 0, sizeof(tally)); + + for (rule = head; rule != NULL; rule = rule->next) { + + for (n = 0; n < config->num_fields; n++) { + uint32_t field_index = config->defs[n].field_index; + + tally[n][TALLY_0]++; + for (m = 1; m < RTE_DIM(wild_limits); m++) { + if (rule->wildness[field_index] >= + wild_limits[m]) + tally[n][m]++; + } + } + + for (n = config->num_fields - 1; n > 0; n--) { + uint32_t field_index = config->defs[n].field_index; + + if (rule->wildness[field_index] == 100) + tally[n][TALLY_DEPTH]++; + else + break; + } + } + + /* + * Look for any field that is always wild and drop it from the config + * Only deactivate if all fields for a given input loop are deactivated. + */ + for (n = 1; n < config->num_fields; n++) { + if (config->defs[n].input_index != + config->defs[n - 1].input_index) { + for (m = start; m < n; m++) + tally[m][TALLY_DEACTIVATED] = deactivate; + fields_deactivated += deactivate; + start = n; + deactivate = 1; + } + + /* if the field is not always completely wild */ + if (tally[n][TALLY_100] != tally[n][TALLY_0]) + deactivate = 0; + } + + for (m = start; m < n; m++) + tally[m][TALLY_DEACTIVATED] = deactivate; + + fields_deactivated += deactivate; + + /* remove deactivated fields */ + if (fields_deactivated) { + uint32_t k, l = 0; + + for (k = 0; k < config->num_fields; k++) { + if (tally[k][TALLY_DEACTIVATED] == 0) { + memmove(&tally[l][0], &tally[k][0], + TALLY_NUM * sizeof(tally[0][0])); + memmove(&config->defs[l++], + &config->defs[k], + sizeof(struct rte_acl_field_def)); + } + } + config->num_fields = l; + } +} + +static int +rule_cmp_wildness(struct rte_acl_build_rule *r1, struct rte_acl_build_rule *r2) +{ + uint32_t n; + + for (n = 1; n < r1->config->num_fields; n++) { + int field_index = r1->config->defs[n].field_index; + + if (r1->wildness[field_index] != r2->wildness[field_index]) + return r1->wildness[field_index] - + r2->wildness[field_index]; + } + return 0; +} + +/* + * Split the rte_acl_build_rule list into two lists. + */ +static void +rule_list_split(struct rte_acl_build_rule *source, + struct rte_acl_build_rule **list_a, + struct rte_acl_build_rule **list_b) +{ + struct rte_acl_build_rule *fast; + struct rte_acl_build_rule *slow; + + if (source == NULL || source->next == NULL) { + /* length < 2 cases */ + *list_a = source; + *list_b = NULL; + } else { + slow = source; + fast = source->next; + /* Advance 'fast' two nodes, and advance 'slow' one node */ + while (fast != NULL) { + fast = fast->next; + if (fast != NULL) { + slow = slow->next; + fast = fast->next; + } + } + /* 'slow' is before the midpoint in the list, so split it in two + at that point. */ + *list_a = source; + *list_b = slow->next; + slow->next = NULL; + } +} + +/* + * Merge two sorted lists. + */ +static struct rte_acl_build_rule * +rule_list_sorted_merge(struct rte_acl_build_rule *a, + struct rte_acl_build_rule *b) +{ + struct rte_acl_build_rule *result = NULL; + struct rte_acl_build_rule **last_next = &result; + + while (1) { + if (a == NULL) { + *last_next = b; + break; + } else if (b == NULL) { + *last_next = a; + break; + } + if (rule_cmp_wildness(a, b) >= 0) { + *last_next = a; + last_next = &a->next; + a = a->next; + } else { + *last_next = b; + last_next = &b->next; + b = b->next; + } + } + return result; +} + +/* + * Sort list of rules based on the rules wildness. + * Use recursive mergesort algorithm. + */ +static struct rte_acl_build_rule * +sort_rules(struct rte_acl_build_rule *head) +{ + struct rte_acl_build_rule *a; + struct rte_acl_build_rule *b; + + /* Base case -- length 0 or 1 */ + if (head == NULL || head->next == NULL) + return head; + + /* Split head into 'a' and 'b' sublists */ + rule_list_split(head, &a, &b); + + /* Recursively sort the sublists */ + a = sort_rules(a); + b = sort_rules(b); + + /* answer = merge the two sorted lists together */ + return rule_list_sorted_merge(a, b); +} + +static uint32_t +acl_build_index(const struct rte_acl_config *config, uint32_t *data_index) +{ + uint32_t n, m; + int32_t last_header; + + m = 0; + last_header = -1; + + for (n = 0; n < config->num_fields; n++) { + if (last_header != config->defs[n].input_index) { + last_header = config->defs[n].input_index; + data_index[m++] = config->defs[n].offset; + } + } + + return m; +} + +static struct rte_acl_build_rule * +build_one_trie(struct acl_build_context *context, + struct rte_acl_build_rule *rule_sets[RTE_ACL_MAX_TRIES], + uint32_t n, int32_t node_max) +{ + struct rte_acl_build_rule *last; + struct rte_acl_config *config; + + config = rule_sets[n]->config; + + acl_rule_stats(rule_sets[n], config); + rule_sets[n] = sort_rules(rule_sets[n]); + + context->tries[n].type = RTE_ACL_FULL_TRIE; + context->tries[n].count = 0; + + context->tries[n].num_data_indexes = acl_build_index(config, + context->data_indexes[n]); + context->tries[n].data_index = context->data_indexes[n]; + + context->cur_node_max = node_max; + + context->bld_tries[n].trie = build_trie(context, rule_sets[n], + &last, &context->tries[n].count); + + return last; +} + +static int +acl_build_tries(struct acl_build_context *context, + struct rte_acl_build_rule *head) +{ + uint32_t n, num_tries; + struct rte_acl_config *config; + struct rte_acl_build_rule *last; + struct rte_acl_build_rule *rule_sets[RTE_ACL_MAX_TRIES]; + + config = head->config; + rule_sets[0] = head; + + /* initialize tries */ + for (n = 0; n < RTE_DIM(context->tries); n++) { + context->tries[n].type = RTE_ACL_UNUSED_TRIE; + context->bld_tries[n].trie = NULL; + context->tries[n].count = 0; + } + + context->tries[0].type = RTE_ACL_FULL_TRIE; + + /* calc wildness of each field of each rule */ + acl_calc_wildness(head, config); + + for (n = 0;; n = num_tries) { + + num_tries = n + 1; + + last = build_one_trie(context, rule_sets, n, context->node_max); + if (context->bld_tries[n].trie == NULL) { + RTE_LOG(ERR, ACL, "Build of %u-th trie failed\n", n); + return -ENOMEM; + } + + /* Build of the last trie completed. */ + if (last == NULL) + break; + + if (num_tries == RTE_DIM(context->tries)) { + RTE_LOG(ERR, ACL, + "Exceeded max number of tries: %u\n", + num_tries); + return -ENOMEM; + } + + /* Trie is getting too big, split remaining rule set. */ + rule_sets[num_tries] = last->next; + last->next = NULL; + acl_free_node(context, context->bld_tries[n].trie); + + /* Create a new copy of config for remaining rules. */ + config = acl_build_alloc(context, 1, sizeof(*config)); + memcpy(config, rule_sets[n]->config, sizeof(*config)); + + /* Make remaining rules use new config. */ + for (head = rule_sets[num_tries]; head != NULL; + head = head->next) + head->config = config; + + /* + * Rebuild the trie for the reduced rule-set. + * Don't try to split it any further. + */ + last = build_one_trie(context, rule_sets, n, INT32_MAX); + if (context->bld_tries[n].trie == NULL || last != NULL) { + RTE_LOG(ERR, ACL, "Build of %u-th trie failed\n", n); + return -ENOMEM; + } + + } + + context->num_tries = num_tries; + return 0; +} + +static void +acl_build_log(const struct acl_build_context *ctx) +{ + uint32_t n; + + RTE_LOG(DEBUG, ACL, "Build phase for ACL \"%s\":\n" + "node limit for tree split: %u\n" + "nodes created: %u\n" + "memory consumed: %zu\n", + ctx->acx->name, + ctx->node_max, + ctx->num_nodes, + ctx->pool.alloc); + + for (n = 0; n < RTE_DIM(ctx->tries); n++) { + if (ctx->tries[n].count != 0) + RTE_LOG(DEBUG, ACL, + "trie %u: number of rules: %u, indexes: %u\n", + n, ctx->tries[n].count, + ctx->tries[n].num_data_indexes); + } +} + +static int +acl_build_rules(struct acl_build_context *bcx) +{ + struct rte_acl_build_rule *br, *head; + const struct rte_acl_rule *rule; + uint32_t *wp; + uint32_t fn, i, n, num; + size_t ofs, sz; + + fn = bcx->cfg.num_fields; + n = bcx->acx->num_rules; + ofs = n * sizeof(*br); + sz = ofs + n * fn * sizeof(*wp); + + br = tb_alloc(&bcx->pool, sz); + + wp = (uint32_t *)((uintptr_t)br + ofs); + num = 0; + head = NULL; + + for (i = 0; i != n; i++) { + rule = (const struct rte_acl_rule *) + ((uintptr_t)bcx->acx->rules + bcx->acx->rule_sz * i); + if ((rule->data.category_mask & bcx->category_mask) != 0) { + br[num].next = head; + br[num].config = &bcx->cfg; + br[num].f = rule; + br[num].wildness = wp; + wp += fn; + head = br + num; + num++; + } + } + + bcx->num_rules = num; + bcx->build_rules = head; + + return 0; +} + +/* + * Copy data_indexes for each trie into RT location. + */ +static void +acl_set_data_indexes(struct rte_acl_ctx *ctx) +{ + uint32_t i, n, ofs; + + ofs = 0; + for (i = 0; i != ctx->num_tries; i++) { + n = ctx->trie[i].num_data_indexes; + memcpy(ctx->data_indexes + ofs, ctx->trie[i].data_index, + n * sizeof(ctx->data_indexes[0])); + ctx->trie[i].data_index = ctx->data_indexes + ofs; + ofs += RTE_ACL_MAX_FIELDS; + } +} + +/* + * Internal routine, performs 'build' phase of trie generation: + * - setups build context. + * - analizes given set of rules. + * - builds internal tree(s). + */ +static int +acl_bld(struct acl_build_context *bcx, struct rte_acl_ctx *ctx, + const struct rte_acl_config *cfg, uint32_t node_max) +{ + int32_t rc; + + /* setup build context. */ + memset(bcx, 0, sizeof(*bcx)); + bcx->acx = ctx; + bcx->pool.alignment = ACL_POOL_ALIGN; + bcx->pool.min_alloc = ACL_POOL_ALLOC_MIN; + bcx->cfg = *cfg; + bcx->category_mask = RTE_LEN2MASK(bcx->cfg.num_categories, + typeof(bcx->category_mask)); + bcx->node_max = node_max; + + rc = sigsetjmp(bcx->pool.fail, 0); + + /* build phase runs out of memory. */ + if (rc != 0) { + RTE_LOG(ERR, ACL, + "ACL context: %s, %s() failed with error code: %d\n", + bcx->acx->name, __func__, rc); + return rc; + } + + /* Create a build rules copy. */ + rc = acl_build_rules(bcx); + if (rc != 0) + return rc; + + /* No rules to build for that context+config */ + if (bcx->build_rules == NULL) { + rc = -EINVAL; + } else { + /* build internal trie representation. */ + rc = acl_build_tries(bcx, bcx->build_rules); + } + return rc; +} + +/* + * Check that parameters for acl_build() are valid. + */ +static int +acl_check_bld_param(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg) +{ + static const size_t field_sizes[] = { + sizeof(uint8_t), sizeof(uint16_t), + sizeof(uint32_t), sizeof(uint64_t), + }; + + uint32_t i, j; + + if (ctx == NULL || cfg == NULL || cfg->num_categories == 0 || + cfg->num_categories > RTE_ACL_MAX_CATEGORIES || + cfg->num_fields == 0 || + cfg->num_fields > RTE_ACL_MAX_FIELDS) + return -EINVAL; + + for (i = 0; i != cfg->num_fields; i++) { + if (cfg->defs[i].type > RTE_ACL_FIELD_TYPE_BITMASK) { + RTE_LOG(ERR, ACL, + "ACL context: %s, invalid type: %hhu for %u-th field\n", + ctx->name, cfg->defs[i].type, i); + return -EINVAL; + } + for (j = 0; + j != RTE_DIM(field_sizes) && + cfg->defs[i].size != field_sizes[j]; + j++) + ; + + if (j == RTE_DIM(field_sizes)) { + RTE_LOG(ERR, ACL, + "ACL context: %s, invalid size: %hhu for %u-th field\n", + ctx->name, cfg->defs[i].size, i); + return -EINVAL; + } + } + + return 0; +} + +int +rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg) +{ + int32_t rc; + uint32_t n; + size_t max_size; + struct acl_build_context bcx; + + rc = acl_check_bld_param(ctx, cfg); + if (rc != 0) + return rc; + + acl_build_reset(ctx); + + if (cfg->max_size == 0) { + n = NODE_MIN; + max_size = SIZE_MAX; + } else { + n = NODE_MAX; + max_size = cfg->max_size; + } + + for (rc = -ERANGE; n >= NODE_MIN && rc == -ERANGE; n /= 2) { + + /* perform build phase. */ + rc = acl_bld(&bcx, ctx, cfg, n); + + if (rc == 0) { + /* allocate and fill run-time structures. */ + rc = rte_acl_gen(ctx, bcx.tries, bcx.bld_tries, + bcx.num_tries, bcx.cfg.num_categories, + RTE_ACL_MAX_FIELDS * RTE_DIM(bcx.tries) * + sizeof(ctx->data_indexes[0]), max_size); + if (rc == 0) { + /* set data indexes. */ + acl_set_data_indexes(ctx); + + /* copy in build config. */ + ctx->config = *cfg; + } + } + + acl_build_log(&bcx); + + /* cleanup after build. */ + tb_free_pool(&bcx.pool); + } + + return rc; +} diff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c new file mode 100644 index 00000000..ea557ab9 --- /dev/null +++ b/lib/librte_acl/acl_gen.c @@ -0,0 +1,561 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <rte_acl.h> +#include "acl.h" + +#define QRANGE_MIN ((uint8_t)INT8_MIN) + +#define RTE_ACL_VERIFY(exp) do { \ + if (!(exp)) \ + rte_panic("line %d\tassert \"" #exp "\" failed\n", __LINE__); \ +} while (0) + +struct acl_node_counters { + int32_t match; + int32_t match_used; + int32_t single; + int32_t quad; + int32_t quad_vectors; + int32_t dfa; + int32_t dfa_gr64; +}; + +struct rte_acl_indices { + int32_t dfa_index; + int32_t quad_index; + int32_t single_index; + int32_t match_index; + int32_t match_start; +}; + +static void +acl_gen_log_stats(const struct rte_acl_ctx *ctx, + const struct acl_node_counters *counts, + const struct rte_acl_indices *indices, + size_t max_size) +{ + RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n" + "runtime memory footprint on socket %d:\n" + "single nodes/bytes used: %d/%zu\n" + "quad nodes/vectors/bytes used: %d/%d/%zu\n" + "DFA nodes/group64/bytes used: %d/%d/%zu\n" + "match nodes/bytes used: %d/%zu\n" + "total: %zu bytes\n" + "max limit: %zu bytes\n", + ctx->name, ctx->socket_id, + counts->single, counts->single * sizeof(uint64_t), + counts->quad, counts->quad_vectors, + (indices->quad_index - indices->dfa_index) * sizeof(uint64_t), + counts->dfa, counts->dfa_gr64, + indices->dfa_index * sizeof(uint64_t), + counts->match, + counts->match * sizeof(struct rte_acl_match_results), + ctx->mem_sz, + max_size); +} + +static uint64_t +acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index) +{ + uint64_t idx; + uint32_t i; + + idx = 0; + for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) { + RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM); + RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout); + idx |= (i - node->dfa_gr64[i]) << + (6 + RTE_ACL_DFA_GR64_BIT * i); + } + + return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type; +} + +static void +acl_dfa_fill_gr64(const struct rte_acl_node *node, + const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE]) +{ + uint32_t i; + + for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) { + memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE, + src + i * RTE_ACL_DFA_GR64_SIZE, + RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0])); + } +} + +static uint32_t +acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE], + uint8_t gr64[RTE_ACL_DFA_GR64_NUM]) +{ + uint32_t i, j, k; + + k = 0; + for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) { + gr64[i] = i; + for (j = 0; j != i; j++) { + if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE, + array_ptr + j * RTE_ACL_DFA_GR64_SIZE, + RTE_ACL_DFA_GR64_SIZE * + sizeof(array_ptr[0])) == 0) + break; + } + gr64[i] = (j != i) ? gr64[j] : k++; + } + + return k; +} + +static uint32_t +acl_node_fill_dfa(const struct rte_acl_node *node, + uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved) +{ + uint32_t n, x; + uint32_t ranges, last_bit; + struct rte_acl_node *child; + struct rte_acl_bitset *bits; + + ranges = 0; + last_bit = 0; + + for (n = 0; n < RTE_ACL_DFA_SIZE; n++) + dfa[n] = no_match; + + for (x = 0; x < node->num_ptrs; x++) { + + child = node->ptrs[x].ptr; + if (child == NULL) + continue; + + bits = &node->ptrs[x].values; + for (n = 0; n < RTE_ACL_DFA_SIZE; n++) { + + if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] & + (1 << (n % (sizeof(bits_t) * CHAR_BIT)))) { + + dfa[n] = resolved ? child->node_index : x; + ranges += (last_bit == 0); + last_bit = 1; + } else { + last_bit = 0; + } + } + } + + return ranges; +} + +/* +* Counts the number of groups of sequential bits that are +* either 0 or 1, as specified by the zero_one parameter. This is used to +* calculate the number of ranges in a node to see if it fits in a quad range +* node. +*/ +static int +acl_count_sequential_groups(struct rte_acl_bitset *bits, int zero_one) +{ + int n, ranges, last_bit; + + ranges = 0; + last_bit = zero_one ^ 1; + + for (n = QRANGE_MIN; n < UINT8_MAX + 1; n++) { + if (bits->bits[n / (sizeof(bits_t) * 8)] & + (1 << (n % (sizeof(bits_t) * 8)))) { + if (zero_one == 1 && last_bit != 1) + ranges++; + last_bit = 1; + } else { + if (zero_one == 0 && last_bit != 0) + ranges++; + last_bit = 0; + } + } + for (n = 0; n < QRANGE_MIN; n++) { + if (bits->bits[n / (sizeof(bits_t) * 8)] & + (1 << (n % (sizeof(bits_t) * 8)))) { + if (zero_one == 1 && last_bit != 1) + ranges++; + last_bit = 1; + } else { + if (zero_one == 0 && last_bit != 0) + ranges++; + last_bit = 0; + } + } + + return ranges; +} + +/* + * Count number of ranges spanned by the node's pointers + */ +static int +acl_count_fanout(struct rte_acl_node *node) +{ + uint32_t n; + int ranges; + + if (node->fanout != 0) + return node->fanout; + + ranges = acl_count_sequential_groups(&node->values, 0); + + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) + ranges += acl_count_sequential_groups( + &node->ptrs[n].values, 1); + } + + node->fanout = ranges; + return node->fanout; +} + +/* + * Determine the type of nodes and count each type + */ +static void +acl_count_trie_types(struct acl_node_counters *counts, + struct rte_acl_node *node, uint64_t no_match, int force_dfa) +{ + uint32_t n; + int num_ptrs; + uint64_t dfa[RTE_ACL_DFA_SIZE]; + + /* skip if this node has been counted */ + if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED) + return; + + if (node->match_flag != 0 || node->num_ptrs == 0) { + counts->match++; + node->node_type = RTE_ACL_NODE_MATCH; + return; + } + + num_ptrs = acl_count_fanout(node); + + /* Force type to dfa */ + if (force_dfa) + num_ptrs = RTE_ACL_DFA_SIZE; + + /* determine node type based on number of ranges */ + if (num_ptrs == 1) { + counts->single++; + node->node_type = RTE_ACL_NODE_SINGLE; + } else if (num_ptrs <= RTE_ACL_QUAD_MAX) { + counts->quad++; + counts->quad_vectors += node->fanout; + node->node_type = RTE_ACL_NODE_QRANGE; + } else { + counts->dfa++; + node->node_type = RTE_ACL_NODE_DFA; + if (force_dfa != 0) { + /* always expand to a max number of nodes. */ + for (n = 0; n != RTE_DIM(node->dfa_gr64); n++) + node->dfa_gr64[n] = n; + node->fanout = n; + } else { + acl_node_fill_dfa(node, dfa, no_match, 0); + node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64); + } + counts->dfa_gr64 += node->fanout; + } + + /* + * recursively count the types of all children + */ + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) + acl_count_trie_types(counts, node->ptrs[n].ptr, + no_match, 0); + } +} + +static void +acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match, + int resolved) +{ + uint32_t x; + int32_t m; + uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE]; + + acl_node_fill_dfa(node, dfa, no_match, resolved); + + /* + * Rather than going from 0 to 256, the range count and + * the layout are from 80-ff then 0-7f due to signed compare + * for SSE (cmpgt). + */ + if (node->node_type == RTE_ACL_NODE_QRANGE) { + + m = 0; + node_a = node_array; + index = dfa[QRANGE_MIN]; + *node_a++ = index; + + for (x = QRANGE_MIN + 1; x < UINT8_MAX + 1; x++) { + if (dfa[x] != index) { + index = dfa[x]; + *node_a++ = index; + node->transitions[m++] = (uint8_t)(x - 1); + } + } + + for (x = 0; x < INT8_MAX + 1; x++) { + if (dfa[x] != index) { + index = dfa[x]; + *node_a++ = index; + node->transitions[m++] = (uint8_t)(x - 1); + } + } + + /* fill unused locations with max value - nothing is greater */ + for (; m < RTE_ACL_QUAD_SIZE; m++) + node->transitions[m] = INT8_MAX; + + RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE); + + } else if (node->node_type == RTE_ACL_NODE_DFA && resolved) { + acl_dfa_fill_gr64(node, dfa, node_array); + } +} + +/* + * Routine that allocates space for this node and recursively calls + * to allocate space for each child. Once all the children are allocated, + * then resolve all transitions for this node. + */ +static void +acl_gen_node(struct rte_acl_node *node, uint64_t *node_array, + uint64_t no_match, struct rte_acl_indices *index, int num_categories) +{ + uint32_t n, sz, *qtrp; + uint64_t *array_ptr; + struct rte_acl_match_results *match; + + if (node->node_index != RTE_ACL_NODE_UNDEFINED) + return; + + array_ptr = NULL; + + switch (node->node_type) { + case RTE_ACL_NODE_DFA: + array_ptr = &node_array[index->dfa_index]; + node->node_index = acl_dfa_gen_idx(node, index->dfa_index); + sz = node->fanout * RTE_ACL_DFA_GR64_SIZE; + index->dfa_index += sz; + for (n = 0; n < sz; n++) + array_ptr[n] = no_match; + break; + case RTE_ACL_NODE_SINGLE: + node->node_index = RTE_ACL_QUAD_SINGLE | index->single_index | + node->node_type; + array_ptr = &node_array[index->single_index]; + index->single_index += 1; + array_ptr[0] = no_match; + break; + case RTE_ACL_NODE_QRANGE: + array_ptr = &node_array[index->quad_index]; + acl_add_ptrs(node, array_ptr, no_match, 0); + qtrp = (uint32_t *)node->transitions; + node->node_index = qtrp[0]; + node->node_index <<= sizeof(index->quad_index) * CHAR_BIT; + node->node_index |= index->quad_index | node->node_type; + index->quad_index += node->fanout; + break; + case RTE_ACL_NODE_MATCH: + match = ((struct rte_acl_match_results *) + (node_array + index->match_start)); + for (n = 0; n != RTE_DIM(match->results); n++) + RTE_ACL_VERIFY(match->results[0] == 0); + memcpy(match + index->match_index, node->mrt, + sizeof(*node->mrt)); + node->node_index = index->match_index | node->node_type; + index->match_index += 1; + break; + case RTE_ACL_NODE_UNDEFINED: + RTE_ACL_VERIFY(node->node_type != + (uint32_t)RTE_ACL_NODE_UNDEFINED); + break; + } + + /* recursively allocate space for all children */ + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) + acl_gen_node(node->ptrs[n].ptr, + node_array, + no_match, + index, + num_categories); + } + + /* All children are resolved, resolve this node's pointers */ + switch (node->node_type) { + case RTE_ACL_NODE_DFA: + acl_add_ptrs(node, array_ptr, no_match, 1); + break; + case RTE_ACL_NODE_SINGLE: + for (n = 0; n < node->num_ptrs; n++) { + if (node->ptrs[n].ptr != NULL) + array_ptr[0] = node->ptrs[n].ptr->node_index; + } + break; + case RTE_ACL_NODE_QRANGE: + acl_add_ptrs(node, array_ptr, no_match, 1); + break; + case RTE_ACL_NODE_MATCH: + break; + case RTE_ACL_NODE_UNDEFINED: + RTE_ACL_VERIFY(node->node_type != + (uint32_t)RTE_ACL_NODE_UNDEFINED); + break; + } +} + +static void +acl_calc_counts_indices(struct acl_node_counters *counts, + struct rte_acl_indices *indices, + struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries, + uint64_t no_match) +{ + uint32_t n; + + memset(indices, 0, sizeof(*indices)); + memset(counts, 0, sizeof(*counts)); + + /* Get stats on nodes */ + for (n = 0; n < num_tries; n++) { + acl_count_trie_types(counts, node_bld_trie[n].trie, + no_match, 1); + } + + indices->dfa_index = RTE_ACL_DFA_SIZE + 1; + indices->quad_index = indices->dfa_index + + counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE; + indices->single_index = indices->quad_index + counts->quad_vectors; + indices->match_start = indices->single_index + counts->single + 1; + indices->match_start = RTE_ALIGN(indices->match_start, + (XMM_SIZE / sizeof(uint64_t))); + indices->match_index = 1; +} + +/* + * Generate the runtime structure using build structure + */ +int +rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie, + struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries, + uint32_t num_categories, uint32_t data_index_sz, size_t max_size) +{ + void *mem; + size_t total_size; + uint64_t *node_array, no_match; + uint32_t n, match_index; + struct rte_acl_match_results *match; + struct acl_node_counters counts; + struct rte_acl_indices indices; + + no_match = RTE_ACL_NODE_MATCH; + + /* Fill counts and indices arrays from the nodes. */ + acl_calc_counts_indices(&counts, &indices, + node_bld_trie, num_tries, no_match); + + /* Allocate runtime memory (align to cache boundary) */ + total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) + + indices.match_start * sizeof(uint64_t) + + (counts.match + 1) * sizeof(struct rte_acl_match_results) + + XMM_SIZE; + + if (total_size > max_size) { + RTE_LOG(DEBUG, ACL, + "Gen phase for ACL ctx \"%s\" exceeds max_size limit, " + "bytes required: %zu, allowed: %zu\n", + ctx->name, total_size, max_size); + return -ERANGE; + } + + mem = rte_zmalloc_socket(ctx->name, total_size, RTE_CACHE_LINE_SIZE, + ctx->socket_id); + if (mem == NULL) { + RTE_LOG(ERR, ACL, + "allocation of %zu bytes on socket %d for %s failed\n", + total_size, ctx->socket_id, ctx->name); + return -ENOMEM; + } + + /* Fill the runtime structure */ + match_index = indices.match_start; + node_array = (uint64_t *)((uintptr_t)mem + + RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE)); + + /* + * Setup the NOMATCH node (a SINGLE at the + * highest index, that points to itself) + */ + + node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE; + + for (n = 0; n < RTE_ACL_DFA_SIZE; n++) + node_array[n] = no_match; + + /* NOMATCH result at index 0 */ + match = ((struct rte_acl_match_results *)(node_array + match_index)); + memset(match, 0, sizeof(*match)); + + for (n = 0; n < num_tries; n++) { + + acl_gen_node(node_bld_trie[n].trie, node_array, no_match, + &indices, num_categories); + + if (node_bld_trie[n].trie->node_index == no_match) + trie[n].root_index = 0; + else + trie[n].root_index = node_bld_trie[n].trie->node_index; + } + + ctx->mem = mem; + ctx->mem_sz = total_size; + ctx->data_indexes = mem; + ctx->num_tries = num_tries; + ctx->num_categories = num_categories; + ctx->match_index = match_index; + ctx->no_match = no_match; + ctx->idle = node_array[RTE_ACL_DFA_SIZE]; + ctx->trans_table = node_array; + memcpy(ctx->trie, trie, sizeof(ctx->trie)); + + acl_gen_log_stats(ctx, &counts, &indices, max_size); + return 0; +} diff --git a/lib/librte_acl/acl_run.h b/lib/librte_acl/acl_run.h new file mode 100644 index 00000000..b2fc42c6 --- /dev/null +++ b/lib/librte_acl/acl_run.h @@ -0,0 +1,263 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ACL_RUN_H_ +#define _ACL_RUN_H_ + +#include <rte_acl.h> +#include "acl.h" + +#define MAX_SEARCHES_AVX16 16 +#define MAX_SEARCHES_SSE8 8 +#define MAX_SEARCHES_SSE4 4 +#define MAX_SEARCHES_SCALAR 2 + +#define GET_NEXT_4BYTES(prm, idx) \ + (*((const int32_t *)((prm)[(idx)].data + *(prm)[idx].data_index++))) + + +#define RTE_ACL_NODE_INDEX ((uint32_t)~RTE_ACL_NODE_TYPE) + +#define SCALAR_QRANGE_MULT 0x01010101 +#define SCALAR_QRANGE_MASK 0x7f7f7f7f +#define SCALAR_QRANGE_MIN 0x80808080 + +/* + * Structure to manage N parallel trie traversals. + * The runtime trie traversal routines can process 8, 4, or 2 tries + * in parallel. Each packet may require multiple trie traversals (up to 4). + * This structure is used to fill the slots (0 to n-1) for parallel processing + * with the trie traversals needed for each packet. + */ +struct acl_flow_data { + uint32_t num_packets; + /* number of packets processed */ + uint32_t started; + /* number of trie traversals in progress */ + uint32_t trie; + /* current trie index (0 to N-1) */ + uint32_t cmplt_size; + uint32_t total_packets; + uint32_t categories; + /* number of result categories per packet. */ + /* maximum number of packets to process */ + const uint64_t *trans; + const uint8_t **data; + uint32_t *results; + struct completion *last_cmplt; + struct completion *cmplt_array; +}; + +/* + * Structure to maintain running results for + * a single packet (up to 4 tries). + */ +struct completion { + uint32_t *results; /* running results. */ + int32_t priority[RTE_ACL_MAX_CATEGORIES]; /* running priorities. */ + uint32_t count; /* num of remaining tries */ + /* true for allocated struct */ +} __attribute__((aligned(XMM_SIZE))); + +/* + * One parms structure for each slot in the search engine. + */ +struct parms { + const uint8_t *data; + /* input data for this packet */ + const uint32_t *data_index; + /* data indirection for this trie */ + struct completion *cmplt; + /* completion data for this packet */ +}; + +/* + * Define an global idle node for unused engine slots + */ +static const uint32_t idle[UINT8_MAX + 1]; + +/* + * Allocate a completion structure to manage the tries for a packet. + */ +static inline struct completion * +alloc_completion(struct completion *p, uint32_t size, uint32_t tries, + uint32_t *results) +{ + uint32_t n; + + for (n = 0; n < size; n++) { + + if (p[n].count == 0) { + + /* mark as allocated and set number of tries. */ + p[n].count = tries; + p[n].results = results; + return &(p[n]); + } + } + + /* should never get here */ + return NULL; +} + +/* + * Resolve priority for a single result trie. + */ +static inline void +resolve_single_priority(uint64_t transition, int n, + const struct rte_acl_ctx *ctx, struct parms *parms, + const struct rte_acl_match_results *p) +{ + if (parms[n].cmplt->count == ctx->num_tries || + parms[n].cmplt->priority[0] <= + p[transition].priority[0]) { + + parms[n].cmplt->priority[0] = p[transition].priority[0]; + parms[n].cmplt->results[0] = p[transition].results[0]; + } +} + +/* + * Routine to fill a slot in the parallel trie traversal array (parms) from + * the list of packets (flows). + */ +static inline uint64_t +acl_start_next_trie(struct acl_flow_data *flows, struct parms *parms, int n, + const struct rte_acl_ctx *ctx) +{ + uint64_t transition; + + /* if there are any more packets to process */ + if (flows->num_packets < flows->total_packets) { + parms[n].data = flows->data[flows->num_packets]; + parms[n].data_index = ctx->trie[flows->trie].data_index; + + /* if this is the first trie for this packet */ + if (flows->trie == 0) { + flows->last_cmplt = alloc_completion(flows->cmplt_array, + flows->cmplt_size, ctx->num_tries, + flows->results + + flows->num_packets * flows->categories); + } + + /* set completion parameters and starting index for this slot */ + parms[n].cmplt = flows->last_cmplt; + transition = + flows->trans[parms[n].data[*parms[n].data_index++] + + ctx->trie[flows->trie].root_index]; + + /* + * if this is the last trie for this packet, + * then setup next packet. + */ + flows->trie++; + if (flows->trie >= ctx->num_tries) { + flows->trie = 0; + flows->num_packets++; + } + + /* keep track of number of active trie traversals */ + flows->started++; + + /* no more tries to process, set slot to an idle position */ + } else { + transition = ctx->idle; + parms[n].data = (const uint8_t *)idle; + parms[n].data_index = idle; + } + return transition; +} + +static inline void +acl_set_flow(struct acl_flow_data *flows, struct completion *cmplt, + uint32_t cmplt_size, const uint8_t **data, uint32_t *results, + uint32_t data_num, uint32_t categories, const uint64_t *trans) +{ + flows->num_packets = 0; + flows->started = 0; + flows->trie = 0; + flows->last_cmplt = NULL; + flows->cmplt_array = cmplt; + flows->total_packets = data_num; + flows->categories = categories; + flows->cmplt_size = cmplt_size; + flows->data = data; + flows->results = results; + flows->trans = trans; +} + +typedef void (*resolve_priority_t) +(uint64_t transition, int n, const struct rte_acl_ctx *ctx, + struct parms *parms, const struct rte_acl_match_results *p, + uint32_t categories); + +/* + * Detect matches. If a match node transition is found, then this trie + * traversal is complete and fill the slot with the next trie + * to be processed. + */ +static inline uint64_t +acl_match_check(uint64_t transition, int slot, + const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, resolve_priority_t resolve_priority) +{ + const struct rte_acl_match_results *p; + + p = (const struct rte_acl_match_results *) + (flows->trans + ctx->match_index); + + if (transition & RTE_ACL_NODE_MATCH) { + + /* Remove flags from index and decrement active traversals */ + transition &= RTE_ACL_NODE_INDEX; + flows->started--; + + /* Resolve priorities for this trie and running results */ + if (flows->categories == 1) + resolve_single_priority(transition, slot, ctx, + parms, p); + else + resolve_priority(transition, slot, ctx, parms, + p, flows->categories); + + /* Count down completed tries for this search request */ + parms[slot].cmplt->count--; + + /* Fill the slot with the next trie or idle trie */ + transition = acl_start_next_trie(flows, parms, slot, ctx); + } + + return transition; +} + +#endif /* _ACL_RUN_H_ */ diff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c new file mode 100644 index 00000000..79ebbd6c --- /dev/null +++ b/lib/librte_acl/acl_run_avx2.c @@ -0,0 +1,54 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "acl_run_avx2.h" + +/* + * Note, that to be able to use AVX2 classify method, + * both compiler and target cpu have to support AVX2 instructions. + */ +int +rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + if (likely(num >= MAX_SEARCHES_AVX16)) + return search_avx2x16(ctx, data, results, num, categories); + else if (num >= MAX_SEARCHES_SSE8) + return search_sse_8(ctx, data, results, num, categories); + else if (num >= MAX_SEARCHES_SSE4) + return search_sse_4(ctx, data, results, num, categories); + else + return rte_acl_classify_scalar(ctx, data, results, num, + categories); +} diff --git a/lib/librte_acl/acl_run_avx2.h b/lib/librte_acl/acl_run_avx2.h new file mode 100644 index 00000000..b01a46a5 --- /dev/null +++ b/lib/librte_acl/acl_run_avx2.h @@ -0,0 +1,284 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "acl_run_sse.h" + +static const rte_ymm_t ymm_match_mask = { + .u32 = { + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + }, +}; + +static const rte_ymm_t ymm_index_mask = { + .u32 = { + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + }, +}; + +static const rte_ymm_t ymm_shuffle_input = { + .u32 = { + 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, + 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, + }, +}; + +static const rte_ymm_t ymm_ones_16 = { + .u16 = { + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + }, +}; + +static const rte_ymm_t ymm_range_base = { + .u32 = { + 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, + 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, + }, +}; + +/* + * Process 8 transitions in parallel. + * tr_lo contains low 32 bits for 8 transition. + * tr_hi contains high 32 bits for 8 transition. + * next_input contains up to 4 input bytes for 8 flows. + */ +static inline __attribute__((always_inline)) ymm_t +transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi) +{ + const int32_t *tr; + ymm_t addr; + + tr = (const int32_t *)(uintptr_t)trans; + + /* Calculate the address (array index) for all 8 transitions. */ + ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input, + ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y, + *tr_lo, *tr_hi); + + /* load lower 32 bits of 8 transactions at once. */ + *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0])); + + next_input = _mm256_srli_epi32(next_input, CHAR_BIT); + + /* load high 32 bits of 8 transactions at once. */ + *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0])); + + return next_input; +} + +/* + * Process matches for 8 flows. + * tr_lo contains low 32 bits for 8 transition. + * tr_hi contains high 32 bits for 8 transition. + */ +static inline void +acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx, + struct parms *parms, struct acl_flow_data *flows, uint32_t slot, + ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi) +{ + ymm_t t0, t1; + ymm_t lo, hi; + xmm_t l0, l1; + uint32_t i; + uint64_t tr[MAX_SEARCHES_SSE8]; + + l1 = _mm256_extracti128_si256(*tr_lo, 1); + l0 = _mm256_castsi256_si128(*tr_lo); + + for (i = 0; i != RTE_DIM(tr) / 2; i++) { + + /* + * Extract low 32bits of each transition. + * That's enough to process the match. + */ + tr[i] = (uint32_t)_mm_cvtsi128_si32(l0); + tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1); + + l0 = _mm_srli_si128(l0, sizeof(uint32_t)); + l1 = _mm_srli_si128(l1, sizeof(uint32_t)); + + tr[i] = acl_match_check(tr[i], slot + i, + ctx, parms, flows, resolve_priority_sse); + tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4, + ctx, parms, flows, resolve_priority_sse); + } + + /* Collect new transitions into 2 YMM registers. */ + t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]); + t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]); + + /* For each transition: put low 32 into tr_lo and high 32 into tr_hi */ + ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi); + + /* Keep transitions wth NOMATCH intact. */ + *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches); + *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches); +} + +static inline void +acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, uint32_t slot, + ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask) +{ + uint32_t msk; + ymm_t matches, temp; + + /* test for match node */ + temp = _mm256_and_si256(match_mask, *tr_lo); + matches = _mm256_cmpeq_epi32(temp, match_mask); + msk = _mm256_movemask_epi8(matches); + + while (msk != 0) { + + acl_process_matches_avx2x8(ctx, parms, flows, slot, + matches, tr_lo, tr_hi); + temp = _mm256_and_si256(match_mask, *tr_lo); + matches = _mm256_cmpeq_epi32(temp, match_mask); + msk = _mm256_movemask_epi8(matches); + } +} + +/* + * Execute trie traversal for up to 16 flows in parallel. + */ +static inline int +search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t total_packets, uint32_t categories) +{ + uint32_t n; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_AVX16]; + struct completion cmplt[MAX_SEARCHES_AVX16]; + struct parms parms[MAX_SEARCHES_AVX16]; + ymm_t input[2], tr_lo[2], tr_hi[2]; + ymm_t t0, t1; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < RTE_DIM(cmplt); n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + t0 = _mm256_set_epi64x(index_array[5], index_array[4], + index_array[1], index_array[0]); + t1 = _mm256_set_epi64x(index_array[7], index_array[6], + index_array[3], index_array[2]); + + ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]); + + t0 = _mm256_set_epi64x(index_array[13], index_array[12], + index_array[9], index_array[8]); + t1 = _mm256_set_epi64x(index_array[15], index_array[14], + index_array[11], index_array[10]); + + ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]); + + /* Check for any matches. */ + acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0], + ymm_match_mask.y); + acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1], + ymm_match_mask.y); + + while (flows.started > 0) { + + uint32_t in[MAX_SEARCHES_SSE8]; + + /* Gather 4 bytes of input data for first 8 flows. */ + in[0] = GET_NEXT_4BYTES(parms, 0); + in[4] = GET_NEXT_4BYTES(parms, 4); + in[1] = GET_NEXT_4BYTES(parms, 1); + in[5] = GET_NEXT_4BYTES(parms, 5); + in[2] = GET_NEXT_4BYTES(parms, 2); + in[6] = GET_NEXT_4BYTES(parms, 6); + in[3] = GET_NEXT_4BYTES(parms, 3); + in[7] = GET_NEXT_4BYTES(parms, 7); + input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4], + in[3], in[2], in[1], in[0]); + + /* Gather 4 bytes of input data for last 8 flows. */ + in[0] = GET_NEXT_4BYTES(parms, 8); + in[4] = GET_NEXT_4BYTES(parms, 12); + in[1] = GET_NEXT_4BYTES(parms, 9); + in[5] = GET_NEXT_4BYTES(parms, 13); + in[2] = GET_NEXT_4BYTES(parms, 10); + in[6] = GET_NEXT_4BYTES(parms, 14); + in[3] = GET_NEXT_4BYTES(parms, 11); + in[7] = GET_NEXT_4BYTES(parms, 15); + input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4], + in[3], in[2], in[1], in[0]); + + input[0] = transition8(input[0], flows.trans, + &tr_lo[0], &tr_hi[0]); + input[1] = transition8(input[1], flows.trans, + &tr_lo[1], &tr_hi[1]); + + input[0] = transition8(input[0], flows.trans, + &tr_lo[0], &tr_hi[0]); + input[1] = transition8(input[1], flows.trans, + &tr_lo[1], &tr_hi[1]); + + input[0] = transition8(input[0], flows.trans, + &tr_lo[0], &tr_hi[0]); + input[1] = transition8(input[1], flows.trans, + &tr_lo[1], &tr_hi[1]); + + input[0] = transition8(input[0], flows.trans, + &tr_lo[0], &tr_hi[0]); + input[1] = transition8(input[1], flows.trans, + &tr_lo[1], &tr_hi[1]); + + /* Check for any matches. */ + acl_match_check_avx2x8(ctx, parms, &flows, 0, + &tr_lo[0], &tr_hi[0], ymm_match_mask.y); + acl_match_check_avx2x8(ctx, parms, &flows, 8, + &tr_lo[1], &tr_hi[1], ymm_match_mask.y); + } + + return 0; +} diff --git a/lib/librte_acl/acl_run_neon.c b/lib/librte_acl/acl_run_neon.c new file mode 100644 index 00000000..b0144514 --- /dev/null +++ b/lib/librte_acl/acl_run_neon.c @@ -0,0 +1,46 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2015. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "acl_run_neon.h" + +int +rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + if (likely(num >= 8)) + return search_neon_8(ctx, data, results, num, categories); + else if (num >= 4) + return search_neon_4(ctx, data, results, num, categories); + else + return rte_acl_classify_scalar(ctx, data, results, num, + categories); +} diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h new file mode 100644 index 00000000..d233ff00 --- /dev/null +++ b/lib/librte_acl/acl_run_neon.h @@ -0,0 +1,289 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2015. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "acl_run.h" +#include "acl_vect.h" + +struct _neon_acl_const { + rte_xmm_t xmm_shuffle_input; + rte_xmm_t xmm_index_mask; + rte_xmm_t range_base; +} neon_acl_const __attribute__((aligned(RTE_CACHE_LINE_SIZE))) = { + { + .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c} + }, + { + .u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX} + }, + { + .u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c} + }, +}; + +/* + * Resolve priority for multiple results (neon version). + * This consists comparing the priority of the current traversal with the + * running set of results for the packet. + * For each result, keep a running array of the result (rule number) and + * its priority for each category. + */ +static inline void +resolve_priority_neon(uint64_t transition, int n, const struct rte_acl_ctx *ctx, + struct parms *parms, + const struct rte_acl_match_results *p, + uint32_t categories) +{ + uint32_t x; + int32x4_t results, priority, results1, priority1; + uint32x4_t selector; + int32_t *saved_results, *saved_priority; + + for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) { + saved_results = (int32_t *)(&parms[n].cmplt->results[x]); + saved_priority = (int32_t *)(&parms[n].cmplt->priority[x]); + + /* get results and priorities for completed trie */ + results = vld1q_s32( + (const int32_t *)&p[transition].results[x]); + priority = vld1q_s32( + (const int32_t *)&p[transition].priority[x]); + + /* if this is not the first completed trie */ + if (parms[n].cmplt->count != ctx->num_tries) { + /* get running best results and their priorities */ + results1 = vld1q_s32(saved_results); + priority1 = vld1q_s32(saved_priority); + + /* select results that are highest priority */ + selector = vcgtq_s32(priority1, priority); + results = vbslq_s32(selector, results1, results); + priority = vbslq_s32(selector, priority1, priority); + } + + /* save running best results and their priorities */ + vst1q_s32(saved_results, results); + vst1q_s32(saved_priority, priority); + } +} + +/* + * Check for any match in 4 transitions + */ +static inline __attribute__((always_inline)) uint32_t +check_any_match_x4(uint64_t val[]) +{ + return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH; +} + +static inline __attribute__((always_inline)) void +acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, uint64_t transitions[]) +{ + while (check_any_match_x4(transitions)) { + transitions[0] = acl_match_check(transitions[0], slot, ctx, + parms, flows, resolve_priority_neon); + transitions[1] = acl_match_check(transitions[1], slot + 1, ctx, + parms, flows, resolve_priority_neon); + transitions[2] = acl_match_check(transitions[2], slot + 2, ctx, + parms, flows, resolve_priority_neon); + transitions[3] = acl_match_check(transitions[3], slot + 3, ctx, + parms, flows, resolve_priority_neon); + } +} + +/* + * Process 4 transitions (in 2 NEON Q registers) in parallel + */ +static inline __attribute__((always_inline)) int32x4_t +transition4(int32x4_t next_input, const uint64_t *trans, uint64_t transitions[]) +{ + int32x4x2_t tr_hi_lo; + int32x4_t t, in, r; + uint32x4_t index_msk, node_type, addr; + uint32x4_t dfa_msk, mask, quad_ofs, dfa_ofs; + + /* Move low 32 into tr_hi_lo.val[0] and high 32 into tr_hi_lo.val[1] */ + tr_hi_lo = vld2q_s32((const int32_t *)transitions); + + /* Calculate the address (array index) for all 4 transitions. */ + + index_msk = vld1q_u32((const uint32_t *)&neon_acl_const.xmm_index_mask); + + /* Calc node type and node addr */ + node_type = vbicq_s32(tr_hi_lo.val[0], index_msk); + addr = vandq_s32(tr_hi_lo.val[0], index_msk); + + /* t = 0 */ + t = veorq_s32(node_type, node_type); + + /* mask for DFA type(0) nodes */ + dfa_msk = vceqq_u32(node_type, t); + + mask = vld1q_s32((const int32_t *)&neon_acl_const.xmm_shuffle_input); + in = vqtbl1q_u8((uint8x16_t)next_input, (uint8x16_t)mask); + + /* DFA calculations. */ + r = vshrq_n_u32(in, 30); /* div by 64 */ + mask = vld1q_s32((const int32_t *)&neon_acl_const.range_base); + r = vaddq_u8(r, mask); + t = vshrq_n_u32(in, 24); + r = vqtbl1q_u8((uint8x16_t)tr_hi_lo.val[1], (uint8x16_t)r); + dfa_ofs = vsubq_s32(t, r); + + /* QUAD/SINGLE calculations. */ + t = vcgtq_s8(in, tr_hi_lo.val[1]); + t = vabsq_s8(t); + t = vpaddlq_u8(t); + quad_ofs = vpaddlq_u16(t); + + /* blend DFA and QUAD/SINGLE. */ + t = vbslq_u8(dfa_msk, dfa_ofs, quad_ofs); + + /* calculate address for next transitions */ + addr = vaddq_u32(addr, t); + + /* Fill next transitions */ + transitions[0] = trans[vgetq_lane_u32(addr, 0)]; + transitions[1] = trans[vgetq_lane_u32(addr, 1)]; + transitions[2] = trans[vgetq_lane_u32(addr, 2)]; + transitions[3] = trans[vgetq_lane_u32(addr, 3)]; + + return vshrq_n_u32(next_input, CHAR_BIT); +} + +/* + * Execute trie traversal with 8 traversals in parallel + */ +static inline int +search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[8]; + struct completion cmplt[8]; + struct parms parms[8]; + int32x4_t input0, input1; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < 8; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]); + acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]); + + while (flows.started > 0) { + /* Gather 4 bytes of input data for each stream. */ + input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0); + input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0); + + input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1); + input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1); + + input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2); + input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2); + + input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3); + input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3); + + /* Process the 4 bytes of input on each stream. */ + + input0 = transition4(input0, flows.trans, &index_array[0]); + input1 = transition4(input1, flows.trans, &index_array[4]); + + input0 = transition4(input0, flows.trans, &index_array[0]); + input1 = transition4(input1, flows.trans, &index_array[4]); + + input0 = transition4(input0, flows.trans, &index_array[0]); + input1 = transition4(input1, flows.trans, &index_array[4]); + + input0 = transition4(input0, flows.trans, &index_array[0]); + input1 = transition4(input1, flows.trans, &index_array[4]); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]); + acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]); + } + + return 0; +} + +/* + * Execute trie traversal with 4 traversals in parallel + */ +static inline int +search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, int total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[4]; + struct completion cmplt[4]; + struct parms parms[4]; + int32x4_t input; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < 4; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, index_array); + + while (flows.started > 0) { + /* Gather 4 bytes of input data for each stream. */ + input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0); + input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1); + input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2); + input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3); + + /* Process the 4 bytes of input on each stream. */ + input = transition4(input, flows.trans, index_array); + input = transition4(input, flows.trans, index_array); + input = transition4(input, flows.trans, index_array); + input = transition4(input, flows.trans, index_array); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, index_array); + } + + return 0; +} diff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c new file mode 100644 index 00000000..5be216c6 --- /dev/null +++ b/lib/librte_acl/acl_run_scalar.c @@ -0,0 +1,192 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "acl_run.h" + +/* + * Resolve priority for multiple results (scalar version). + * This consists comparing the priority of the current traversal with the + * running set of results for the packet. + * For each result, keep a running array of the result (rule number) and + * its priority for each category. + */ +static inline void +resolve_priority_scalar(uint64_t transition, int n, + const struct rte_acl_ctx *ctx, struct parms *parms, + const struct rte_acl_match_results *p, uint32_t categories) +{ + uint32_t i; + int32_t *saved_priority; + uint32_t *saved_results; + const int32_t *priority; + const uint32_t *results; + + saved_results = parms[n].cmplt->results; + saved_priority = parms[n].cmplt->priority; + + /* results and priorities for completed trie */ + results = p[transition].results; + priority = p[transition].priority; + + /* if this is not the first completed trie */ + if (parms[n].cmplt->count != ctx->num_tries) { + for (i = 0; i < categories; i += RTE_ACL_RESULTS_MULTIPLIER) { + + if (saved_priority[i] <= priority[i]) { + saved_priority[i] = priority[i]; + saved_results[i] = results[i]; + } + if (saved_priority[i + 1] <= priority[i + 1]) { + saved_priority[i + 1] = priority[i + 1]; + saved_results[i + 1] = results[i + 1]; + } + if (saved_priority[i + 2] <= priority[i + 2]) { + saved_priority[i + 2] = priority[i + 2]; + saved_results[i + 2] = results[i + 2]; + } + if (saved_priority[i + 3] <= priority[i + 3]) { + saved_priority[i + 3] = priority[i + 3]; + saved_results[i + 3] = results[i + 3]; + } + } + } else { + for (i = 0; i < categories; i += RTE_ACL_RESULTS_MULTIPLIER) { + saved_priority[i] = priority[i]; + saved_priority[i + 1] = priority[i + 1]; + saved_priority[i + 2] = priority[i + 2]; + saved_priority[i + 3] = priority[i + 3]; + + saved_results[i] = results[i]; + saved_results[i + 1] = results[i + 1]; + saved_results[i + 2] = results[i + 2]; + saved_results[i + 3] = results[i + 3]; + } + } +} + +static inline uint32_t +scan_forward(uint32_t input, uint32_t max) +{ + return (input == 0) ? max : rte_bsf32(input); +} + +static inline uint64_t +scalar_transition(const uint64_t *trans_table, uint64_t transition, + uint8_t input) +{ + uint32_t addr, index, ranges, x, a, b, c; + + /* break transition into component parts */ + ranges = transition >> (sizeof(index) * CHAR_BIT); + index = transition & ~RTE_ACL_NODE_INDEX; + addr = transition ^ index; + + if (index != RTE_ACL_NODE_DFA) { + /* calc address for a QRANGE/SINGLE node */ + c = (uint32_t)input * SCALAR_QRANGE_MULT; + a = ranges | SCALAR_QRANGE_MIN; + a -= (c & SCALAR_QRANGE_MASK); + b = c & SCALAR_QRANGE_MIN; + a &= SCALAR_QRANGE_MIN; + a ^= (ranges ^ b) & (a ^ b); + x = scan_forward(a, 32) >> 3; + } else { + /* calc address for a DFA node */ + x = ranges >> (input / + RTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT); + x &= UINT8_MAX; + x = input - x; + } + + addr += x; + + /* pickup next transition */ + transition = *(trans_table + addr); + return transition; +} + +int +rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + int n; + uint64_t transition0, transition1; + uint32_t input0, input1; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_SCALAR]; + struct completion cmplt[MAX_SEARCHES_SCALAR]; + struct parms parms[MAX_SEARCHES_SCALAR]; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, num, + categories, ctx->trans_table); + + for (n = 0; n < MAX_SEARCHES_SCALAR; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + transition0 = index_array[0]; + transition1 = index_array[1]; + + while ((transition0 | transition1) & RTE_ACL_NODE_MATCH) { + transition0 = acl_match_check(transition0, + 0, ctx, parms, &flows, resolve_priority_scalar); + transition1 = acl_match_check(transition1, + 1, ctx, parms, &flows, resolve_priority_scalar); + } + + while (flows.started > 0) { + + input0 = GET_NEXT_4BYTES(parms, 0); + input1 = GET_NEXT_4BYTES(parms, 1); + + for (n = 0; n < 4; n++) { + + transition0 = scalar_transition(flows.trans, + transition0, (uint8_t)input0); + input0 >>= CHAR_BIT; + + transition1 = scalar_transition(flows.trans, + transition1, (uint8_t)input1); + input1 >>= CHAR_BIT; + } + + while ((transition0 | transition1) & RTE_ACL_NODE_MATCH) { + transition0 = acl_match_check(transition0, + 0, ctx, parms, &flows, resolve_priority_scalar); + transition1 = acl_match_check(transition1, + 1, ctx, parms, &flows, resolve_priority_scalar); + } + } + return 0; +} diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c new file mode 100644 index 00000000..a5a7d361 --- /dev/null +++ b/lib/librte_acl/acl_run_sse.c @@ -0,0 +1,47 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "acl_run_sse.h" + +int +rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + if (likely(num >= MAX_SEARCHES_SSE8)) + return search_sse_8(ctx, data, results, num, categories); + else if (num >= MAX_SEARCHES_SSE4) + return search_sse_4(ctx, data, results, num, categories); + else + return rte_acl_classify_scalar(ctx, data, results, num, + categories); +} diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h new file mode 100644 index 00000000..ad40a674 --- /dev/null +++ b/lib/librte_acl/acl_run_sse.h @@ -0,0 +1,357 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "acl_run.h" +#include "acl_vect.h" + +enum { + SHUFFLE32_SLOT1 = 0xe5, + SHUFFLE32_SLOT2 = 0xe6, + SHUFFLE32_SLOT3 = 0xe7, + SHUFFLE32_SWAP64 = 0x4e, +}; + +static const rte_xmm_t xmm_shuffle_input = { + .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}, +}; + +static const rte_xmm_t xmm_ones_16 = { + .u16 = {1, 1, 1, 1, 1, 1, 1, 1}, +}; + +static const rte_xmm_t xmm_match_mask = { + .u32 = { + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + }, +}; + +static const rte_xmm_t xmm_index_mask = { + .u32 = { + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + }, +}; + +static const rte_xmm_t xmm_range_base = { + .u32 = { + 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, + }, +}; + +/* + * Resolve priority for multiple results (sse version). + * This consists comparing the priority of the current traversal with the + * running set of results for the packet. + * For each result, keep a running array of the result (rule number) and + * its priority for each category. + */ +static inline void +resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx, + struct parms *parms, const struct rte_acl_match_results *p, + uint32_t categories) +{ + uint32_t x; + xmm_t results, priority, results1, priority1, selector; + xmm_t *saved_results, *saved_priority; + + for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) { + + saved_results = (xmm_t *)(&parms[n].cmplt->results[x]); + saved_priority = + (xmm_t *)(&parms[n].cmplt->priority[x]); + + /* get results and priorities for completed trie */ + results = _mm_loadu_si128( + (const xmm_t *)&p[transition].results[x]); + priority = _mm_loadu_si128( + (const xmm_t *)&p[transition].priority[x]); + + /* if this is not the first completed trie */ + if (parms[n].cmplt->count != ctx->num_tries) { + + /* get running best results and their priorities */ + results1 = _mm_loadu_si128(saved_results); + priority1 = _mm_loadu_si128(saved_priority); + + /* select results that are highest priority */ + selector = _mm_cmpgt_epi32(priority1, priority); + results = _mm_blendv_epi8(results, results1, selector); + priority = _mm_blendv_epi8(priority, priority1, + selector); + } + + /* save running best results and their priorities */ + _mm_storeu_si128(saved_results, results); + _mm_storeu_si128(saved_priority, priority); + } +} + +/* + * Extract transitions from an XMM register and check for any matches + */ +static void +acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx, + struct parms *parms, struct acl_flow_data *flows) +{ + uint64_t transition1, transition2; + + /* extract transition from low 64 bits. */ + transition1 = _mm_cvtsi128_si64(*indices); + + /* extract transition from high 64 bits. */ + *indices = _mm_shuffle_epi32(*indices, SHUFFLE32_SWAP64); + transition2 = _mm_cvtsi128_si64(*indices); + + transition1 = acl_match_check(transition1, slot, ctx, + parms, flows, resolve_priority_sse); + transition2 = acl_match_check(transition2, slot + 1, ctx, + parms, flows, resolve_priority_sse); + + /* update indices with new transitions. */ + *indices = _mm_set_epi64x(transition2, transition1); +} + +/* + * Check for any match in 4 transitions (contained in 2 SSE registers) + */ +static inline __attribute__((always_inline)) void +acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, + struct acl_flow_data *flows, xmm_t *indices1, xmm_t *indices2, + xmm_t match_mask) +{ + xmm_t temp; + + /* put low 32 bits of each transition into one register */ + temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2, + 0x88); + /* test for match node */ + temp = _mm_and_si128(match_mask, temp); + + while (!_mm_testz_si128(temp, temp)) { + acl_process_matches(indices1, slot, ctx, parms, flows); + acl_process_matches(indices2, slot + 2, ctx, parms, flows); + + temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, + (__m128)*indices2, + 0x88); + temp = _mm_and_si128(match_mask, temp); + } +} + +/* + * Process 4 transitions (in 2 XMM registers) in parallel + */ +static inline __attribute__((always_inline)) xmm_t +transition4(xmm_t next_input, const uint64_t *trans, + xmm_t *indices1, xmm_t *indices2) +{ + xmm_t addr, tr_lo, tr_hi; + uint64_t trans0, trans2; + + /* Shuffle low 32 into tr_lo and high 32 into tr_hi */ + ACL_TR_HILO(mm, __m128, *indices1, *indices2, tr_lo, tr_hi); + + /* Calculate the address (array index) for all 4 transitions. */ + ACL_TR_CALC_ADDR(mm, 128, addr, xmm_index_mask.x, next_input, + xmm_shuffle_input.x, xmm_ones_16.x, xmm_range_base.x, + tr_lo, tr_hi); + + /* Gather 64 bit transitions and pack back into 2 registers. */ + + trans0 = trans[_mm_cvtsi128_si32(addr)]; + + /* get slot 2 */ + + /* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */ + addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT2); + trans2 = trans[_mm_cvtsi128_si32(addr)]; + + /* get slot 1 */ + + /* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */ + addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT1); + *indices1 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans0); + + /* get slot 3 */ + + /* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */ + addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT3); + *indices2 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans2); + + return _mm_srli_epi32(next_input, CHAR_BIT); +} + +/* + * Execute trie traversal with 8 traversals in parallel + */ +static inline int +search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_SSE8]; + struct completion cmplt[MAX_SEARCHES_SSE8]; + struct parms parms[MAX_SEARCHES_SSE8]; + xmm_t input0, input1; + xmm_t indices1, indices2, indices3, indices4; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < MAX_SEARCHES_SSE8; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + /* + * indices1 contains index_array[0,1] + * indices2 contains index_array[2,3] + * indices3 contains index_array[4,5] + * indices4 contains index_array[6,7] + */ + + indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]); + indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]); + + indices3 = _mm_loadu_si128((xmm_t *) &index_array[4]); + indices4 = _mm_loadu_si128((xmm_t *) &index_array[6]); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + &indices1, &indices2, xmm_match_mask.x); + acl_match_check_x4(4, ctx, parms, &flows, + &indices3, &indices4, xmm_match_mask.x); + + while (flows.started > 0) { + + /* Gather 4 bytes of input data for each stream. */ + input0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0)); + input1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4)); + + input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 1), 1); + input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 5), 1); + + input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 2), 2); + input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 6), 2); + + input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 3), 3); + input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 7), 3); + + /* Process the 4 bytes of input on each stream. */ + + input0 = transition4(input0, flows.trans, + &indices1, &indices2); + input1 = transition4(input1, flows.trans, + &indices3, &indices4); + + input0 = transition4(input0, flows.trans, + &indices1, &indices2); + input1 = transition4(input1, flows.trans, + &indices3, &indices4); + + input0 = transition4(input0, flows.trans, + &indices1, &indices2); + input1 = transition4(input1, flows.trans, + &indices3, &indices4); + + input0 = transition4(input0, flows.trans, + &indices1, &indices2); + input1 = transition4(input1, flows.trans, + &indices3, &indices4); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + &indices1, &indices2, xmm_match_mask.x); + acl_match_check_x4(4, ctx, parms, &flows, + &indices3, &indices4, xmm_match_mask.x); + } + + return 0; +} + +/* + * Execute trie traversal with 4 traversals in parallel + */ +static inline int +search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, int total_packets, uint32_t categories) +{ + int n; + struct acl_flow_data flows; + uint64_t index_array[MAX_SEARCHES_SSE4]; + struct completion cmplt[MAX_SEARCHES_SSE4]; + struct parms parms[MAX_SEARCHES_SSE4]; + xmm_t input, indices1, indices2; + + acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, + total_packets, categories, ctx->trans_table); + + for (n = 0; n < MAX_SEARCHES_SSE4; n++) { + cmplt[n].count = 0; + index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); + } + + indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]); + indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + &indices1, &indices2, xmm_match_mask.x); + + while (flows.started > 0) { + + /* Gather 4 bytes of input data for each stream. */ + input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0)); + input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 1), 1); + input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 2), 2); + input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 3), 3); + + /* Process the 4 bytes of input on each stream. */ + input = transition4(input, flows.trans, &indices1, &indices2); + input = transition4(input, flows.trans, &indices1, &indices2); + input = transition4(input, flows.trans, &indices1, &indices2); + input = transition4(input, flows.trans, &indices1, &indices2); + + /* Check for any matches. */ + acl_match_check_x4(0, ctx, parms, &flows, + &indices1, &indices2, xmm_match_mask.x); + } + + return 0; +} diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h new file mode 100644 index 00000000..6cc19997 --- /dev/null +++ b/lib/librte_acl/acl_vect.h @@ -0,0 +1,116 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_ACL_VECT_H_ +#define _RTE_ACL_VECT_H_ + +/** + * @file + * + * RTE ACL SSE/AVX related header. + */ + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Takes 2 SIMD registers containing N transitions eachi (tr0, tr1). + * Shuffles it into different representation: + * lo - contains low 32 bits of given N transitions. + * hi - contains high 32 bits of given N transitions. + */ +#define ACL_TR_HILO(P, TC, tr0, tr1, lo, hi) do { \ + lo = (typeof(lo))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0x88); \ + hi = (typeof(hi))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0xdd); \ +} while (0) + + +/* + * Calculate the address of the next transition for + * all types of nodes. Note that only DFA nodes and range + * nodes actually transition to another node. Match + * nodes not supposed to be encountered here. + * For quad range nodes: + * Calculate number of range boundaries that are less than the + * input value. Range boundaries for each node are in signed 8 bit, + * ordered from -128 to 127. + * This is effectively a popcnt of bytes that are greater than the + * input byte. + * Single nodes are processed in the same ways as quad range nodes. +*/ +#define ACL_TR_CALC_ADDR(P, S, \ + addr, index_mask, next_input, shuffle_input, \ + ones_16, range_base, tr_lo, tr_hi) do { \ + \ + typeof(addr) in, node_type, r, t; \ + typeof(addr) dfa_msk, dfa_ofs, quad_ofs; \ + \ + t = _##P##_xor_si##S(index_mask, index_mask); \ + in = _##P##_shuffle_epi8(next_input, shuffle_input); \ + \ + /* Calc node type and node addr */ \ + node_type = _##P##_andnot_si##S(index_mask, tr_lo); \ + addr = _##P##_and_si##S(index_mask, tr_lo); \ + \ + /* mask for DFA type(0) nodes */ \ + dfa_msk = _##P##_cmpeq_epi32(node_type, t); \ + \ + /* DFA calculations. */ \ + r = _##P##_srli_epi32(in, 30); \ + r = _##P##_add_epi8(r, range_base); \ + t = _##P##_srli_epi32(in, 24); \ + r = _##P##_shuffle_epi8(tr_hi, r); \ + \ + dfa_ofs = _##P##_sub_epi32(t, r); \ + \ + /* QUAD/SINGLE caluclations. */ \ + t = _##P##_cmpgt_epi8(in, tr_hi); \ + t = _##P##_sign_epi8(t, t); \ + t = _##P##_maddubs_epi16(t, t); \ + quad_ofs = _##P##_madd_epi16(t, ones_16); \ + \ + /* blend DFA and QUAD/SINGLE. */ \ + t = _##P##_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk); \ + \ + /* calculate address for next transitions. */ \ + addr = _##P##_add_epi32(addr, t); \ +} while (0) + + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ACL_VECT_H_ */ diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c new file mode 100644 index 00000000..4ba9786b --- /dev/null +++ b/lib/librte_acl/rte_acl.c @@ -0,0 +1,393 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <rte_acl.h> +#include "acl.h" + +TAILQ_HEAD(rte_acl_list, rte_tailq_entry); + +static struct rte_tailq_elem rte_acl_tailq = { + .name = "RTE_ACL", +}; +EAL_REGISTER_TAILQ(rte_acl_tailq) + +/* + * If the compiler doesn't support AVX2 instructions, + * then the dummy one would be used instead for AVX2 classify method. + */ +int __attribute__ ((weak)) +rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx, + __rte_unused const uint8_t **data, + __rte_unused uint32_t *results, + __rte_unused uint32_t num, + __rte_unused uint32_t categories) +{ + return -ENOTSUP; +} + +int __attribute__ ((weak)) +rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx, + __rte_unused const uint8_t **data, + __rte_unused uint32_t *results, + __rte_unused uint32_t num, + __rte_unused uint32_t categories) +{ + return -ENOTSUP; +} + +int __attribute__ ((weak)) +rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx, + __rte_unused const uint8_t **data, + __rte_unused uint32_t *results, + __rte_unused uint32_t num, + __rte_unused uint32_t categories) +{ + return -ENOTSUP; +} + +static const rte_acl_classify_t classify_fns[] = { + [RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar, + [RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar, + [RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse, + [RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2, + [RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon, +}; + +/* by default, use always available scalar code path. */ +static enum rte_acl_classify_alg rte_acl_default_classify = + RTE_ACL_CLASSIFY_SCALAR; + +static void +rte_acl_set_default_classify(enum rte_acl_classify_alg alg) +{ + rte_acl_default_classify = alg; +} + +extern int +rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx, enum rte_acl_classify_alg alg) +{ + if (ctx == NULL || (uint32_t)alg >= RTE_DIM(classify_fns)) + return -EINVAL; + + ctx->alg = alg; + return 0; +} + +/* + * Select highest available classify method as default one. + * Note that CLASSIFY_AVX2 should be set as a default only + * if both conditions are met: + * at build time compiler supports AVX2 and target cpu supports AVX2. + */ +static void __attribute__((constructor)) +rte_acl_init(void) +{ + enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT; + +#if defined(RTE_ARCH_ARM64) + alg = RTE_ACL_CLASSIFY_NEON; +#elif defined(RTE_ARCH_ARM) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) + alg = RTE_ACL_CLASSIFY_NEON; +#else +#ifdef CC_AVX2_SUPPORT + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) + alg = RTE_ACL_CLASSIFY_AVX2; + else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1)) +#else + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1)) +#endif + alg = RTE_ACL_CLASSIFY_SSE; + +#endif + rte_acl_set_default_classify(alg); +} + +int +rte_acl_classify_alg(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories, + enum rte_acl_classify_alg alg) +{ + if (categories != 1 && + ((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0) + return -EINVAL; + + return classify_fns[alg](ctx, data, results, num, categories); +} + +int +rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t num, uint32_t categories) +{ + return rte_acl_classify_alg(ctx, data, results, num, categories, + ctx->alg); +} + +struct rte_acl_ctx * +rte_acl_find_existing(const char *name) +{ + struct rte_acl_ctx *ctx = NULL; + struct rte_acl_list *acl_list; + struct rte_tailq_entry *te; + + acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, acl_list, next) { + ctx = (struct rte_acl_ctx *) te->data; + if (strncmp(name, ctx->name, sizeof(ctx->name)) == 0) + break; + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + + if (te == NULL) { + rte_errno = ENOENT; + return NULL; + } + return ctx; +} + +void +rte_acl_free(struct rte_acl_ctx *ctx) +{ + struct rte_acl_list *acl_list; + struct rte_tailq_entry *te; + + if (ctx == NULL) + return; + + acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list); + + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* find our tailq entry */ + TAILQ_FOREACH(te, acl_list, next) { + if (te->data == (void *) ctx) + break; + } + if (te == NULL) { + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return; + } + + TAILQ_REMOVE(acl_list, te, next); + + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + + rte_free(ctx->mem); + rte_free(ctx); + rte_free(te); +} + +struct rte_acl_ctx * +rte_acl_create(const struct rte_acl_param *param) +{ + size_t sz; + struct rte_acl_ctx *ctx; + struct rte_acl_list *acl_list; + struct rte_tailq_entry *te; + char name[sizeof(ctx->name)]; + + acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list); + + /* check that input parameters are valid. */ + if (param == NULL || param->name == NULL) { + rte_errno = EINVAL; + return NULL; + } + + snprintf(name, sizeof(name), "ACL_%s", param->name); + + /* calculate amount of memory required for pattern set. */ + sz = sizeof(*ctx) + param->max_rule_num * param->rule_size; + + /* get EAL TAILQ lock. */ + rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); + + /* if we already have one with that name */ + TAILQ_FOREACH(te, acl_list, next) { + ctx = (struct rte_acl_ctx *) te->data; + if (strncmp(param->name, ctx->name, sizeof(ctx->name)) == 0) + break; + } + + /* if ACL with such name doesn't exist, then create a new one. */ + if (te == NULL) { + ctx = NULL; + te = rte_zmalloc("ACL_TAILQ_ENTRY", sizeof(*te), 0); + + if (te == NULL) { + RTE_LOG(ERR, ACL, "Cannot allocate tailq entry!\n"); + goto exit; + } + + ctx = rte_zmalloc_socket(name, sz, RTE_CACHE_LINE_SIZE, param->socket_id); + + if (ctx == NULL) { + RTE_LOG(ERR, ACL, + "allocation of %zu bytes on socket %d for %s failed\n", + sz, param->socket_id, name); + rte_free(te); + goto exit; + } + /* init new allocated context. */ + ctx->rules = ctx + 1; + ctx->max_rules = param->max_rule_num; + ctx->rule_sz = param->rule_size; + ctx->socket_id = param->socket_id; + ctx->alg = rte_acl_default_classify; + snprintf(ctx->name, sizeof(ctx->name), "%s", param->name); + + te->data = (void *) ctx; + + TAILQ_INSERT_TAIL(acl_list, te, next); + } + +exit: + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + return ctx; +} + +static int +acl_add_rules(struct rte_acl_ctx *ctx, const void *rules, uint32_t num) +{ + uint8_t *pos; + + if (num + ctx->num_rules > ctx->max_rules) + return -ENOMEM; + + pos = ctx->rules; + pos += ctx->rule_sz * ctx->num_rules; + memcpy(pos, rules, num * ctx->rule_sz); + ctx->num_rules += num; + + return 0; +} + +static int +acl_check_rule(const struct rte_acl_rule_data *rd) +{ + if ((RTE_LEN2MASK(RTE_ACL_MAX_CATEGORIES, typeof(rd->category_mask)) & + rd->category_mask) == 0 || + rd->priority > RTE_ACL_MAX_PRIORITY || + rd->priority < RTE_ACL_MIN_PRIORITY || + rd->userdata == RTE_ACL_INVALID_USERDATA) + return -EINVAL; + return 0; +} + +int +rte_acl_add_rules(struct rte_acl_ctx *ctx, const struct rte_acl_rule *rules, + uint32_t num) +{ + const struct rte_acl_rule *rv; + uint32_t i; + int32_t rc; + + if (ctx == NULL || rules == NULL || 0 == ctx->rule_sz) + return -EINVAL; + + for (i = 0; i != num; i++) { + rv = (const struct rte_acl_rule *) + ((uintptr_t)rules + i * ctx->rule_sz); + rc = acl_check_rule(&rv->data); + if (rc != 0) { + RTE_LOG(ERR, ACL, "%s(%s): rule #%u is invalid\n", + __func__, ctx->name, i + 1); + return rc; + } + } + + return acl_add_rules(ctx, rules, num); +} + +/* + * Reset all rules. + * Note that RT structures are not affected. + */ +void +rte_acl_reset_rules(struct rte_acl_ctx *ctx) +{ + if (ctx != NULL) + ctx->num_rules = 0; +} + +/* + * Reset all rules and destroys RT structures. + */ +void +rte_acl_reset(struct rte_acl_ctx *ctx) +{ + if (ctx != NULL) { + rte_acl_reset_rules(ctx); + rte_acl_build(ctx, &ctx->config); + } +} + +/* + * Dump ACL context to the stdout. + */ +void +rte_acl_dump(const struct rte_acl_ctx *ctx) +{ + if (!ctx) + return; + printf("acl context <%s>@%p\n", ctx->name, ctx); + printf(" socket_id=%"PRId32"\n", ctx->socket_id); + printf(" alg=%"PRId32"\n", ctx->alg); + printf(" max_rules=%"PRIu32"\n", ctx->max_rules); + printf(" rule_size=%"PRIu32"\n", ctx->rule_sz); + printf(" num_rules=%"PRIu32"\n", ctx->num_rules); + printf(" num_categories=%"PRIu32"\n", ctx->num_categories); + printf(" num_tries=%"PRIu32"\n", ctx->num_tries); +} + +/* + * Dump all ACL contexts to the stdout. + */ +void +rte_acl_list_dump(void) +{ + struct rte_acl_ctx *ctx; + struct rte_acl_list *acl_list; + struct rte_tailq_entry *te; + + acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list); + + rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + TAILQ_FOREACH(te, acl_list, next) { + ctx = (struct rte_acl_ctx *) te->data; + rte_acl_dump(ctx); + } + rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); +} diff --git a/lib/librte_acl/rte_acl.h b/lib/librte_acl/rte_acl.h new file mode 100644 index 00000000..0979a098 --- /dev/null +++ b/lib/librte_acl/rte_acl.h @@ -0,0 +1,388 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_ACL_H_ +#define _RTE_ACL_H_ + +/** + * @file + * + * RTE Classifier. + */ + +#include <rte_acl_osdep.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define RTE_ACL_MAX_CATEGORIES 16 + +#define RTE_ACL_RESULTS_MULTIPLIER (XMM_SIZE / sizeof(uint32_t)) + +#define RTE_ACL_MAX_LEVELS 64 +#define RTE_ACL_MAX_FIELDS 64 + +union rte_acl_field_types { + uint8_t u8; + uint16_t u16; + uint32_t u32; + uint64_t u64; +}; + +enum { + RTE_ACL_FIELD_TYPE_MASK = 0, + RTE_ACL_FIELD_TYPE_RANGE, + RTE_ACL_FIELD_TYPE_BITMASK +}; + +/** + * ACL Field definition. + * Each field in the ACL rule has an associate definition. + * It defines the type of field, its size, its offset in the input buffer, + * the field index, and the input index. + * For performance reasons, the inner loop of the search function is unrolled + * to process four input bytes at a time. This requires the input to be grouped + * into sets of 4 consecutive bytes. The loop processes the first input byte as + * part of the setup and then subsequent bytes must be in groups of 4 + * consecutive bytes. + */ +struct rte_acl_field_def { + uint8_t type; /**< type - RTE_ACL_FIELD_TYPE_*. */ + uint8_t size; /**< size of field 1,2,4, or 8. */ + uint8_t field_index; /**< index of field inside the rule. */ + uint8_t input_index; /**< 0-N input index. */ + uint32_t offset; /**< offset to start of field. */ +}; + +/** + * ACL build configuration. + * Defines the fields of an ACL trie and number of categories to build with. + */ +struct rte_acl_config { + uint32_t num_categories; /**< Number of categories to build with. */ + uint32_t num_fields; /**< Number of field definitions. */ + struct rte_acl_field_def defs[RTE_ACL_MAX_FIELDS]; + /**< array of field definitions. */ + size_t max_size; + /**< max memory limit for internal run-time structures. */ +}; + +/** + * Defines the value of a field for a rule. + */ +struct rte_acl_field { + union rte_acl_field_types value; + /**< a 1,2,4, or 8 byte value of the field. */ + union rte_acl_field_types mask_range; + /**< + * depending on field type: + * mask -> 1.2.3.4/32 value=0x1020304, mask_range=32, + * range -> 0 : 65535 value=0, mask_range=65535, + * bitmask -> 0x06/0xff value=6, mask_range=0xff. + */ +}; + +enum { + RTE_ACL_TYPE_SHIFT = 29, + RTE_ACL_MAX_INDEX = RTE_LEN2MASK(RTE_ACL_TYPE_SHIFT, uint32_t), + RTE_ACL_MAX_PRIORITY = RTE_ACL_MAX_INDEX, + RTE_ACL_MIN_PRIORITY = 0, +}; + +#define RTE_ACL_INVALID_USERDATA 0 + +#define RTE_ACL_MASKLEN_TO_BITMASK(v, s) \ +((v) == 0 ? (v) : (typeof(v))((uint64_t)-1 << ((s) * CHAR_BIT - (v)))) + +/** + * Miscellaneous data for ACL rule. + */ +struct rte_acl_rule_data { + uint32_t category_mask; /**< Mask of categories for that rule. */ + int32_t priority; /**< Priority for that rule. */ + uint32_t userdata; /**< Associated with the rule user data. */ +}; + +/** + * Defines single ACL rule. + * data - miscellaneous data for the rule. + * field[] - value and mask or range for each field. + */ +#define RTE_ACL_RULE_DEF(name, fld_num) struct name {\ + struct rte_acl_rule_data data; \ + struct rte_acl_field field[fld_num]; \ +} + +RTE_ACL_RULE_DEF(rte_acl_rule, 0); + +#define RTE_ACL_RULE_SZ(fld_num) \ + (sizeof(struct rte_acl_rule) + sizeof(struct rte_acl_field) * (fld_num)) + + +/** Max number of characters in name.*/ +#define RTE_ACL_NAMESIZE 32 + +/** + * Parameters used when creating the ACL context. + */ +struct rte_acl_param { + const char *name; /**< Name of the ACL context. */ + int socket_id; /**< Socket ID to allocate memory for. */ + uint32_t rule_size; /**< Size of each rule. */ + uint32_t max_rule_num; /**< Maximum number of rules. */ +}; + + +/** + * Create a new ACL context. + * + * @param param + * Parameters used to create and initialise the ACL context. + * @return + * Pointer to ACL context structure that is used in future ACL + * operations, or NULL on error, with error code set in rte_errno. + * Possible rte_errno errors include: + * - EINVAL - invalid parameter passed to function + */ +struct rte_acl_ctx * +rte_acl_create(const struct rte_acl_param *param); + +/** + * Find an existing ACL context object and return a pointer to it. + * + * @param name + * Name of the ACL context as passed to rte_acl_create() + * @return + * Pointer to ACL context or NULL if object not found + * with rte_errno set appropriately. Possible rte_errno values include: + * - ENOENT - value not available for return + */ +struct rte_acl_ctx * +rte_acl_find_existing(const char *name); + +/** + * De-allocate all memory used by ACL context. + * + * @param ctx + * ACL context to free + */ +void +rte_acl_free(struct rte_acl_ctx *ctx); + +/** + * Add rules to an existing ACL context. + * This function is not multi-thread safe. + * + * @param ctx + * ACL context to add patterns to. + * @param rules + * Array of rules to add to the ACL context. + * Note that all fields in rte_acl_rule structures are expected + * to be in host byte order. + * Each rule expected to be in the same format and not exceed size + * specified at ACL context creation time. + * @param num + * Number of elements in the input array of rules. + * @return + * - -ENOMEM if there is no space in the ACL context for these rules. + * - -EINVAL if the parameters are invalid. + * - Zero if operation completed successfully. + */ +int +rte_acl_add_rules(struct rte_acl_ctx *ctx, const struct rte_acl_rule *rules, + uint32_t num); + +/** + * Delete all rules from the ACL context. + * This function is not multi-thread safe. + * Note that internal run-time structures are not affected. + * + * @param ctx + * ACL context to delete rules from. + */ +void +rte_acl_reset_rules(struct rte_acl_ctx *ctx); + +/** + * Analyze set of rules and build required internal run-time structures. + * This function is not multi-thread safe. + * + * @param ctx + * ACL context to build. + * @param cfg + * Pointer to struct rte_acl_config - defines build parameters. + * @return + * - -ENOMEM if couldn't allocate enough memory. + * - -EINVAL if the parameters are invalid. + * - Negative error code if operation failed. + * - Zero if operation completed successfully. + */ +int +rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg); + +/** + * Delete all rules from the ACL context and + * destroy all internal run-time structures. + * This function is not multi-thread safe. + * + * @param ctx + * ACL context to reset. + */ +void +rte_acl_reset(struct rte_acl_ctx *ctx); + +/** + * Available implementations of ACL classify. + */ +enum rte_acl_classify_alg { + RTE_ACL_CLASSIFY_DEFAULT = 0, + RTE_ACL_CLASSIFY_SCALAR = 1, /**< generic implementation. */ + RTE_ACL_CLASSIFY_SSE = 2, /**< requires SSE4.1 support. */ + RTE_ACL_CLASSIFY_AVX2 = 3, /**< requires AVX2 support. */ + RTE_ACL_CLASSIFY_NEON = 4, /**< requires NEON support. */ + RTE_ACL_CLASSIFY_NUM /* should always be the last one. */ +}; + +/** + * Perform search for a matching ACL rule for each input data buffer. + * Each input data buffer can have up to *categories* matches. + * That implies that results array should be big enough to hold + * (categories * num) elements. + * Also categories parameter should be either one or multiple of + * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than RTE_ACL_MAX_CATEGORIES. + * If more than one rule is applicable for given input buffer and + * given category, then rule with highest priority will be returned as a match. + * Note, that it is a caller's responsibility to ensure that input parameters + * are valid and point to correct memory locations. + * + * @param ctx + * ACL context to search with. + * @param data + * Array of pointers to input data buffers to perform search. + * Note that all fields in input data buffers supposed to be in network + * byte order (MSB). + * @param results + * Array of search results, *categories* results per each input data buffer. + * @param num + * Number of elements in the input data buffers array. + * @param categories + * Number of maximum possible matches for each input buffer, one possible + * match per category. + * @return + * zero on successful completion. + * -EINVAL for incorrect arguments. + */ +extern int +rte_acl_classify(const struct rte_acl_ctx *ctx, + const uint8_t **data, + uint32_t *results, uint32_t num, + uint32_t categories); + +/** + * Perform search using specified algorithm for a matching ACL rule for + * each input data buffer. + * Each input data buffer can have up to *categories* matches. + * That implies that results array should be big enough to hold + * (categories * num) elements. + * Also categories parameter should be either one or multiple of + * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than RTE_ACL_MAX_CATEGORIES. + * If more than one rule is applicable for given input buffer and + * given category, then rule with highest priority will be returned as a match. + * Note, that it is a caller's responsibility to ensure that input parameters + * are valid and point to correct memory locations. + * + * @param ctx + * ACL context to search with. + * @param data + * Array of pointers to input data buffers to perform search. + * Note that all fields in input data buffers supposed to be in network + * byte order (MSB). + * @param results + * Array of search results, *categories* results per each input data buffer. + * @param num + * Number of elements in the input data buffers array. + * @param categories + * Number of maximum possible matches for each input buffer, one possible + * match per category. + * @param alg + * Algorithm to be used for the search. + * It is the caller responsibility to ensure that the value refers to the + * existing algorithm, and that it could be run on the given CPU. + * @return + * zero on successful completion. + * -EINVAL for incorrect arguments. + */ +extern int +rte_acl_classify_alg(const struct rte_acl_ctx *ctx, + const uint8_t **data, + uint32_t *results, uint32_t num, + uint32_t categories, + enum rte_acl_classify_alg alg); + +/* + * Override the default classifier function for a given ACL context. + * @param ctx + * ACL context to change classify function for. + * @param alg + * New default classify algorithm for given ACL context. + * It is the caller responsibility to ensure that the value refers to the + * existing algorithm, and that it could be run on the given CPU. + * @return + * - -EINVAL if the parameters are invalid. + * - Zero if operation completed successfully. + */ +extern int +rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx, + enum rte_acl_classify_alg alg); + +/** + * Dump an ACL context structure to the console. + * + * @param ctx + * ACL context to dump. + */ +void +rte_acl_dump(const struct rte_acl_ctx *ctx); + +/** + * Dump all ACL context structures to the console. + */ +void +rte_acl_list_dump(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_ACL_H_ */ diff --git a/lib/librte_acl/rte_acl_osdep.h b/lib/librte_acl/rte_acl_osdep.h new file mode 100644 index 00000000..41f7e3d4 --- /dev/null +++ b/lib/librte_acl/rte_acl_osdep.h @@ -0,0 +1,80 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_ACL_OSDEP_H_ +#define _RTE_ACL_OSDEP_H_ + +/** + * @file + * + * RTE ACL DPDK/OS dependent file. + */ + +#include <stdint.h> +#include <stddef.h> +#include <inttypes.h> +#include <limits.h> +#include <ctype.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <sys/queue.h> + +/* + * Common defines. + */ + +#define DIM(x) RTE_DIM(x) + +#include <rte_common.h> +#include <rte_vect.h> +#include <rte_memory.h> +#include <rte_log.h> +#include <rte_memcpy.h> +#include <rte_prefetch.h> +#include <rte_byteorder.h> +#include <rte_branch_prediction.h> +#include <rte_memzone.h> +#include <rte_malloc.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_per_lcore.h> +#include <rte_errno.h> +#include <rte_string_fns.h> +#include <rte_cpuflags.h> +#include <rte_log.h> +#include <rte_debug.h> + +#endif /* _RTE_ACL_OSDEP_H_ */ diff --git a/lib/librte_acl/rte_acl_version.map b/lib/librte_acl/rte_acl_version.map new file mode 100644 index 00000000..b09370a1 --- /dev/null +++ b/lib/librte_acl/rte_acl_version.map @@ -0,0 +1,19 @@ +DPDK_2.0 { + global: + + rte_acl_add_rules; + rte_acl_build; + rte_acl_classify; + rte_acl_classify_alg; + rte_acl_classify_scalar; + rte_acl_create; + rte_acl_dump; + rte_acl_find_existing; + rte_acl_free; + rte_acl_list_dump; + rte_acl_reset; + rte_acl_reset_rules; + rte_acl_set_ctx_classify; + + local: *; +}; diff --git a/lib/librte_acl/tb_mem.c b/lib/librte_acl/tb_mem.c new file mode 100644 index 00000000..157e6080 --- /dev/null +++ b/lib/librte_acl/tb_mem.c @@ -0,0 +1,108 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "tb_mem.h" + +/* + * Memory management routines for temporary memory. + * That memory is used only during build phase and is released after + * build is finished. + * Note, that tb_pool/tb_alloc() are not supposed to return NULL. + * Instead, in the case of failure to allocate memory, + * it would do siglongjmp(pool->fail). + * It is responsibility of the caller to save the proper context/environment, + * in the pool->fail before calling tb_alloc() for the given pool first time. + */ + +static struct tb_mem_block * +tb_pool(struct tb_mem_pool *pool, size_t sz) +{ + struct tb_mem_block *block; + uint8_t *ptr; + size_t size; + + size = sz + pool->alignment - 1; + block = calloc(1, size + sizeof(*pool->block)); + if (block == NULL) { + RTE_LOG(ERR, MALLOC, "%s(%zu)\n failed, currently allocated " + "by pool: %zu bytes\n", __func__, sz, pool->alloc); + siglongjmp(pool->fail, -ENOMEM); + return NULL; + } + + block->pool = pool; + + block->next = pool->block; + pool->block = block; + + pool->alloc += size; + + ptr = (uint8_t *)(block + 1); + block->mem = RTE_PTR_ALIGN_CEIL(ptr, pool->alignment); + block->size = size - (block->mem - ptr); + + return block; +} + +void * +tb_alloc(struct tb_mem_pool *pool, size_t size) +{ + struct tb_mem_block *block; + void *ptr; + size_t new_sz; + + size = RTE_ALIGN_CEIL(size, pool->alignment); + + block = pool->block; + if (block == NULL || block->size < size) { + new_sz = (size > pool->min_alloc) ? size : pool->min_alloc; + block = tb_pool(pool, new_sz); + } + ptr = block->mem; + block->size -= size; + block->mem += size; + return ptr; +} + +void +tb_free_pool(struct tb_mem_pool *pool) +{ + struct tb_mem_block *next, *block; + + for (block = pool->block; block != NULL; block = next) { + next = block->next; + free(block); + } + pool->block = NULL; + pool->alloc = 0; +} diff --git a/lib/librte_acl/tb_mem.h b/lib/librte_acl/tb_mem.h new file mode 100644 index 00000000..ca7af966 --- /dev/null +++ b/lib/librte_acl/tb_mem.h @@ -0,0 +1,76 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TB_MEM_H_ +#define _TB_MEM_H_ + +/** + * @file + * + * RTE ACL temporary (build phase) memory management. + * Contains structures and functions to manage temporary (used by build only) + * memory. Memory allocated in large blocks to speed 'free' when trie is + * destructed (finish of build phase). + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_acl_osdep.h> +#include <setjmp.h> + +struct tb_mem_block { + struct tb_mem_block *next; + struct tb_mem_pool *pool; + size_t size; + uint8_t *mem; +}; + +struct tb_mem_pool { + struct tb_mem_block *block; + size_t alignment; + size_t min_alloc; + size_t alloc; + /* jump target in case of memory allocation failure. */ + sigjmp_buf fail; +}; + +void *tb_alloc(struct tb_mem_pool *pool, size_t size); +void tb_free_pool(struct tb_mem_pool *pool); + +#ifdef __cplusplus +} +#endif + +#endif /* _TB_MEM_H_ */ |